In [1]:
from math import *
import pandas as pd
import numpy as np
import warnings
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import Imputer, StandardScaler
import matplotlib.pyplot as plt
import random
import statistics
from IPython.display import Image
import lightgbm as lgb
import codecs

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)
warnings.filterwarnings('ignore')

In [2]:
def rmse(y_true, y_pred):
    return sqrt(mean_squared_error(y_true, y_pred))

In [3]:
def back_exclusion(indep, indep_test, dep_test, indep_train, dep_train, result, depth):
    rmse_scorer = make_scorer(rmse, greater_is_better=False)
    features_to_test = indep.columns
    allfeatures = [x for x in features_to_test]
    gsel = lgb.LGBMRegressor(objective = 'regression',  
                       max_depth = depth,
                       learning_rate = 0.077, lambda_l1 = 0.0009, lambda_l2 = 0.0064,
                       n_estimators = 86, n_jobs=10, num_leaves=17)
    gsel.fit(indep_train.loc[:,allfeatures], dep_train)
    current_rmse =  -cross_val_score(gsel, indep_test, dep_test, scoring=rmse_scorer, cv=5).mean()
#           

    print ("Начальный rmse: ",current_rmse)
    deleted = []
    while (True):
        max_rmse = current_rmse
        for feature in allfeatures:
            print ("Проверяется: ", feature)
            featureToTest = [x for x in allfeatures if  x != feature]
            gsel = lgb.LGBMRegressor(objective = 'regression',
                       max_depth = depth,
                       learning_rate = 0.068, lambda_l1 = 0.0009, lambda_l2 = 0.0064,
                       n_estimators = 95, n_jobs=10, num_leaves=17, min_child_samples = 10)
            gsel.fit(indep_train.loc[:,featureToTest], dep_train, eval_metric='rmse')
            r = -cross_val_score(gsel, indep_test, dep_test, scoring=rmse_scorer, cv=5).mean()
#            r = rmse(y_true=dep_test, y_pred=gsel.predict(indep_test.loc[:,featureToTest]))
            if (max_rmse > r):
                max_rmse = r
                badfeature = feature
                print("Наихудшая фича: ", badfeature, r)
        if (max_rmse >= current_rmse):
            break
        current_rmse = max_rmse
        print ("Текущий rmse: ", current_rmse)
        allfeatures.remove(badfeature)
        print("Удалено: ", badfeature)
        deleted.append(badfeature)
    indep =  indep.loc[:,allfeatures]
    indep_test =  indep_test.loc[:,allfeatures]
    indep_train =  indep_train.loc[:,allfeatures]
    result =  result.loc[:,allfeatures]

    print ("Итого удалено: ", deleted)
    return (indep, indep_test, indep_train, result)

In [4]:
def make_models(dep, indep, result, depth, folds=5):
    models = []
    for i in range(0, folds):
        indep_train, indep_test, dep_train, dep_test = train_test_split(indep, dep, test_size=0.40, random_state=32+i)
        gs1 = lgb.LGBMRegressor(oobjective = 'regression',
                       max_depth = depth,
                       learning_rate = 0.068, lambda_l1 = 0.0009, lambda_l2 = 0.0064,
                       n_estimators = 95, n_jobs=10, num_leaves=17, min_child_samples = 10)
        gs1.fit(indep_train, dep_train, eval_set=[(indep_test, dep_test)], eval_metric='rmse', early_stopping_rounds=5, verbose=False);
        models.append(gs1)

    pred = pd.DataFrame()    
    pred2 = pd.DataFrame()    
    res = pd.DataFrame()    

    for i in range(0, folds):
        model=models[i]
        pred["p"+str(i)]=model.predict(indep_test)
        pred2["p"+str(i)]=model.predict(indep_train)
        res["r"+str(i)]=model.predict(result)

    pmean = pred.mean(axis=1)
    pmean2 = pred2.mean(axis=1)
    rmean = res.mean(axis=1)
    print(rmse(y_true=dep_test, y_pred=pmean), rmse(y_true=dep_train, y_pred=pmean2), folds) 
    return (pmean, pmean2, rmean)

In [5]:
def prepare_data(d):
    c_columns = d.columns
    c_index = d.index

    imp = Imputer(strategy="most_frequent")

    d = pd.DataFrame(imp.fit_transform(d))
    d.columns = c_columns
    d.index = c_index

    predict = d[d["is_test"]==True]
    train = d[d["is_test"]==False]

    dep = train.loc[:,["target"]]
    indep = train.iloc[:,3:]
    result = predict.iloc[:,3:]
    result_index = predict.iloc[:,0]
    return (dep, indep, result, result_index)

In [6]:
atm_data =pd.read_csv(r"\atm_features_no_outliers.csv",";", engine="python")
atm_data = atm_data.sort_values(["is_test", "original_index"])

In [7]:
features = ['original_index','target', 'is_test','region_type', 'status', 'federal', 'dist_to_center', 'population',
       'home_atm_in_range', 'objects_in_range',
       'closest_category', 'closest_place','shops_in_range', 'tourism_in_range',
       'amenity_in_range', 'public_transport_in_range',
       'dist_to_closest_transport', 'dist_to_closest_tourism',
       'dist_to_closest_shop', 
       'dist_to_closest_object',
       'banks_in_range', 'dist_to_closest_parking',
       'large_shops_in_range', 'market_in_range',  'atm_group',
       'lat', 
        'long', 
        'city_center_lat', 'city_center_long',
        'dist_to_closest_place_cafe',
        'dist_to_closest_place_clinic',
        'dist_to_closest_place_convenience',
        'dist_to_closest_place_department_store',
        'dist_to_closest_place_hospital',
        'dist_to_closest_place_hotel',
        'dist_to_closest_place_kindergarten',
        'dist_to_closest_place_kiosk',
        'dist_to_closest_place_mall',
        'dist_to_closest_place_marketplace',
        'dist_to_closest_place_place_of_worship',
        'dist_to_closest_place_public_building',
        'dist_to_closest_place_school',
        'dist_to_closest_place_supermarket',
        'dist_to_closest_place_townhall',
        'dist_to_closest_place_university',
        'dist_to_closest_atm1',
        'dist_to_closest_atm2',
        'dist_to_closest_atm3',
        'dist_to_closest_atm4',
        'dist_to_closest_atm5',
        'dist_to_closest_atm_average',
        'post_index',
        'count_on_same_address',
        'count_in_same_city',
        'city',
        'closest_levels',
        'average_levels',
        'max_levels',
        'dist_to_closest_building',
        'dist_to_closest_high_building',
        'count_of_buildings',
        'average_levels_of_closest_buildings',
            'people_on_atm'
           ]

In [8]:
rare_place = atm_data["closest_place"].value_counts()[atm_data["closest_place"].value_counts() < 50].index
atm_data["closest_place"] = atm_data["closest_place"].apply(lambda x: 'rare' if x in rare_place else x)

rare_city = atm_data["city"].value_counts()[atm_data["city"].value_counts() < 20].index
atm_data["city"] = atm_data["city"].apply(lambda x: 'rare' if x in rare_place else x)

data =atm_data.loc[:,features]
data["closest_place"]= data["closest_place"].rank().fillna(-1)
data["city"]= data["city"].rank().fillna(-1)
data["post_index"]= data["post_index"].rank().fillna(-1)
data["closest_category"]= data["closest_category"].rank().fillna(-1)
data["federal"]= data["federal"].rank().fillna(-1)
data["region_type"]= data["region_type"].rank().fillna(-1)
data["lat"]=np.power(data["lat"],2)
data["people_on_atm"] = data["population"] / data["count_in_same_city"]
data[data["dist_to_center"]>10000]["dist_to_center"] = None


In [33]:
dep, indep, result, result_index = prepare_data(data)
indep_train, indep_test, dep_train, dep_test = train_test_split(indep, dep, test_size=0.40, random_state=32)
pmean, pmean_train, rmean = make_models(dep, indep, result, 5, 50000)  

0.0403864301023838 0.041046060931222345 50000


In [34]:
to_send=pd.DataFrame(rmean, columns=["target"])
to_send.index = result_index
to_send.index = to_send.index.astype(np.int)
to_send.to_csv(r"\result_50000.csv",",")

In [16]:
#for i in range (2, 40):
gs1 = lgb.LGBMRegressor(objective = 'regression',
                   max_depth = 5,
                   learning_rate = 0.068, lambda_l1 = 0.0009, lambda_l2 = 0.0064,
                   n_estimators = 95, n_jobs=10, num_leaves=17, min_child_samples = 10 )
gs1.fit(indep_train, dep_train, eval_set=[(indep_test, dep_test)], eval_metric='rmse', early_stopping_rounds=5, verbose=False);
rmse_scorer = make_scorer(rmse, greater_is_better=False)
cvs = cross_val_score(gs1, indep_test, dep_test, scoring=rmse_scorer, cv=5)

print(rmse(y_true=dep_test, y_pred=gs1.predict(indep_test)), cvs.mean()) 
#    """
#;

0.043684854295904275 -0.0444739704553


In [None]:
gs1 = lgb.LGBMRegressor(objective = 'regression',
                       max_depth = 5,
                       learning_rate = 0.068, lambda_l1 = 0.0009, lambda_l2 = 0.0064,
                       n_estimators = 95, n_jobs=10, num_leaves=17, min_child_samples = 10 )
gs1.fit(indep_train, dep_train, eval_set=[(indep_test, dep_test)], eval_metric='rmse', early_stopping_rounds=5, verbose=False);
rmse_scorer = make_scorer(rmse, greater_is_better=False)
cvs = cross_val_score(gs1, indep_test, dep_test, scoring=rmse_scorer, cv=5)

print(rmse(y_true=dep_test, y_pred=gs1.predict(indep_test)), -cvs.mean()) 

attr2 = dict(zip(indep.columns, gs1.feature_importances_))
attr2 = sorted(attr2.items(), key=lambda x: x[1], reverse = False)
x1,y1 = zip(*attr2)
i1=range(len(x1))
plt.figure(num=None, figsize=(9, 12), dpi=300, facecolor='w', edgecolor='k')
plt.barh(i1, y1)
plt.title("LGBM")
plt.yticks(i1, x1)
plt.show();