In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Lasso
import lightgbm as lgb
from sklearn.tree import DecisionTreeRegressor

from sklearn import preprocessing
import xgboost as xgb

In [2]:
train = pd.read_csv('Train.csv.zip', compression='zip')
test = pd.read_csv('Test.csv.zip', compression='zip')

In [3]:
# Save the 'Id' column
train_ID = train['id']
test_ID = test['id']

# Now drop the 'Id' column since it's unnecessary for the prediction process.
train.drop("id", axis = 1, inplace = True)
test.drop("id", axis = 1, inplace = True)

In [4]:
# настройка параметров
train[['build_tech']] = train[['build_tech']].replace(np.nan, 0.5)
train[['metro_dist']] = train[['metro_dist']].replace(np.nan, 15.0)
train[['g_lift']] = train[['g_lift']].replace(np.nan, 0.5)

In [5]:
train['area_per_room'] = train['area'] / train['rooms']
test['area_per_room'] = test['area'] / test['rooms']

In [6]:
test[['build_tech']] = test[['build_tech']].replace(np.nan, 0.5)
test[['metro_dist']] = test[['metro_dist']].replace(np.nan, 15.0)
test[['g_lift']] = test[['g_lift']].replace(np.nan, 0.5)

In [7]:
train['street_id'] = train['street_id'].astype(str)
test['street_id'] = test['street_id'].astype(str)

In [8]:
X = train[[
#     'date', 
    'street_id', 
    'build_tech', 
    'floor', 
    'area', 'area_per_room',
    'rooms',
    'balcon', 
    'metro_dist', 
    'g_lift', 
    'n_photos', 
#     'kw1', 'kw2', 'kw3', 'kw4', 'kw5', 'kw6', 'kw7', 'kw8', 'kw9', 'kw10', 'kw11', 'kw12', 'kw13'
]]
y = train[['price']]

In [9]:
X.head()

Unnamed: 0,street_id,build_tech,floor,area,area_per_room,rooms,balcon,metro_dist,g_lift,n_photos
0,164,1.0,5,50,25.0,2,1,20.0,0.0,3
1,66,0.5,5,48,24.0,2,0,20.0,0.5,0
2,642,2.0,21,61,30.5,2,0,10.0,0.0,0
3,562,1.0,11,95,31.666667,3,1,10.0,0.5,2
4,151,0.5,9,34,34.0,1,0,20.0,0.5,2


In [10]:
test = test[[
    'street_id', 
    'build_tech', 
    'floor', 
    'area', 'area_per_room',
    'rooms',
    'balcon', 
    'metro_dist', 
    'g_lift', 
    'n_photos', 
]]

In [11]:
# Cross-validation with k-folds
n_folds = 3

def mae_cv(model, X, y, name):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(X.values)
    mae= -cross_val_score(model, X.values, y, scoring="neg_mean_absolute_error", cv = kf)
    print(name + " score: {:.4f}".format(mae.mean()))

In [12]:
knn1 = KNeighborsRegressor(n_neighbors=3) # 1554037.1373234342
# mae_cv(knn1, X, y, 'knn-3')

knn2 = KNeighborsRegressor(n_neighbors=10) # 1580934.9331793853
# mae_cv(knn2, X, y, 'knn-10')

knn3 = KNeighborsRegressor(n_neighbors=1) # 1578260.9034
# mae_cv(knn3, X, y, 'knn-1')

knn4 = KNeighborsRegressor(n_neighbors=7) # 1486361.3620
# mae_cv(knn4, X, y, 'knn-7')

dtr1 = DecisionTreeRegressor(max_depth=100, random_state=0) # 1772791.7349
# mae_cv(dtr1, X, y, 'dtr-1')

rg1 = Ridge(alpha=0.01, random_state=0) # 1692388.0292463482
# mae_cv(rg1, X, y, 'rg1-0.01')

rg2 = Ridge(alpha=1.1, random_state=0) # 1692378.7748252843
# mae_cv(rg2, X, y, 'rg1-1.1')

rg3 = Ridge(alpha=100.1, random_state=0) # 1692401.9077769322
# mae_cv(rg3, X, y, 'rg1-100.1')

lasso1 = Lasso(alpha=0.01, random_state=0) # 1692388.1266646797
# mae_cv(lasso1, X, y, 'lasso-1.01')

lasso2 = Lasso(alpha=1.1, random_state=0) # 1692387.7184467418
# mae_cv(lasso2, X, y, 'lasso-1.1')

lasso3 = Lasso(alpha=100.1, random_state=0) # 1692371.585207761
# mae_cv(lasso3, X, y, 'lasso-100.1')

# knn-3 score: 1554022.4553
# knn-10 score: 1580582.9061
# rg1-0.01 score: 1693203.7011
# rg1-1.1 score: 1693203.9926
# rg1-100.1 score: 1693235.5966
# lasso-1.01 score: 1693203.6984
# lasso-1.1 score: 1693203.6945
# lasso-100.1 score: 1693203.3706

# with area_per_room
# knn-3 score: 1478608.8294
# knn-10 score: 1502786.2837
# rg1-0.01 score: 1691307.9548
# rg1-1.1 score: 1691309.9928
# rg1-100.1 score: 1691494.4104
# lasso-1.01 score: 1691307.9375
# lasso-1.1 score: 1691308.0841
# lasso-100.1 score: 1691321.5217

In [13]:
rf1 = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=0) # 1360153.8352
# mae_cv(rf1 ,X, y, 'rf-100')

rf2 = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=0) # 1354697.7572
# mae_cv(rf2 ,X, y, 'rf-1000')

In [14]:
gbm1 = lgb.LGBMRegressor(boosting_type='gbdt', learning_rate=0.05, max_depth=2, n_estimators=200, nthread=-1, objective='regression', random_state=0) 
# 1659428.4293
# mae_cv(gbm1 ,X, y, 'gbm1-2')

In [15]:
gbm2 = lgb.LGBMRegressor(boosting_type='gbdt', learning_rate=0.05, max_depth=5, n_estimators=200, nthread=-1, objective='regression') 
# # 
# mae_cv(gbm2 ,X, y, 'gbm1-5')

In [16]:
gb1 = GradientBoostingRegressor(n_estimators=100, random_state=0) # 1557468.3937, 1544474.4795
# mae_cv(gb1 , X, y, 'gbm1-100')

In [17]:
gb2 = GradientBoostingRegressor(n_estimators=17500, random_state=0) # 680773.1029
# mae_cv(gb2 , X, y, 'gbm2-17500')

In [18]:
def stack_pred(estimator, X1, y1, k):    
    def put_on_places(arr, elems, places):
        for i in range(0, len(elems)):
            arr[places[i]] = elems[i]
        
    sX = [3] * len(X)    
    kf = KFold(n_splits=k, shuffle=True, random_state=0)
    for train_index, test_index in kf.split(X1):
        X_train, X_test = X1.values[train_index], X1.values[test_index]
        y_train, y_test = y1.values[train_index], y1.values[test_index]
        estimator.fit(X_train, y_train)
        pred = estimator.predict(X_test) 
        
        put_on_places(sX, pred, test_index)  
    return np.array(sX)

In [19]:
# columns = ['knn1', 'knn2', 'rg1', 'rg2', 'rg3', 'lasso1', 'lasso2', 'lasso3', 'rf1', 'rf2', 'gbm1', 'gbm2', 'gb1', 'gb2']
columns = ['knn1', 'knn2', 'knn3', 'knn4', 'dtr1', 'rg1', 'rg2', 'rg3', 'lasso1', 'lasso2', 'lasso3', 'rf1', 'rf2', 'gbm1', 'gbm2', 'gb1', 'gb2']
# columns = ['knn1', 'knn2', 'knn3', 'knn4', 'dtr1', 'rg1', 'rg2', 'rg3', 'lasso1', 'lasso2', 'lasso3', 'rf1', 'rf2', 'gbm1', 'gbm2', 'gb1', 'gb2']

In [20]:
df = pd.DataFrame(index=range(0, y.shape[0]), columns=columns)

In [21]:
df['knn1'] = stack_pred(knn1, X, y, 3)

In [22]:
df['knn2'] = stack_pred(knn2, X, y, 3)

In [23]:
df['knn3'] = stack_pred(knn3, X, y, 3)

In [24]:
df['knn4'] = stack_pred(knn3, X, y, 3)

In [25]:
df['dtr1'] = stack_pred(dtr1, X, y, 3)

In [26]:
df['rg1'] = stack_pred(rg1, X, y, 3)

In [27]:
df['rg2'] = stack_pred(rg2, X, y, 3)

In [28]:
df['rg3'] = stack_pred(rg3, X, y, 3)

In [29]:
df['lasso1'] = stack_pred(lasso1, X, y, 3)

In [30]:
df['lasso2'] = stack_pred(lasso2, X, y, 3)

In [31]:
df['lasso3'] = stack_pred(lasso3, X, y, 3)

In [32]:
df['rf1'] = stack_pred(rf1, X, y, 3)

  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()


In [33]:
df['rf2'] = stack_pred(rf2, X, y, 3)

  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()


In [34]:
df['gbm1'] = stack_pred(gbm1, X, y, 3)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [35]:
df['gbm2'] = stack_pred(gbm2, X, y, 3)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [36]:
df['gb1'] = stack_pred(gb1, X, y, 3)

  y = column_or_1d(y, warn=True)


In [37]:
df['gb2'] = stack_pred(gb2, X, y, 3)

  y = column_or_1d(y, warn=True)


In [38]:
# df.drop(['gb2'], axis=1, inplace=True)
# df

In [39]:
# ens_model = GradientBoostingRegressor(n_estimators=1000, random_state=0) # 670503.0048
# ens_model = GradientBoostingRegressor(n_estimators=100, random_state=0) # 668383.9201
# ens_model = GradientBoostingRegressor(n_estimators=50, random_state=0) # 673930.6910
# ens_model = GradientBoostingRegressor(n_estimators=20, random_state=0) # 784558.3308
# ens_model = GradientBoostingRegressor(n_estimators=35, random_state=0) # 685177.8217
# ens_model = GradientBoostingRegressor(n_estimators=42, random_state=0) # 677345.2444
ens_model = GradientBoostingRegressor(n_estimators=200, random_state=0) # 667150.5400
# ens_model = GradientBoostingRegressor(n_estimators=500, random_state=0) # 667557.5129

mae_cv(ens_model, df, y, 'ens_model')

# оценка при ens_model area_per_room 661800.9608 (с учетом gb2)
# 1256821.3418 - без gb2
# 1230369.1392 - рямо солидно увеличилось
# давай еще)
#  учетом knn4 1230350.0998
#  учетом dtr1 1227584.8178
#  учетом xgb1 1230859.0198 - хуже - предлагаю сабмитить как есть и идти на тренировку
#  лан еще разок 672621.0234

# ens_model score: 652164.0166 - xgb + 
# ens_model score: 662474.6392 - без xgb

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


ens_model score: 662474.6392


In [40]:
df_test = pd.DataFrame(index=range(0, y.shape[0]), columns=columns)

In [41]:
knn1.fit(X, y)
knn2.fit(X, y)
knn3.fit(X, y)
knn4.fit(X, y)
dtr1.fit(X, y)
rg1.fit(X, y)
rg2.fit(X, y)
rg3.fit(X, y)
lasso1.fit(X, y)
lasso2.fit(X, y)
lasso3.fit(X, y)
rf1.fit(X, y)
rf2.fit(X, y)

  if sys.path[0] == '':
  del sys.path[0]


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [42]:
X_for_gbm = X[[
#     'street_id', 
    'build_tech', 
    'floor', 
    'area', 'area_per_room',
    'rooms',
    'balcon', 
    'metro_dist', 
    'g_lift', 
    'n_photos', 
]]

In [43]:
gbm1.fit(X_for_gbm, y)
gbm2.fit(X_for_gbm, y)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       importance_type='split', learning_rate=0.05, max_depth=5,
       min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
       n_estimators=200, n_jobs=-1, nthread=-1, num_leaves=31,
       objective='regression', random_state=None, reg_alpha=0.0,
       reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=0)

In [45]:
gb1.fit(X, y)
gb2.fit(X, y)

  y = column_or_1d(y, warn=True)


GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=17500, presort='auto', random_state=0,
             subsample=1.0, verbose=0, warm_start=False)

In [46]:
# columns = ['knn1', 'knn2', 'rg1', 'rg2', 'rg3', 'lasso1', 'lasso2', 'lasso3', 'rf1', 'rf2', 'gbm1', 'gbm2', 'gb1', 'gb2']
df_test['knn1'] = knn1.predict(test)
df_test['knn2'] = knn2.predict(test)
df_test['knn3'] = knn3.predict(test)
df_test['knn4'] = knn4.predict(test)
df_test['dtr1'] = dtr1.predict(test)
df_test['rg1'] = rg1.predict(test)
df_test['rg2'] = rg2.predict(test)
df_test['rg3'] = rg3.predict(test)
df_test['lasso1'] = lasso1.predict(test)
df_test['lasso2'] = lasso2.predict(test)
df_test['lasso3'] = lasso3.predict(test)
df_test['rf1'] = rf1.predict(test)
df_test['rf2'] = rf2.predict(test)

In [47]:
test_for_gbm = test[[
#     'street_id', 
    'build_tech', 
    'floor', 
    'area', 'area_per_room',
    'rooms',
    'balcon', 
    'metro_dist', 
    'g_lift', 
    'n_photos', 
]]

In [48]:
df_test['gbm1'] = gbm1.predict(test_for_gbm)
df_test['gbm2'] = gbm2.predict(test_for_gbm)

In [49]:
df_test['gb1'] = gb1.predict(test)
df_test['gb2'] = gb2.predict(test)

In [50]:
# df_test

In [51]:
ens_model.fit(df, y)

  y = column_or_1d(y, warn=True)


GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=200, presort='auto', random_state=0,
             subsample=1.0, verbose=0, warm_start=False)

In [52]:
pred_rez = ens_model.predict(df_test)

In [53]:
rez_test = test

In [54]:
rez_test['price'] = pred_rez
rez_test['id'] = test_ID.values

rez_test[['id', 'price']].to_csv('sub.csv', index=False)

In [55]:
# 759599.73973 - добавка knn ничего не дала