## О чём
В этом ноутбуке приведено моё решение соревнования по машинному обучению от `Rucode`.

Цель соревнования — предсказание цены иномарки по её характеристикам. Подробная информация — [здесь](https://www.kaggle.com/c/rucode-carprice).

In [8]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import seaborn as sns
from lightgbm import LGBMRegressor
from lightgbm import early_stopping, log_evaluation
from scipy import stats
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import LabelEncoder
from vecstack import stacking

In [9]:
def make_label_enc(train, test, cats):
    """Add integer-coded ``<col>_enc`` columns for every column in ``cats``.

    Both frames are modified in place and nothing is returned:

    * missing values in ``cats`` are replaced by the string ``'unknown'``;
    * for each column a ``LabelEncoder`` is fitted on ``train`` only;
    * values of ``test`` that the encoder has not seen are overwritten with
      ``'unknown'`` in the original column before encoding;
    * if ``'unknown'`` never occurs in ``train`` it gets the next free code.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that contain all columns listed in ``cats``.
    cats : list of str
        Names of the categorical columns to encode.
    """
    placeholder = 'unknown'
    train[cats] = train[cats].fillna(placeholder)
    test[cats] = test[cats].fillna(placeholder)

    for column in cats:
        encoder = LabelEncoder()
        encoder.fit(train[column])
        labels = encoder.classes_
        code_of = dict(zip(labels, encoder.transform(labels)))
        # Reserve one extra code for categories that only show up in `test`.
        code_of.setdefault(placeholder, len(code_of))

        unseen = ~test[column].isin(list(code_of))
        test.loc[unseen, column] = placeholder

        encoded_name = column + '_enc'
        train[encoded_name] = train[column].apply(lambda value: code_of[value])
        test[encoded_name] = test[column].apply(lambda value: code_of[value])
        
        
def make_submit(name, test, preds):
    """Write a submission CSV with the columns ``Id`` and ``Price``.

    Parameters
    ----------
    name : str or file-like
        Destination passed straight to ``DataFrame.to_csv``.
    test : pandas.DataFrame
        Frame whose index supplies the ``Id`` column.
    preds : array-like
        Predicted values written to the ``Price`` column.
    """
    submission = pd.DataFrame({'Id': test.index, 'Price': preds})
    # The frame's own row index is not part of the submission format.
    submission.to_csv(name, index=False)
    

def RMSLE(y, pred):
    """Return the square root of the mean squared error between ``y`` and ``pred``.

    NOTE(review): despite the name, no logarithm is applied anywhere, so the
    value is a plain RMSE expressed in the units of the target.
    """
    mse = mean_squared_error(y, pred)
    return mse ** 0.5

In [10]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to narrower dtypes to save memory.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are inspected; every other column is left untouched. Integer columns are
    cast to the smallest signed integer type whose range contains the column's
    minimum and maximum (bounds inclusive). Float columns are cast to float16
    or float32 when the value range fits strictly inside that type, otherwise
    to float64.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the memory footprint after conversion and the
        relative saving. When false, nothing is printed.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # An empty column yields NaN for min/max; every comparison is then
            # False and the column keeps its dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # NOTE(review): only the value *range* is checked here. float16
            # keeps roughly three significant decimal digits, so this cast can
            # round the stored values - confirm that is acceptable for the data.
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    if verbose:
        end_mem = df.memory_usage().sum() / 1024**2
        # Guard the percentage against a zero-byte starting footprint.
        saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem > 0 else 0.0
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(saved_pct))

    return df

In [11]:
# Load the competition data. The first CSV column becomes the index, which
# make_submit() later writes out as the `Id` column.
train = pd.read_csv('data_train.csv', index_col=0)
test = pd.read_csv('data_test.csv', index_col=0)

# Map the original Russian column headers to short English identifiers.
rename = {'Марка': 'brand', 'Модель': 'model', 'Год выпуска': "prod_year", 'Рабочий объем': 'volume',
         'Мощность двигателя': 'power', 'Тип топлива': 'fuel_type', 'Трансмиссия': 'transmission',
          'Количество мест': 'capacity', 'Количество дверей': 'doors', 'Пробег': 'mileage', 'Цена': 'price'}

train = train.rename(columns=rename)
test = test.rename(columns=rename)

num_features = ['prod_year', 'volume', 'power', 'capacity', 'doors', 'mileage']
cat_features = ['brand', 'model', 'fuel_type', 'transmission']
cat_features_enc = [cat + '_enc' for cat in cat_features]

# Keep only rows whose price lies strictly between 1e4 and 1e7. The explicit
# copy makes the later in-place column assignments act on an independent frame
# rather than on a slice of the raw data.
train = train[(train.price > 1e4) & (train.price < 1e7)].copy()

# Adds the `<col>_enc` columns to both frames in place.
make_label_enc(train, test, cat_features)

# Impute missing numeric values with statistics learned on the training set
# only, and apply the same values to the test set so both frames go through
# an identical transformation.
num_means = train[num_features].mean()
train[num_features] = train[num_features].fillna(num_means)
test[num_features] = test[num_features].fillna(num_means)

# reduce_mem_usage() assigns columns in place, so hand it independent copies.
train = train[num_features + cat_features_enc + ['price']].copy()
test = test[num_features + cat_features_enc].copy()
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

X = train[num_features + cat_features_enc]
y = train['price']
# Fixed seed so the hold-out split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

Memory usage after optimization is: 4.30 MB
Decreased by 67.7%
Memory usage after optimization is: 1.91 MB
Decreased by 69.3%


In [12]:
# Hyper-parameters of the LightGBM base model. `n_estimators` is only an upper
# bound in this cell: early stopping on the hold-out split decides how many
# trees are actually kept.
p1 = {'bagging_fraction': 0.9330343751389898,
      'bagging_freq': 1,
      'feature_fraction': 0.6,
      'lambda_l1': 0.00013416924638094717,
      'lambda_l2': 1.067295369784112e-05,
      'min_child_samples': 5,
      'num_leaves': 31,
      'n_estimators': 10000,
      'learning_rate': 0.01}
model1 = LGBMRegressor(**p1)
# Early stopping and periodic logging are passed as callbacks: the
# `early_stopping_rounds=` and `verbose=` keyword arguments of fit() were
# removed in LightGBM 4.0.
model1.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='l2',
    callbacks=[early_stopping(stopping_rounds=100), log_evaluation(period=100)],
)
# R^2 on the hold-out split, using the iteration selected by early stopping.
print(r2_score(y_test, model1.predict(X_test, num_iteration=model1.best_iteration_)))

Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 2.69693e+10
[200]	valid_0's l2: 1.64696e+10
[300]	valid_0's l2: 1.33737e+10
[400]	valid_0's l2: 1.21612e+10
[500]	valid_0's l2: 1.1584e+10
[600]	valid_0's l2: 1.1206e+10
[700]	valid_0's l2: 1.09148e+10
[800]	valid_0's l2: 1.07001e+10
[900]	valid_0's l2: 1.05323e+10
[1000]	valid_0's l2: 1.04211e+10
[1100]	valid_0's l2: 1.0312e+10
[1200]	valid_0's l2: 1.02248e+10
[1300]	valid_0's l2: 1.01518e+10
[1400]	valid_0's l2: 1.00947e+10
[1500]	valid_0's l2: 1.00374e+10
[1600]	valid_0's l2: 9.98789e+09
[1700]	valid_0's l2: 9.94971e+09
[1800]	valid_0's l2: 9.90573e+09
[1900]	valid_0's l2: 9.87808e+09
[2000]	valid_0's l2: 9.84394e+09
[2100]	valid_0's l2: 9.80932e+09
[2200]	valid_0's l2: 9.78628e+09
[2300]	valid_0's l2: 9.76368e+09
[2400]	valid_0's l2: 9.74632e+09
[2500]	valid_0's l2: 9.72192e+09
[2600]	valid_0's l2: 9.70239e+09
[2700]	valid_0's l2: 9.67421e+09
[2800]	valid_0's l2: 9.64161e+09
[2900]	valid_0's l2: 9.626

In [86]:
# LightGBM base model with the tree budget capped at 6900. Early stopping on
# the hold-out split may still end training sooner.
p1 = {'bagging_fraction': 0.9330343751389898,
      'bagging_freq': 1,
      'feature_fraction': 0.6,
      'lambda_l1': 0.00013416924638094717,
      'lambda_l2': 1.067295369784112e-05,
      'min_child_samples': 5,
      'num_leaves': 31,
      'n_estimators': 6900,
      'learning_rate': 0.01}
model1 = LGBMRegressor(**p1)
# Early stopping and periodic logging are passed as callbacks: the
# `early_stopping_rounds=` and `verbose=` keyword arguments of fit() were
# removed in LightGBM 4.0.
model1.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='l2',
    callbacks=[early_stopping(stopping_rounds=100), log_evaluation(period=100)],
)
# R^2 on the hold-out split, using the iteration selected by early stopping.
print(r2_score(y_test, model1.predict(X_test, num_iteration=model1.best_iteration_)))

Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 2.56309e+10
[200]	valid_0's l2: 1.53335e+10
[300]	valid_0's l2: 1.24725e+10
[400]	valid_0's l2: 1.14198e+10
[500]	valid_0's l2: 1.09425e+10
[600]	valid_0's l2: 1.06148e+10
[700]	valid_0's l2: 1.03902e+10
[800]	valid_0's l2: 1.02164e+10
[900]	valid_0's l2: 1.00681e+10
[1000]	valid_0's l2: 9.95674e+09
[1100]	valid_0's l2: 9.87322e+09
[1200]	valid_0's l2: 9.78959e+09
[1300]	valid_0's l2: 9.72591e+09
[1400]	valid_0's l2: 9.68064e+09
[1500]	valid_0's l2: 9.62682e+09
[1600]	valid_0's l2: 9.57638e+09
[1700]	valid_0's l2: 9.53389e+09
[1800]	valid_0's l2: 9.49562e+09
[1900]	valid_0's l2: 9.45552e+09
[2000]	valid_0's l2: 9.4324e+09
[2100]	valid_0's l2: 9.40257e+09
[2200]	valid_0's l2: 9.38012e+09
[2300]	valid_0's l2: 9.35694e+09
[2400]	valid_0's l2: 9.3366e+09
[2500]	valid_0's l2: 9.32086e+09
[2600]	valid_0's l2: 9.3057e+09
[2700]	valid_0's l2: 9.28374e+09
[2800]	valid_0's l2: 9.265e+09
[2900]	valid_0's l2: 9.24736

In [64]:
# LightGBM base model with the tree budget capped at 6900. Early stopping on
# the hold-out split may still end training sooner.
p1 = {'bagging_fraction': 0.9330343751389898,
      'bagging_freq': 1,
      'feature_fraction': 0.6,
      'lambda_l1': 0.00013416924638094717,
      'lambda_l2': 1.067295369784112e-05,
      'min_child_samples': 5,
      'num_leaves': 31,
      'n_estimators': 6900,
      'learning_rate': 0.01}
model1 = LGBMRegressor(**p1)
# Early stopping and periodic logging are passed as callbacks: the
# `early_stopping_rounds=` and `verbose=` keyword arguments of fit() were
# removed in LightGBM 4.0.
model1.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='l2',
    callbacks=[early_stopping(stopping_rounds=100), log_evaluation(period=100)],
)
# R^2 on the hold-out split, using the iteration selected by early stopping.
print(r2_score(y_test, model1.predict(X_test, num_iteration=model1.best_iteration_)))

Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 2.62746e+10
[200]	valid_0's l2: 1.57319e+10
[300]	valid_0's l2: 1.27934e+10
[400]	valid_0's l2: 1.16327e+10
[500]	valid_0's l2: 1.10552e+10
[600]	valid_0's l2: 1.07168e+10
[700]	valid_0's l2: 1.04827e+10
[800]	valid_0's l2: 1.02922e+10
[900]	valid_0's l2: 1.0163e+10
[1000]	valid_0's l2: 1.00375e+10
[1100]	valid_0's l2: 9.93698e+09
[1200]	valid_0's l2: 9.845e+09
[1300]	valid_0's l2: 9.77821e+09
[1400]	valid_0's l2: 9.7173e+09
[1500]	valid_0's l2: 9.66435e+09
[1600]	valid_0's l2: 9.61451e+09
[1700]	valid_0's l2: 9.58057e+09
[1800]	valid_0's l2: 9.54934e+09
[1900]	valid_0's l2: 9.5186e+09
[2000]	valid_0's l2: 9.4933e+09
[2100]	valid_0's l2: 9.46563e+09
[2200]	valid_0's l2: 9.43941e+09
[2300]	valid_0's l2: 9.4042e+09
[2400]	valid_0's l2: 9.38961e+09
[2500]	valid_0's l2: 9.36693e+09
[2600]	valid_0's l2: 9.34963e+09
[2700]	valid_0's l2: 9.32898e+09
[2800]	valid_0's l2: 9.3123e+09
[2900]	valid_0's l2: 9.2983e+09

In [23]:
# LightGBM base model with the tree budget capped at 6900. Early stopping on
# the hold-out split may still end training sooner.
p1 = {'bagging_fraction': 0.9330343751389898,
      'bagging_freq': 1,
      'feature_fraction': 0.6,
      'lambda_l1': 0.00013416924638094717,
      'lambda_l2': 1.067295369784112e-05,
      'min_child_samples': 5,
      'num_leaves': 31,
      'n_estimators': 6900,
      'learning_rate': 0.01}
model1 = LGBMRegressor(**p1)
# Early stopping and periodic logging are passed as callbacks: the
# `early_stopping_rounds=` and `verbose=` keyword arguments of fit() were
# removed in LightGBM 4.0.
model1.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='l2',
    callbacks=[early_stopping(stopping_rounds=100), log_evaluation(period=100)],
)
# R^2 on the hold-out split, using the iteration selected by early stopping.
print(r2_score(y_test, model1.predict(X_test, num_iteration=model1.best_iteration_)))

Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 2.69475e+10
[200]	valid_0's l2: 1.64416e+10
[300]	valid_0's l2: 1.33256e+10
[400]	valid_0's l2: 1.21238e+10
[500]	valid_0's l2: 1.15466e+10
[600]	valid_0's l2: 1.11387e+10
[700]	valid_0's l2: 1.08593e+10
[800]	valid_0's l2: 1.06628e+10
[900]	valid_0's l2: 1.05054e+10
[1000]	valid_0's l2: 1.03831e+10
[1100]	valid_0's l2: 1.02675e+10
[1200]	valid_0's l2: 1.01851e+10
[1300]	valid_0's l2: 1.01209e+10
[1400]	valid_0's l2: 1.006e+10
[1500]	valid_0's l2: 9.99591e+09
[1600]	valid_0's l2: 9.94616e+09
[1700]	valid_0's l2: 9.89628e+09
[1800]	valid_0's l2: 9.84893e+09
[1900]	valid_0's l2: 9.81482e+09
[2000]	valid_0's l2: 9.78444e+09
[2100]	valid_0's l2: 9.75577e+09
[2200]	valid_0's l2: 9.72637e+09
[2300]	valid_0's l2: 9.70007e+09
[2400]	valid_0's l2: 9.68087e+09
[2500]	valid_0's l2: 9.65512e+09
[2600]	valid_0's l2: 9.63951e+09
[2700]	valid_0's l2: 9.61112e+09
[2800]	valid_0's l2: 9.58346e+09
[2900]	valid_0's l2: 9.56

In [37]:
# Random-forest baseline evaluated on the same hold-out split as the booster.
# The seed is fixed so the reported R^2 can be reproduced; without it every
# run grows a different forest.
model2 = RandomForestRegressor(n_estimators=1000, min_samples_leaf=2, n_jobs=-1, random_state=42)
model2.fit(X_train, y_train)
print(r2_score(y_test, model2.predict(X_test)))

0.8453780971061374


In [31]:
# Hold-out R^2 of the LightGBM model at its early-stopped iteration.
r2_score(
    y_test,
    model1.predict(X_test, num_iteration=model1.best_iteration_),
)

0.8730906723603783

In [13]:
# First-level (base) models for the stacking ensemble. They are left unfitted
# here; the stacking routine fits them itself.
p1 = {'bagging_fraction': 0.9330343751389898,
      'bagging_freq': 1,
      'feature_fraction': 0.6,
      'lambda_l1': 0.00013416924638094717,
      'lambda_l2': 1.067295369784112e-05,
      'min_child_samples': 5,
      'num_leaves': 31,
      'n_estimators': 6900,
      'learning_rate': 0.01}
model1 = LGBMRegressor(**p1)
# Fixed seed so the forest, and therefore the stacked features built from it,
# is the same on every run.
model2 = RandomForestRegressor(n_estimators=1000, min_samples_leaf=2, n_jobs=-1, random_state=42)
models = [model1, model2]

In [14]:
# Build the second-level feature matrices from the base models.
# S_train / S_test hold one column of predictions per base model.
# NOTE(review): `RMSLE` as defined in this notebook is a plain RMSE, so the
# per-fold scores printed here are in price units.
S_train, S_test = stacking(
    models,
    X, y, test,
    regression=True,
    mode='oof_pred_bag',
    n_folds=4,
    shuffle=True,
    metric=RMSLE,
    verbose=2,
)

task:         [regression]
metric:       [RMSLE]
mode:         [oof_pred_bag]
n_models:     [2]

model  0:     [LGBMRegressor]
    fold  0:  [112274.92405356]
    fold  1:  [114958.17029719]
    fold  2:  [101603.11918803]
    fold  3:  [105007.83078980]
    ----
    MEAN:     [108461.01108214] + [5378.52548060]
    FULL:     [108594.28834569]

model  1:     [RandomForestRegressor]
    fold  0:  [121926.88157859]
    fold  1:  [121437.21284571]
    fold  2:  [107353.17008585]
    fold  3:  [112092.96036134]
    ----
    MEAN:     [115702.55621787] + [6212.28508410]
    FULL:     [115869.21075642]



In [15]:
# Second-level model: learns the price from the base models' stacked
# predictions, then scores the stacked test features.
final_model = LGBMRegressor(n_estimators=1600, learning_rate=0.01)
final_model.fit(S_train, y)
preds = final_model.predict(S_test)

In [16]:
# Write the stacked predictions to the submission file.
make_submit(name='real_stack.csv', test=test, preds=preds)