In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import sys
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from tqdm.notebook import tqdm
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression

from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MultiLabelBinarizer
from datetime import datetime


pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)

In [2]:
# всегда фиксируйте RANDOM_SEED, чтобы ваши эксперименты были воспроизводимы!
RANDOM_SEED = 42

# Setup

In [3]:
VERSION    = 11
#DIR_TRAIN  = '../input/sf-autoru-solve-v4/' # подключил к ноутбуку свой внешний датасет
#DIR_TEST   = '../input/sf-dst-car-price/'
VAL_SIZE   = 0.33   # 33%
N_FOLDS    = 5

# CATBOOST
ITERATIONS = 2000
LR         = 0.1

# Data

In [4]:
#cars = pd.read_csv('united_3.csv')
cars = pd.read_csv('cars_new.csv')
#cars.drop(columns=['Unnamed: 0'], inplace=True)
cars.head()

Unnamed: 0,test,style,body,color,fuel,model_date,doors,production_date,transmission,engine_displacement,engine_power,mileage,drive_type,wheel,condition,owners,vehicle_title,features,price,tax,tax_per_horse,days_in_use,mileage_per_day,years_in_use
0,0,520i,body 1,color 3,бензин,model date 6,4,1991,механическая,eng dis 2,129,300000.0,задний,Левый,Не требует ремонта,3,Оригинал,Unknown,60000.0,4515.0,35,10799,27.78035,29
1,0,316i,body 1,color 1,бензин,model date 7,4,1992,механическая,eng dis 4,102,300000.0,задний,Левый,Не требует ремонта,3,Дубликат,Unknown,68000.0,2550.0,25,10434,28.752156,28
2,0,318i,body 1,color 5,бензин,model date 7,4,1994,механическая,eng dis 5,115,350000.0,задний,Левый,Не требует ремонта,3,Дубликат,Unknown,70000.0,2875.0,25,9703,36.071318,26
3,0,520i,body 1,color 1,бензин,model date 6,4,1992,механическая,eng dis 2,150,100000.0,задний,Левый,Не требует ремонта,3,Оригинал,Unknown,70000.0,5250.0,35,10434,9.584052,28
4,0,520i,body 1,color 1,бензин,model date 6,4,1982,механическая,eng dis 2,125,250000.0,задний,Левый,Не требует ремонта,1,Оригинал,Unknown,70000.0,3125.0,25,14086,17.748119,38


In [5]:
cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10168 entries, 0 to 10167
Data columns (total 24 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   test                 10168 non-null  int64  
 1   style                10168 non-null  object 
 2   body                 10168 non-null  object 
 3   color                10168 non-null  object 
 4   fuel                 10168 non-null  object 
 5   model_date           10168 non-null  object 
 6   doors                10168 non-null  int64  
 7   production_date      10168 non-null  int64  
 8   transmission         10168 non-null  object 
 9   engine_displacement  10167 non-null  object 
 10  engine_power         10168 non-null  int64  
 11  mileage              10168 non-null  float64
 12  drive_type           10168 non-null  object 
 13  wheel                10168 non-null  object 
 14  condition            10168 non-null  object 
 15  owners               10168 non-null 

In [6]:
cars['style'] = cars.apply(lambda x: 'Unknown' if pd.isna(x.style) else x.style, axis=1)
cars['engine_displacement'] = cars.apply(lambda x: 'eng dis 1' if pd.isna(x.engine_displacement) else x.engine_displacement, axis=1)

In [7]:
#перевод категориальных str в int, для методов которым на вход необходимо int.
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MultiLabelBinarizer

le = LabelEncoder()

le.fit(cars['style'])
cars['style'] = le.transform(cars['style'])

le.fit(cars['body'])
cars['body'] = le.transform(cars['body'])


le.fit(cars['color'])
cars['color'] = le.transform(cars['color'])

le.fit(cars['fuel'])
cars['fuel'] = le.transform(cars['fuel'])

le.fit(cars['model_date'])
cars['model_date'] = le.transform(cars['model_date'])

le.fit(cars['transmission'])
cars['transmission'] = le.transform(cars['transmission'])

le.fit(cars['engine_displacement'])
cars['engine_displacement'] = le.transform(cars['engine_displacement'])

le.fit(cars['drive_type'])
cars['drive_type'] = le.transform(cars['drive_type'])

le.fit(cars['wheel'])
cars['wheel'] = le.transform(cars['wheel'])

le.fit(cars['condition'])
cars['condition'] = le.transform(cars['condition'])

le.fit(cars['vehicle_title'])
cars['vehicle_title'] = le.transform(cars['vehicle_title'])

In [8]:
train = cars[cars['test'] == 0]
train.reset_index(drop=True)
test = cars[cars['test'] == 1]
test.reset_index(drop=True)
sample_submission = pd.read_csv('sample_submission.csv')

# Data Preprocessing

In [9]:
def preproc_data(df_input):
    '''includes several functions to pre-process the predictor data.'''
    
    df_output = df_input.copy()
    
    # ################### Предобработка ############################################################## 
    # убираем не нужные для модели признаки
    #df_output.drop(['Таможня', 'Состояние', 'id'], axis=1, inplace=True,)
    
    
    # ################### Feature Engineering ####################################################
    df_output['years_in_use'] = df_output['production_date'].apply(lambda x: datetime.now().year-x) #количество лет использования машины
    
    df_output['mileage_per_day'] = df_output['mileage'] / (df_output['years_in_use'] * 365) #пробег машины в день
    df_output['mileage_per_day'] = df_output.apply(lambda x: x.mileage_per_day if x.years_in_use !=0 else 0, axis=1)
    
    # Переведем комплектацию в dummy-переменные
    #df_output['features'] = df_output.apply(lambda x: '[Unknown]' if x.features == 'Unknown' else x.features, axis=1)
    #df_output['features'] = df_output['features'].apply(lambda x: str(x)[1:-1])
    #df_output['features'] = df_output['features'].apply(lambda x: x.replace("'", ""))
    #features = df_output['features'].str.get_dummies(sep=',')
    #df_output = df_output.join(features)
    
    # ################### fix ############################################################## 
    # Переводим признаки из float в int (иначе catboost выдает ошибку)
    for feature in ['engine_power', 'mileage', 'production_date', 'owners','tax', 'price', 'mileage_per_day']:
        df_output[feature]=df_output[feature].astype('int32')
    #for feature in ['mileage', 'price', 'tax', 'mileage_per_day']:
        #df_output[feature]=df_output[feature].astype('int32')
    
    
    # ################### Clean #################################################### 
    # убираем признаки которые еще не успели обработать, 
    #df_output.drop(['description', 'features', 'test'], axis=1, inplace=True)
    df_output.drop(['features'], axis=1, inplace=True)
    
    return df_output

In [10]:
train_preproc = preproc_data(train)
X_sub = preproc_data(test)

In [11]:
X = train_preproc.drop(['price'], axis=1)
y = train_preproc.price.values
X_sub = X_sub.drop(['price'], axis=1)

# Train Split

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=VAL_SIZE, shuffle=True, random_state=RANDOM_SEED)

# CatBoost

In [13]:
# чтобы не писать весь список этих признаков, просто вывел их через nunique(). и так сойдет)
X_train.nunique()

test                      1
style                   152
body                      4
color                     6
fuel                      3
model_date                9
doors                     4
production_date          36
transmission              3
engine_displacement       5
engine_power             68
mileage                1614
drive_type                3
wheel                     1
condition                 1
owners                    3
vehicle_title             2
tax                      68
tax_per_horse             8
days_in_use              36
mileage_per_day         152
years_in_use             36
dtype: int64

In [14]:
# Keep list of all categorical features in dataset to specify this for CatBoost
cat_features_ids = np.where(X_train.apply(pd.Series.nunique) < 3000)[0].tolist()

In [16]:
X_sub.reset_index(drop=True)

Unnamed: 0,test,style,body,color,fuel,model_date,doors,production_date,transmission,engine_displacement,engine_power,mileage,drive_type,wheel,condition,owners,vehicle_title,tax,tax_per_horse,days_in_use,mileage_per_day,years_in_use
0,1,87,0,0,2,2,4,2017,0,1,190,158836,0,0,0,1,1,9500,50,1302,145,3
1,1,38,0,0,2,0,4,2019,0,1,150,10,0,0,0,1,1,5250,35,572,0,1
2,1,114,0,0,0,4,4,2012,0,0,407,120000,2,0,0,2,1,61050,150,3129,41,8
3,1,32,1,0,2,2,5,2015,0,0,249,111466,2,0,0,2,1,18675,75,2033,61,5
4,1,163,1,0,2,2,5,2019,0,0,381,11891,2,0,0,1,1,57150,150,572,32,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3832,1,89,0,0,0,1,4,2013,0,1,184,98000,0,0,0,3,1,9200,50,2763,38,7
3833,1,32,1,0,2,1,5,2016,0,0,249,169615,2,0,0,1,1,18675,75,1668,116,4
3834,1,95,0,0,0,5,4,2006,0,3,218,180000,2,0,0,3,1,14170,65,5320,35,14
3835,1,98,0,0,0,4,4,2012,0,1,245,137500,2,0,0,3,1,18375,75,3129,47,8


## Stacking

Давайте сначала разберемся, что-же такое этот Stacking.

Начнем с Ансамбля моделей:
Допустим, вы обучили Различные модели. Теперь мы можем просто объединить их предсказания и получить средневзвешенное предсказание по всем моделям. При этом, чем разнообразней модели - тем лучше результат мы получим. 

А теперь Stacking:
У нас есть предсказания от разных моделей, почему бы не использовать их как новые признаки/фитчи и не построить поверх этих предсказаний новую модель? Это основная идея Stacking-a.

Далее его еще можно бесконечно усложнять. Например, добавляя модели обученные на разных выборках и/или с разным составом признаков (bagging), или увеличивая уровни стекинга.

In [18]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from collections import defaultdict


def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true))

def print_regression_metrics(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    mape = mean_absolute_percentage_error(y_true, y_pred)
    print(f'RMSE = {rmse:.2f}, MAE = {mae:.2f}, R-sq = {r2:.2f}, MAPE = {mape:.2f} ')

In [19]:
from sklearn.base import clone
def compute_meta_feature(clf, X_train, X_test, y_train, cv):
    """
    Computes meta-features usinf the classifier cls
    
    :arg model: scikit-learn classifier
    :arg X_train, y_train: training set
    :arg X_test: testing set
    :arg cv: cross-validation folding
    """
    
    X_meta_train = np.zeros_like(y_train, dtype = np.float32)
    X_meta_test = np.zeros(len(X_test), dtype=np.float32)
    for train_fold_index, predict_fold_index in cv.split(X_train):
        X_fold_train, X_fold_predict = X_train[train_fold_index], X_train[predict_fold_index]
        y_fold_train = y_train[train_fold_index]
        
        folded_clf = clone(clf)
        folded_clf.fit(X_fold_train, y_fold_train)
        X_meta_train[predict_fold_index] = folded_clf.predict(X_fold_predict)
        
        print_regression_metrics(X_meta_train[predict_fold_index], y_train[predict_fold_index])
        X_meta_test += folded_clf.predict(X_test)
    
    n = cv.n_splits
    X_meta_test = X_meta_test / n
    
    return X_meta_train, X_meta_test

In [20]:
def generate_meta_features(regressors, X_train, X_test, y_train, cv):
   
    features = [
        compute_meta_feature(clf, X_train, X_test, y_train, cv)
        for clf in tqdm(regressors)
    ]
    
    stacked_features_train = np.stack([
        features_train for features_train, features_test in features
    ], axis=-1)

    stacked_features_test = np.stack([
        features_test for features_train, features_test in features
    ], axis=-1)
    
    return stacked_features_train, stacked_features_test

In [21]:
#X = train_preproc.drop(['price'], axis=1)
#y = train_preproc.price.values
#X_sub = X_sub.drop(['price'], axis=1)

#X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size = VAL_SIZE, random_state=RANDOM_SEED)

cv = KFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

def compute_metric(clf, X_train=X_train, y_train=y_train, X_test=X_test):
    clf.fit(X_train, y_train)
    y_test_pred = clf.predict(X_test)
    return print_regression_metrics(y_test, y_test_pred)

In [25]:
from sklearn.preprocessing import StandardScaler
# Стандартизируем данные:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)
X_sub = scaler.fit_transform(X_sub)

stacked_features_train, stacked_features_test = generate_meta_features([
    RandomForestRegressor(n_estimators=100, random_state=RANDOM_SEED),
    BaggingRegressor(ExtraTreesRegressor(n_estimators=100, random_state=RANDOM_SEED)),
    CatBoostRegressor(loss_function = 'MAE',
                         eval_metric = 'MAPE',
                         learning_rate=0.005,
                         iterations=4500,
                         l2_leaf_reg=2,
                         depth=6,
                         bootstrap_type = 'Bayesian',
                         random_seed=42,
                         od_type='Iter',
                         od_wait=100)
    ], X_train, X_sub, y_train, cv)


#Строим мета-алгоритм

final_model = LinearRegression()
final_model.fit(stacked_features_train, y_train)

# compute_metric(final_model, X_train=stack_future_train, X_test=stack_future_test, y_train=y_train)

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

RMSE = 346992.02, MAE = 153914.27, R-sq = 0.91, MAPE = 0.13 
RMSE = 365811.00, MAE = 180178.91, R-sq = 0.87, MAPE = 0.15 
RMSE = 337803.03, MAE = 185668.25, R-sq = 0.92, MAPE = 0.15 
RMSE = 324522.31, MAE = 170072.45, R-sq = 0.92, MAPE = 0.15 
RMSE = 266440.34, MAE = 158020.42, R-sq = 0.94, MAPE = 0.14 
RMSE = 318975.77, MAE = 152684.49, R-sq = 0.93, MAPE = 0.13 
RMSE = 332935.60, MAE = 168185.04, R-sq = 0.89, MAPE = 0.15 
RMSE = 318790.29, MAE = 181186.28, R-sq = 0.93, MAPE = 0.14 
RMSE = 315770.23, MAE = 165397.92, R-sq = 0.92, MAPE = 0.15 
RMSE = 277172.78, MAE = 162291.56, R-sq = 0.94, MAPE = 0.14 
0:	learn: 0.7904926	total: 6.3ms	remaining: 28.3s
1:	learn: 0.7873030	total: 11.2ms	remaining: 25.1s
2:	learn: 0.7838232	total: 16ms	remaining: 24s
3:	learn: 0.7804052	total: 21ms	remaining: 23.6s
4:	learn: 0.7772165	total: 25.7ms	remaining: 23.1s
5:	learn: 0.7739279	total: 30.7ms	remaining: 23s
6:	learn: 0.7705893	total: 33.9ms	remaining: 21.7s
7:	learn: 0.7671651	total: 38.5ms	remainin

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [26]:
y_pred = np.round((final_model.predict(stacked_features_test)/1000))*1000

sample_submission = pd.read_csv('sample_submission.csv')
sample_submission['price'] =  y_pred
sample_submission.to_csv(f'submission_stack_v1.csv', index=False)

sample_submission.head(10)

Unnamed: 0,id,price
0,0,1498000.0
1,1,2217000.0
2,2,1038000.0
3,3,2006000.0
4,4,5568000.0
5,5,1569000.0
6,6,1009000.0
7,7,653000.0
8,8,1199000.0
9,9,1268000.0
