# Mercedes competition

Mercedes and Sberbank were my first two non-training Kaggle competitions. Unfortunately, I started working on them only a few days before the deadline, so my results are not very impressive. Nevertheless, they gave me a good understanding of what the data in such competitions can look like.

The Mercedes competition consists of predicting the testing time required for Mercedes cars, using each car's technical characteristics and other data about the car.

In [1]:
#importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn
%matplotlib inline

from sklearn.model_selection import cross_val_score, GridSearchCV, KFold
from sklearn.preprocessing import Imputer, StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline

In [2]:
# Load the competition data.
# In both files the first row holds the column names and the first column
# is used as the row index.
df_train, df_test = (
    pd.read_csv(csv_path, header=0, index_col=0)
    for csv_path in ('train.csv', 'test.csv')
)
# The target is the 'y' column of the train file.
y_train = df_train.loc[:, 'y']
print(df_train.shape, df_test.shape)
df_train.head()

(4209, 377) (4209, 376)


Unnamed: 0_level_0,y,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,130.81,k,v,at,a,d,u,j,o,0,...,0,0,1,0,0,0,0,0,0,0
6,88.53,k,t,av,e,d,y,l,o,0,...,1,0,0,0,0,0,0,0,0,0
7,76.26,az,w,n,c,d,x,j,x,0,...,0,0,0,0,0,0,1,0,0,0
9,80.62,az,t,n,f,d,x,l,e,0,...,0,0,0,0,0,0,0,0,0,0
13,78.02,az,v,n,f,d,h,d,n,0,...,0,0,0,0,0,0,0,0,0,0


So we see here two particular features of this data set:

* the data is anonymized: we know neither the meaning of the features nor the meaning of the categories of the categorical features
* all features are either categorical (first eight features) or binary

So here we have to predict a continuous target using only discrete features. Moreover, we see that 12 features are constant in the train data set, so we will drop them.

In [3]:
# Count the distinct values of every column (the target 'y' included).
# After the cast to object, the 'unique' row of describe() holds the number
# of distinct values per column; sort it from most to fewest.
summary = df_train.astype('object').describe()
cat_sorted = summary.loc['unique'].sort_values(ascending=False)
cat_sorted

y       2545
X0        47
X2        44
X5        29
X1        27
X8        25
X6        12
X3         7
X4         4
X137       2
X128       2
X129       2
X130       2
X131       2
X132       2
X133       2
X134       2
X135       2
X136       2
X141       2
X138       2
X139       2
X140       2
X126       2
X142       2
X143       2
X144       2
X145       2
X146       2
X127       2
        ... 
X265       2
X267       2
X263       2
X255       2
X248       2
X249       2
X250       2
X251       2
X252       2
X253       2
X262       2
X254       2
X256       2
X261       2
X257       2
X258       2
X259       2
X260       2
X233       1
X93        1
X289       1
X330       1
X293       1
X107       1
X268       1
X347       1
X297       1
X11        1
X235       1
X290       1
Name: unique, dtype: object

In [4]:
# Remove constant features.
# A column with a single distinct value in the train set carries no
# information for the model; the same columns are dropped from the test set
# so that train and test keep the same feature columns.
to_drop = [name for name, n_unique in cat_sorted.items() if n_unique == 1]
for frame in (df_train, df_test):
    frame.drop(to_drop, axis=1, inplace=True)

In [5]:
# Split the columns into the categorical part and the binary part.
# df_train has the target 'y' in position 0, so its feature columns start
# one position later than in df_test.
# .copy() makes the categorical frames independent objects: they are
# modified further down, and writing into a plain iloc slice raises
# SettingWithCopyWarning.
df_train_cat = df_train.iloc[:, 1:9].copy()
df_test_cat = df_test.iloc[:, :8].copy()
# Everything after the 8 categorical columns is treated as binary.
X_train_bin = df_train.iloc[:, 9:].values
X_test_bin = df_test.iloc[:, 8:].values

As we have seen, there are 8 categorical features and all the others are binary. We will try three strategies for dealing with the categorical features and choose the best one:

* we will transform categorical features into dummy variables (so we'll have only binary features in our data set)
* we will give a numerical value for every category
* we will replace every category with its relative frequency

In [6]:
#categorical features
# Build three encodings of the categorical columns. The input frames
# df_train_cat / df_test_cat are not modified, so the cell can be re-run.
n_train = len(df_train_cat)
df_all = pd.concat([df_train_cat, df_test_cat])

# 1) Dummy variables: train and test are encoded together so that both get
#    exactly the same dummy columns.
df_dummies = pd.get_dummies(df_all)
X_train_dummies = df_dummies.iloc[:n_train, :].values
X_test_dummies = df_dummies.iloc[n_train:, :].values

# 2) Label encoding: one integer per category, fitted on train + test so
#    that every category occurring in either set has a code.
le = LabelEncoder()
df_le = df_all.copy()
for feature in df_all.columns:
    df_le[feature] = le.fit_transform(df_all[feature].values)
X_train_le = df_le.iloc[:n_train, :].values
X_test_le = df_le.iloc[n_train:, :].values

# 3) Frequency encoding: every category is replaced by its relative
#    frequency in the TRAIN set. The same mapping is applied to the test set
#    so that a category has the same value in both sets (computing the
#    frequencies separately on the test set would give it a different one).
#    Categories that never occur in train get frequency 0.
df_train_freq = pd.DataFrame(index=df_train_cat.index)
df_test_freq = pd.DataFrame(index=df_test_cat.index)
for feature in df_train_cat.columns:
    train_frequency = df_train_cat[feature].value_counts() / n_train
    df_train_freq[feature] = df_train_cat[feature].map(train_frequency)
    df_test_freq[feature] = df_test_cat[feature].map(train_frequency).fillna(0)
X_train_freq = df_train_freq.values
X_test_freq = df_test_freq.values

So we have created three data sets. If we treat all features as binary (dummy variables), we get 567 features instead of 364.

In [7]:
# train test sets
# Append the binary columns to the right of each categorical encoding.
# NOTE: the matrices are extended in place (same names), so running this
# cell twice would append the binary columns twice.
X_train_dummies = np.hstack((X_train_dummies, X_train_bin))
X_train_le = np.hstack((X_train_le, X_train_bin))
X_train_freq = np.hstack((X_train_freq, X_train_bin))

X_test_dummies = np.hstack((X_test_dummies, X_test_bin))
X_test_le = np.hstack((X_test_le, X_test_bin))
X_test_freq = np.hstack((X_test_freq, X_test_bin))

print(X_train_dummies.shape, X_train_le.shape, X_train_freq.shape)

(4209, 567) (4209, 364) (4209, 364)


As one might expect, linear regression is useless here: the cross-validated R² scores are hugely negative.

In [8]:
# linear regression
# Baseline: ordinary least squares, scored by the mean R^2 over 4 shuffled
# folds, once per categorical encoding.
# kf has a fixed random_state and is reused by the following cells.
kf = KFold(n_splits=4, shuffle=True, random_state=147)
linreg = LinearRegression()
r2_dummies, r2_le, r2_freq = (
    cross_val_score(linreg, features, y_train, cv=kf, scoring='r2').mean()
    for features in (X_train_dummies, X_train_le, X_train_freq)
)
print(r2_dummies, r2_le, r2_freq)

-2.90199190359e+23 -3.62084835444e+18 -3.292542421e+20


Random forest works much better on data that has only categorical and binary features.

In [9]:
# random forest
# Same comparison as for linear regression: mean cross-validated R^2 of a
# 100-tree forest for each of the three categorical encodings.
rf_reg = RandomForestRegressor(n_estimators=100, random_state=147)
r2_dummies, r2_le, r2_freq = (
    cross_val_score(rf_reg, features, y_train, cv=kf, scoring='r2').mean()
    for features in (X_train_dummies, X_train_le, X_train_freq)
)
print(r2_dummies, r2_le, r2_freq)

0.500385123258 0.493534157463 0.482674752432


Gradient boosting works even better. Note that for random forest the dummy variables were the most effective strategy, whereas for gradient boosting the category frequency gives the best result.

In [10]:
# gradient boosting
# Mean cross-validated R^2 of a 100-stage gradient boosting regressor for
# each of the three categorical encodings.
gb_reg = GradientBoostingRegressor(n_estimators=100, random_state=147)
r2_dummies, r2_le, r2_freq = (
    cross_val_score(gb_reg, features, y_train, cv=kf, scoring='r2').mean()
    for features in (X_train_dummies, X_train_le, X_train_freq)
)
print(r2_dummies, r2_le, r2_freq)

0.553578660386 0.561805064686 0.5670388735


Here we perform feature selection to improve the models' performance and to reduce the training time. As we can see, feature selection slightly improves both models.

In [11]:
# feature selection
# Each model is fitted on its best encoding (dummies for the forest,
# frequencies for boosting) and SelectFromModel, with its default threshold,
# keeps a subset of the columns based on the fitted model.
# NOTE(review): both selectors are fitted on the whole train set before
# cross-validation, so the CV scores printed below are computed on features
# that were chosen using the validation folds as well and may be optimistic.

# random forest on dummy variables
rf_reg.fit(X_train_dummies, y_train)
rf_dummies_model = SelectFromModel(rf_reg, prefit=True)
X_train_dummies_rf = rf_dummies_model.transform(X_train_dummies)
X_test_dummies_rf = rf_dummies_model.transform(X_test_dummies)
r2_rf_model = cross_val_score(rf_reg, X_train_dummies_rf, y_train,
                              cv=kf, scoring='r2').mean()

# gradient boosting on category frequencies
gb_reg.fit(X_train_freq, y_train)
gb_freq_model = SelectFromModel(gb_reg, prefit=True)
X_train_freq_gb = gb_freq_model.transform(X_train_freq)
X_test_freq_gb = gb_freq_model.transform(X_test_freq)
r2_gb_model = cross_val_score(gb_reg, X_train_freq_gb, y_train,
                              cv=kf, scoring='r2').mean()

print(X_train_dummies_rf.shape, X_train_freq_gb.shape, r2_rf_model, r2_gb_model)

(4209, 84) (4209, 79) 0.509206738668 0.571004474253


Hyperparameter tuning by grid search.

In [17]:
# Grid search for the random forest on the selected dummy features,
# using the same folds (kf) and metric (R^2) as the earlier comparisons.
# None (use all features at every split) replaces the former 'auto' option:
# for a forest regressor 'auto' meant the same thing, and the string is no
# longer accepted by scikit-learn >= 1.3.
grid_rf = {
    'n_estimators': [1250, 1500, 1750],
    'max_features': [None, 'sqrt', 'log2'],
}
gs_rf = GridSearchCV(rf_reg, grid_rf, cv=kf, scoring='r2')
gs_rf.fit(X_train_dummies_rf, y_train)
print(gs_rf.best_score_, gs_rf.best_params_)

0.539962237452 {'n_estimators': 1500, 'max_features': 'sqrt'}


In [15]:
# Grid search for gradient boosting on the selected frequency features:
# a fine grid over the number of stages, the learning rate and the tree depth,
# scored with R^2 on the same folds (kf) as before.
grid_gb = dict(
    n_estimators=list(range(165, 186, 5)),
    learning_rate=[0.065, 0.07, 0.075, 0.08, 0.085],
    max_depth=[2, 3, 4],
)
gs_gb = GridSearchCV(gb_reg, grid_gb, cv=kf, scoring='r2')
gs_gb.fit(X_train_freq_gb, y_train)
print(gs_gb.best_score_, gs_gb.best_params_)

0.577027638703 {'max_depth': 2, 'learning_rate': 0.075, 'n_estimators': 175}


In [21]:
# Final submission: equal-weight blend of the two tuned models.
# To submit a single model instead, assign only rf_pred or only gb_pred
# to df_test['y'].
rf_pred = gs_rf.predict(X_test_dummies_rf)
gb_pred = gs_gb.predict(X_test_freq_gb)
df_test['y'] = rf_pred / 2 + gb_pred / 2

# Only the prediction column is written, together with the frame's index.
submission = df_test[['y']]
submission.to_csv('prediction.csv', sep=',', header=True, index=True)

Best public score: 0.55454, best private score: 0.54532. In this competition about 20% of the test data was used for the public score and 80% for the private one, so the difference between the two leaderboards is huge. For example, the top 3 on the public leaderboard lost about 1500 places on the private one. I lost about 700 places.