**Note**

This notebook contains only the preprocessing, model-building and prediction code. All the ML experiments were carried out in a separate notebook.

In [4]:
# Install the third-party packages that the import cell below relies on
# (category_encoders and catboost).
!pip install category_encoders
!pip install catboost

Collecting catboost
  Downloading catboost-0.26-cp37-none-manylinux1_x86_64.whl (69.2 MB)
[K     |████████████████████████████████| 69.2 MB 5.1 kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.26


In [5]:
#Import all libraries

import pandas as pd
import numpy as np
import category_encoders as ce

import pandas as pd
import numpy as np
import category_encoders as ce

from sklearn.metrics import mean_squared_log_error , make_scorer
from sklearn.preprocessing import StandardScaler, MinMaxScaler , RobustScaler , power_transform , PowerTransformer, KBinsDiscretizer
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import StackingRegressor

from sklearn.model_selection import KFold, cross_val_score , RepeatedStratifiedKFold 
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.feature_selection import RFE , RFECV
from sklearn.model_selection import RepeatedKFold
from sklearn.neural_network import MLPRegressor
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import QuantileTransformer
from sklearn.model_selection import RandomizedSearchCV

from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline

# Set pandas' 'display.max_rows' option to None so displayed frames are not row-truncated
pd.set_option('display.max_rows', None)

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Install a global "ignore" filter: warnings are suppressed from this point on
warnings.filterwarnings("ignore")

from sklearn.cluster import KMeans

import math

In [8]:
import os
from google.colab import drive

# The project folder lives on Google Drive, so the drive has to be mounted
# before the working directory can be switched to it. Mounting here keeps this
# cell runnable on a fresh kernel (Restart & Run All) regardless of cell order.
drive.mount('/content/drive')

# Relative paths such as 'train.csv' are resolved against this folder from now on.
os.chdir('/content/drive/MyDrive/Saideepak_1st_Place_MATHCO.Thon.zip (Unzipped Files)/Saideepak_1st_Place_MATHCO.Thon')

In [7]:
# Mount Google Drive at /content/drive so files stored there can be accessed by path
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
# Load the train and test splits (paths are relative to the current working directory)
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [10]:
def preprocess(data, reference_year=2021):
    """Clean the raw car-listing columns and append the engineered features.

    The work is done on a copy, so the frame passed in is left untouched and
    the function can be called repeatedly on the same raw input.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw feature frame. The columns read here are 'ID', 'Levy', 'Category',
        'Mileage', 'Engine volume', 'Prod. year', 'Fuel type' and 'Airbags'.
        'Mileage' and 'Engine volume' must contain strings (the first
        space-separated token is parsed as the number); 'Levy' may contain
        '-' as a placeholder for a missing value.
    reference_year : int, default 2021
        Year from which 'Prod. year' is subtracted to obtain 'Car_age'.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with the cleaned columns and the new feature
        columns appended in a fixed order.
    """
    # Never modify the caller's frame: re-running on already-processed
    # columns would fail (e.g. str.split on floats).
    data = data.copy()

    # Keep only the leading number of the 'Mileage' text (the first token)
    mileage = data['Mileage'].apply(lambda x: x.split(" ")[0]).astype(int)

    # Log-transform the mileage; a mileage of 0 is mapped to log(1) = 0 to stay finite
    data['Mileage'] = np.log(mileage.where(mileage != 0, 1))

    # Missing levy ('-') is imputed with 1, which becomes 0 after the log transform
    levy = data['Levy'].apply(lambda x: 1 if x == '-' else x).astype(int)
    data['Levy'] = np.log(levy)

    # Rare body types are merged into a single 'Heavy_vehicle' category
    rare_categories = ('Goods wagon', 'Pickup', 'Cabriolet', 'Limousine')
    data['Category'] = data['Category'].apply(
        lambda x: 'Heavy_vehicle' if x in rare_categories else x)

    # 'Engine volume' text ends with 'Turbo' for turbo engines: split it into a
    # Yes/No flag and the numeric volume (the first token)
    engine_volume = data['Engine volume']
    data['Turbo_engine'] = engine_volume.apply(
        lambda x: 'Yes' if x.split(" ")[-1] == 'Turbo' else 'No')
    data['Engine volume'] = engine_volume.apply(lambda x: x.split(" ")[0]).astype(float)

    # Age of the car relative to the reference year
    data['Car_age'] = reference_year - data['Prod. year']

    # 'Plug-in Hybrid' and 'Hydrogen' are merged into the 'Hybrid-hydrogen' fuel type
    merged_fuels = ('Plug-in Hybrid', 'Hydrogen')
    data['Fuel type'] = data['Fuel type'].apply(
        lambda x: 'Hybrid-hydrogen' if x in merged_fuels else x)

    # 'ID' is used on the log scale by every feature below
    data['ID'] = np.log(data['ID'])

    # Engineered features: combinations of (log) ID with levy, mileage and airbags
    data['ID*Levy'] = np.log((data['ID'] * (data['Levy'] + data['ID'])))
    data['ID*Mileage'] = np.log((data['ID'] * (data['Mileage'] + data['ID'])))
    data['ID*Airbags'] = np.log((data['ID'] * (data['Airbags'] + data['ID'])))

    # Further non-linear transforms of ID, mileage and levy
    data['ID_inverse'] = 1 / data['ID']
    # The 0.5 offset avoids a division by zero when the log-mileage is 0
    data['Mileage_inverse'] = 1 / (data['Mileage'] + 0.5)
    data['sin_ID'] = np.sin(data['ID'])
    data['sqrt_ID'] = np.sqrt(data['ID'])
    data['sqrt_mileage'] = np.sqrt(data['Mileage'])
    data['sqrt_Levy'] = np.sqrt(data['Levy'])
    data['inverse_ID*mileage'] = 1 / data['ID*Mileage']
    data['ID_inverse*sin_ID'] = data['ID_inverse'] * data['sin_ID']
    return data

In [11]:
# Train features: every column except the target 'Price'
x = train.drop(columns='Price')

# The target is modelled on the log scale; predictions are mapped back with the
# exponential function after the models have been blended
y = np.log(train['Price'])

# Test features: the 'Price' column of the test file is dropped as well
x_test = test.drop(columns='Price')

In [12]:
# Run the preprocess function on the train features; the result feeds the encoders below
data = preprocess(x)

In [13]:
# Fit a count encoder on 'Manufacturer', 'Model', 'Color' and 'Doors'
# (handle_unknown=1: presumably categories unseen at fit time are encoded as 1 —
# confirm against the installed category_encoders version)
count_cols = ['Manufacturer', 'Model', 'Color', 'Doors']
encoder_count = ce.CountEncoder(cols=count_cols, handle_unknown=1)
df1 = encoder_count.fit_transform(data)

# Fit a one-hot encoder on the rest of the categorical features
onehot_cols = ['Category', 'Fuel type', 'Leather interior', 'Gear box type',
               'Drive wheels', 'Wheel', 'Turbo_engine']
encoder_onehot = ce.OneHotEncoder(cols=onehot_cols)
train_final = encoder_onehot.fit_transform(df1)

# Extra engineered features built from the encoded columns.
# The columns are appended in the same order as in the test-data cell.
train_mfr_count = train_final['Manufacturer']
train_model_count = train_final['Model']
train_inv_mfr = 1 / train_mfr_count
train_inv_model = 1 / train_model_count

train_final['Inverse_manufacturer'] = train_inv_mfr
train_final['Inverse_model'] = train_inv_model
train_final['model*manufacturer'] = train_model_count * train_mfr_count
train_final['sin_inverse*model*manufacturer'] = np.sin(train_inv_mfr * train_inv_model)
train_final['sin*manufacturer'] = np.sin(train_mfr_count)
# NOTE: despite its name, this column holds the cosine of the encoded 'Model'
train_final['sin*model'] = np.cos(train_model_count)
train_final['mileage*model'] = train_model_count * train_final['Mileage']
train_final['Color*manufacturer'] = train_final['Color'] * train_mfr_count
train_final['ID*model'] = train_final['ID'] * train_model_count
train_final['cos_ID'] = np.cos(train_final['ID'])

In [14]:
# KMeans clustering works on min-max normalised features, so a scaler is
# fitted on the train data and then applied to it
scaler = MinMaxScaler()
scaler.fit(train_final)
data_scaled = scaler.transform(train_final)

In [15]:
# KMeans algorithm is applied on the scaled train data; the cluster id becomes an extra feature
# The number of clusters was selected using an elbow plot that is not included in this notebook
# n_init is pinned to 10: newer scikit-learn releases default to a single
# initialisation for 'k-means++', which would change the resulting clusters
kmeans = KMeans(n_clusters=4, init='k-means++', n_init=10, random_state=1)

# Fit the k-means algorithm on the scaled data, then label every training row
kmeans.fit(data_scaled)
pred = kmeans.predict(data_scaled)
train_final['cluster'] = pred

# The cluster id is categorical, so it is one-hot encoded like the other categoricals
encoder_onehot_cluster = ce.OneHotEncoder(cols=['cluster'])
train_final = encoder_onehot_cluster.fit_transform(train_final)

In [16]:
# Quantile transformation is done on the train data
# This reduces the impact of outliers in the prediction
# NOTE: the name train_final is rebound to the transformer's output here, so
# this cell should not be re-run without re-running the cells above it first
qt = QuantileTransformer(random_state=123 , output_distribution = 'normal' , n_quantiles=1000)


train_final = qt.fit_transform(train_final)


#### The above process is applied to the test data as well

In [17]:
# Run the same preprocess function on the test features
test_val = preprocess(x_test)

In [18]:
# The encoders fitted on the train data are re-used here; nothing is re-fitted on the test data
test_set = encoder_count.transform(test_val)
test_final = encoder_onehot.transform(test_set)

# Same engineered features as for the train data, appended in the same order
test_mfr_count = test_final['Manufacturer']
test_model_count = test_final['Model']
test_inv_mfr = 1 / test_mfr_count
test_inv_model = 1 / test_model_count

test_final['Inverse_manufacturer'] = test_inv_mfr
test_final['Inverse_model'] = test_inv_model
test_final['model*manufacturer'] = test_model_count * test_mfr_count
test_final['sin_inverse*model*manufacturer'] = np.sin(test_inv_mfr * test_inv_model)
test_final['sin*manufacturer'] = np.sin(test_mfr_count)
# NOTE: despite its name, this column holds the cosine of the encoded 'Model'
test_final['sin*model'] = np.cos(test_model_count)
test_final['mileage*model'] = test_model_count * test_final['Mileage']
test_final['Color*manufacturer'] = test_final['Color'] * test_mfr_count
test_final['ID*model'] = test_final['ID'] * test_model_count
test_final['cos_ID'] = np.cos(test_final['ID'])

# Scale with the train-fitted scaler, assign clusters with the train-fitted
# KMeans model and one-hot encode the cluster id
scale_test = scaler.transform(test_final)
test_final['cluster'] = kmeans.predict(scale_test)
test_final = encoder_onehot_cluster.transform(test_final)

# Finally apply the quantile transformer fitted on the train data
test_final = qt.transform(test_final)


In [19]:
# Sanity check: train and test must end up with the same number of columns
for transformed in (train_final, test_final):
    print(transformed.shape)

(19237, 64)
(8245, 64)


#### Model building

In [20]:
# Extra-trees regressor fitted on the transformed train data (log-price target)
from sklearn.ensemble import ExtraTreesRegressor
etr_params = dict(n_estimators=400, max_depth=45, random_state=123)
etr = ExtraTreesRegressor(**etr_params)
etr.fit(train_final, y)

ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse',
                    max_depth=45, max_features='auto', max_leaf_nodes=None,
                    max_samples=None, min_impurity_decrease=0.0,
                    min_impurity_split=None, min_samples_leaf=1,
                    min_samples_split=2, min_weight_fraction_leaf=0.0,
                    n_estimators=400, n_jobs=None, oob_score=False,
                    random_state=123, verbose=0, warm_start=False)

In [21]:
# LightGBM regressor fitted on the transformed train data (log-price target)
from lightgbm import LGBMRegressor
lgbm_params = dict(
    n_estimators=1000,
    learning_rate=0.01,
    num_leaves=750,
    max_bin=1200,
    random_state=123,
)
lgbm = LGBMRegressor(**lgbm_params)
lgbm.fit(train_final, y)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.01, max_bin=1200,
              max_depth=-1, min_child_samples=20, min_child_weight=0.001,
              min_split_gain=0.0, n_estimators=1000, n_jobs=-1, num_leaves=750,
              objective=None, random_state=123, reg_alpha=0.0, reg_lambda=0.0,
              silent=True, subsample=1.0, subsample_for_bin=200000,
              subsample_freq=0)

In [22]:
# Fitting the xgboost regressor model on the train data (log-price target)
# XGBRegressor is the class instantiated below, so that is the one imported
# here (not XGBRFRegressor, the random-forest variant).
from xgboost import XGBRegressor
xgb = XGBRegressor(random_state=123, max_depth=7, learning_rate=0.2, n_estimators=1500)
xgb.fit(train_final, y)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.2, max_delta_step=0,
             max_depth=7, min_child_weight=1, missing=None, n_estimators=1500,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=123,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [23]:
# Random-forest regressor fitted on the transformed train data (log-price target)
from sklearn.ensemble import RandomForestRegressor
rf_params = dict(n_estimators=600, max_depth=45, random_state=123)
rf = RandomForestRegressor(**rf_params)
rf.fit(train_final, y)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=45, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=600, n_jobs=None, oob_score=False,
                      random_state=123, verbose=0, warm_start=False)

In [None]:
# CatBoost regressor fitted on the transformed train data (log-price target)
cat_params = dict(max_depth=14, random_state=123)
cat = CatBoostRegressor(**cat_params)
cat.fit(train_final, y)

Learning rate set to 0.066066
0:	learn: 1.5436341	total: 1.94s	remaining: 32m 14s
1:	learn: 1.4990806	total: 3.58s	remaining: 29m 48s
2:	learn: 1.4561219	total: 5.24s	remaining: 29m 1s
3:	learn: 1.4173067	total: 6.92s	remaining: 28m 44s
4:	learn: 1.3821112	total: 8.55s	remaining: 28m 22s
5:	learn: 1.3491819	total: 10.2s	remaining: 28m 8s
6:	learn: 1.3184880	total: 11.8s	remaining: 27m 57s
7:	learn: 1.2879364	total: 13.5s	remaining: 27m 49s
8:	learn: 1.2625607	total: 15.1s	remaining: 27m 41s
9:	learn: 1.2397270	total: 16.7s	remaining: 27m 35s
10:	learn: 1.2182291	total: 18.4s	remaining: 27m 31s
11:	learn: 1.1979322	total: 20s	remaining: 27m 27s
12:	learn: 1.1774269	total: 21.7s	remaining: 27m 23s
13:	learn: 1.1610312	total: 23.3s	remaining: 27m 21s
14:	learn: 1.1448515	total: 25s	remaining: 27m 18s
15:	learn: 1.1266757	total: 26.6s	remaining: 27m 15s
16:	learn: 1.1105187	total: 28.2s	remaining: 27m 13s
17:	learn: 1.0976657	total: 29.9s	remaining: 27m 10s
18:	learn: 1.0830383	total: 31.6

<catboost.core.CatBoostRegressor at 0x7f0398d9f9d0>

#### Prediction

In [None]:
# Blend the predictions of all five models (still on the log-price scale)
# The weights were assigned to the models by trial and error
blend = [(rf, 0.05), (xgb, 0.1), (etr, 0.45), (cat, 0.1), (lgbm, 0.3)]
weighted_preds = [weight * model.predict(test_final) for model, weight in blend]

# Accumulate left to right, in the order listed above
ypred = weighted_preds[0]
for weighted_part in weighted_preds[1:]:
    ypred = ypred + weighted_part

In [None]:
# Undo the log transform of the target and write the submission file
ypred1 = pd.DataFrame(np.exp(ypred), columns=['Price'])
ypred1.to_csv('final_submission.csv', index=False)