In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats
import warnings;
from pysqldf import SQLDF
import pandasql as psql
from matplotlib.ticker import FuncFormatter
from sklearn.model_selection import KFold
import sklearn.ensemble as ske
import lightgbm as lgb
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

warnings.filterwarnings('ignore')

In [2]:
train01 = pd.read_csv("C:\\Kaggle\\Cars\\Data\\TrnDataForLGB.csv")
test01 = pd.read_csv("C:\\Kaggle\\Cars\\Data\\TstDataForLGB.csv")
train01['Price'].describe()

count    6019.000000
mean        2.018429
std         0.748221
min         0.364643
25%         1.504077
50%         1.893112
75%         2.393339
max         5.081404
Name: Price, dtype: float64

In [3]:
train01['Price'] = np.exp(train01['Price']) - 1
train01['Price'] = train01['Price']**0.1
train01['Price'].describe()

count    6019.000000
mean        1.204875
std         0.107454
min         0.921182
25%         1.133462
50%         1.188852
75%         1.258295
max         1.661162
Name: Price, dtype: float64

In [4]:
Remove_List = ["id","Price","Name","Lag_Price2_MIN","Lag_Price2_MAX",
               "Lag_Price3_MIN","Lag_Price3_MAX","Lag_Price","Lag_Price3","Engine_Group",
               "Power_Group","TrainTestInd","CarCompName","RateChng1","RateChng2","RateChng3",
               "Lag_Price4_MIN","Lag_Price4_MAX","Lag_Price4_MIN_BY_MAX"]
feature_names = list(set(list(train01.columns)) - set(Remove_List))
feature_names

['CompName_PORSCHE',
 'CompName_SKODA',
 'CompName_FORD',
 'CompName_MAHINDRA',
 'CompNameCarName_Q5',
 'CompName_HONDA',
 'CompNameCarName_ETIOS',
 'CompNameCarName_SCORPIO',
 'Fuel_TypeDiesel',
 'CompName_BMW',
 'CompName_CHEVROLET',
 'CompNameCarName_B',
 'CompNameCarName_MOBILIO',
 'CompName_VOLKSWAGEN',
 'CompNameCarName_ELITE',
 'CompNameCarName_BOLERO',
 'CompName_RENAULT',
 'LocationDelhi',
 'New_Price',
 'Lag_Price2',
 'CompNameCarName_E-CLASS',
 'CompNameCarName_FIESTA',
 'LocationPune',
 'CompNameCarName_SSANGYONG',
 'CompNameCarName_LAURA',
 'LocationKochi',
 'Owner_TypeThird',
 'CompNameCarName_BRIO',
 'CompNameCarName_DUSTER',
 'Year',
 'CompNameCarName_XYLO',
 'Owner_TypeSecond',
 'CompNameCarName_KUV',
 'CompName_JEEP',
 'CompNameCarName_ACCENT',
 'CompNameCarName_OPTRA',
 'CompNameCarName_DZIRE',
 'CompNameCarName_SUNNY',
 'CompNameCarName_A-STAR',
 'CompName_MINI',
 'CompNameCarName_ERTIGA',
 'CompName_DATSUN',
 'CompNameCarName_CR-V',
 'CompNameCarName_CRETA',
 'Comp

In [5]:
train02 = train01.copy(deep=True)
train02.reset_index(drop = True, inplace = True)
kf = KFold(n_splits = 5, shuffle = True, random_state = 100)
kf.get_n_splits(train02)

5

In [15]:
IterationNum = 1
for train_index, test_index in kf.split(train02):
    print("Running CV Iteration Num :", IterationNum)
    MOD_DATA_2_TRAIN, MOD_DATA_2_TEST = train02.iloc[train_index], train02.iloc[test_index]
    
    parameters = {  'objective': 'regression',
                    'metric': 'rmse',
                    'boosting': 'gbdt',
                    'feature_fraction': 0.2,
                    'bagging_fraction': 0.9,
                    'bagging_freq' : 0,
                    'learning_rate': 0.005,
                    'min_data_in_leaf': 2,
                    'max_depth': 4,
                    'seed': 500,
                    'max_bin': 75,
                    'min_data_in_bin': 5,
                    'verbosity': -1,
                    'silent': -1  }
    
    X_TRAIN = pd.DataFrame(MOD_DATA_2_TRAIN[feature_names])
    Y_TRAIN = MOD_DATA_2_TRAIN["Price"]
                
    X_TEST = pd.DataFrame(MOD_DATA_2_TEST[feature_names])
    Y_TEST = MOD_DATA_2_TEST["Price"]
    
    train_data = lgb.Dataset(X_TRAIN,
                             label = Y_TRAIN)
    valid_data = lgb.Dataset(X_TEST,
                             label = Y_TEST)
    lgb_model = lgb.train(parameters,
                          train_data,
                          valid_sets = valid_data,
                          num_boost_round = 10000000,
                          early_stopping_rounds = 400,
                          verbose_eval = 200)
    MOD_DATA_2_TEST['Predicted_Model_Value'] = lgb_model.predict(pd.DataFrame(MOD_DATA_2_TEST[feature_names]))
    
    MOD_DATA_2_TEST["Price"] = np.log((MOD_DATA_2_TEST["Price"]**10)+1)
    MOD_DATA_2_TEST['Predicted_Model_Value'] = np.log((MOD_DATA_2_TEST['Predicted_Model_Value']**10)+1)
    
    if(IterationNum == 1):
        CV_SCORED_DATA = MOD_DATA_2_TEST.copy(deep=True)
        CV_SCORED_DATA.reset_index(drop = True, inplace = True)
        test_scored = lgb_model.predict(pd.DataFrame(test01[feature_names]))
    else:
        CV_SCORED_DATA = pd.concat([CV_SCORED_DATA,MOD_DATA_2_TEST])
        CV_SCORED_DATA.reset_index(drop = True, inplace = True)
        test_scored = test_scored + lgb_model.predict(pd.DataFrame(test01[feature_names]))
                    
    IterationNum = IterationNum + 1
    
    print("Test RMSE : ",sqrt(mean_squared_error(MOD_DATA_2_TEST["Price"], MOD_DATA_2_TEST['Predicted_Model_Value'])))
    
print("CV RMSE : ",sqrt(mean_squared_error(CV_SCORED_DATA["Price"], CV_SCORED_DATA['Predicted_Model_Value'])))
#CV RMSE :  0.14204806806938736 0.13979503719615177
#LB: 0.9463

Running CV Iteration Num : 1
Training until validation scores don't improve for 400 rounds.
[200]	valid_0's rmse: 0.0616187
[400]	valid_0's rmse: 0.0411833
[600]	valid_0's rmse: 0.0323872
[800]	valid_0's rmse: 0.0285635
[1000]	valid_0's rmse: 0.0264913
[1200]	valid_0's rmse: 0.0251824
[1400]	valid_0's rmse: 0.0242974
[1600]	valid_0's rmse: 0.023698
[1800]	valid_0's rmse: 0.0232132
[2000]	valid_0's rmse: 0.0228078
[2200]	valid_0's rmse: 0.0225097
[2400]	valid_0's rmse: 0.0222466
[2600]	valid_0's rmse: 0.0219975
[2800]	valid_0's rmse: 0.0217572
[3000]	valid_0's rmse: 0.0215397
[3200]	valid_0's rmse: 0.0213411
[3400]	valid_0's rmse: 0.0211763
[3600]	valid_0's rmse: 0.0210183
[3800]	valid_0's rmse: 0.0208797
[4000]	valid_0's rmse: 0.0207497
[4200]	valid_0's rmse: 0.0206242
[4400]	valid_0's rmse: 0.0205275
[4600]	valid_0's rmse: 0.0204206
[4800]	valid_0's rmse: 0.0203353
[5000]	valid_0's rmse: 0.0202459
[5200]	valid_0's rmse: 0.0201587
[5400]	valid_0's rmse: 0.0200846
[5600]	valid_0's rmse:

[22200]	valid_0's rmse: 0.0194894
[22400]	valid_0's rmse: 0.0194861
[22600]	valid_0's rmse: 0.0194839
[22800]	valid_0's rmse: 0.0194805
[23000]	valid_0's rmse: 0.0194768
[23200]	valid_0's rmse: 0.0194758
[23400]	valid_0's rmse: 0.019471
[23600]	valid_0's rmse: 0.0194693
[23800]	valid_0's rmse: 0.0194685
[24000]	valid_0's rmse: 0.0194715
Early stopping, best iteration is:
[23655]	valid_0's rmse: 0.019467
Test RMSE :  0.13257360774070212
Running CV Iteration Num : 3
Training until validation scores don't improve for 400 rounds.
[200]	valid_0's rmse: 0.060507
[400]	valid_0's rmse: 0.041557
[600]	valid_0's rmse: 0.0344496
[800]	valid_0's rmse: 0.0316482
[1000]	valid_0's rmse: 0.0302907
[1200]	valid_0's rmse: 0.0294192
[1400]	valid_0's rmse: 0.0288676
[1600]	valid_0's rmse: 0.028493
[1800]	valid_0's rmse: 0.0282093
[2000]	valid_0's rmse: 0.0279861
[2200]	valid_0's rmse: 0.0277915
[2400]	valid_0's rmse: 0.027626
[2600]	valid_0's rmse: 0.027472
[2800]	valid_0's rmse: 0.0273377
[3000]	valid_0'

[26800]	valid_0's rmse: 0.0199321
[27000]	valid_0's rmse: 0.0199293
[27200]	valid_0's rmse: 0.0199253
[27400]	valid_0's rmse: 0.0199229
[27600]	valid_0's rmse: 0.0199213
[27800]	valid_0's rmse: 0.019917
[28000]	valid_0's rmse: 0.0199132
[28200]	valid_0's rmse: 0.0199104
[28400]	valid_0's rmse: 0.0199055
[28600]	valid_0's rmse: 0.0198989
[28800]	valid_0's rmse: 0.0198953
[29000]	valid_0's rmse: 0.0198912
[29200]	valid_0's rmse: 0.0198902
[29400]	valid_0's rmse: 0.019888
[29600]	valid_0's rmse: 0.0198885
[29800]	valid_0's rmse: 0.019887
[30000]	valid_0's rmse: 0.0198868
[30200]	valid_0's rmse: 0.0198892
Early stopping, best iteration is:
[29902]	valid_0's rmse: 0.0198855
Test RMSE :  0.1373762801199711
Running CV Iteration Num : 5
Training until validation scores don't improve for 400 rounds.
[200]	valid_0's rmse: 0.0622608
[400]	valid_0's rmse: 0.0420566
[600]	valid_0's rmse: 0.0335168
[800]	valid_0's rmse: 0.029921
[1000]	valid_0's rmse: 0.0279789
[1200]	valid_0's rmse: 0.0267348
[1400

In [20]:
from scipy import stats
test_scored2 = test_scored / 5.0
stats.describe(test_scored2)

DescribeResult(nobs=1234, minmax=(0.9229671020236232, 1.5432109503595928), mean=1.2001989522212881, variance=0.0104433286774734, skewness=0.6665188295432515, kurtosis=0.18619406186459653)

In [21]:
test_scored3 = pd.DataFrame({'Price' : (test_scored2**10)})

In [22]:
test_scored3.head()

Unnamed: 0,Price
0,2.892957
1,3.032972
2,17.113367
3,4.261667
4,4.619814


In [23]:
CV_SCORED_DATA.to_csv("C:\\Kaggle\\Cars\\CV_Scored\\20190716_LGB01copy_CVTRAIN_DS.csv",
                      index = False)
test_scored3.to_csv("C:\\Kaggle\\Cars\\Submission\\20190716_LGB01copy_TEST_DS.csv",
                    index = False)