## Load the required libraries

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.externals import joblib
from sklearn import datasets 
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.offline as pyoff
import plotly.figure_factory as ff
from plotly.offline import init_notebook_mode, iplot, plot
import plotly.graph_objs as go
import squarify
%matplotlib inline

## Read the files in memory

In [4]:
vehicle_data = pd.read_csv('Train_Vehicletravellingdata.csv', encoding='utf-8')
weather_data = pd.read_csv('Train_WeatherData.csv', encoding='utf-8')
train_data = pd.read_csv('Train.csv', encoding='utf-8')


In [5]:
# On test data
vehicle_data_test = pd.read_csv('Test_Vehicletravellingdata.csv', encoding='utf-8')
weather_data_test = pd.read_csv('Test_WeatherData.csv', encoding='utf-8')
test_data = pd.read_csv('Test.csv', encoding='utf-8')


#### Redefining Column Names

In [6]:
vehicle_data.columns = ['ID','date_time','lane','vehicle_speed','pre_vehicle_ID','pre_vehicle_speed','pre_vehicle_wt','pre_vehicle_len','pre_vehicle_time_gap','road_condition']
weather_data.columns = ['ID','date_time','air_temp','precipitation_type','precipitation_intensity','relative_humidity','wind_direction','wind_speed','lighting_condition']
train_data.columns = ['ID','vehicle_len','vehicle_wt','num_axle','DrivingStyle']

In [7]:
vehicle_data_test.columns = ['ID','date_time','lane','vehicle_speed','pre_vehicle_ID','pre_vehicle_speed','pre_vehicle_wt','pre_vehicle_len','pre_vehicle_time_gap','road_condition']
weather_data_test.columns = ['ID','date_time','air_temp','precipitation_type','precipitation_intensity','relative_humidity','wind_direction','wind_speed','lighting_condition']
test_data.columns = ['ID','vehicle_len','vehicle_wt','num_axle']


## EDA

In [8]:
vehicle_data.head()

Unnamed: 0,ID,date_time,lane,vehicle_speed,pre_vehicle_ID,pre_vehicle_speed,pre_vehicle_wt,pre_vehicle_len,pre_vehicle_time_gap,road_condition
0,DR_24526,2012-03-21 09:14:55,1,81,57227,87,16986,941,94.0,Dry
1,DR_24526,2012-03-21 09:15:07,1,88,57229,81,1708,551,11.0,Dry
2,DR_24526,2012-03-21 09:15:12,1,88,57230,88,22892,1698,4.0,Dry
3,DR_24526,2012-03-21 09:15:48,2,84,57228,89,1945,544,127.0,Dry
4,DR_24526,2012-03-21 09:15:54,1,89,57231,88,13787,1893,42.0,Dry


In [9]:
weather_data.head()

Unnamed: 0,ID,date_time,air_temp,precipitation_type,precipitation_intensity,relative_humidity,wind_direction,wind_speed,lighting_condition
0,DR_24526,2012-03-21 09:14:55,7.0,clear,,95.0,146.0,1.0,daylight
1,DR_24526,2012-03-21 09:15:07,7.0,clear,,95.0,124.0,0.0,daylight
2,DR_24526,2012-03-21 09:15:12,7.0,clear,,95.0,124.0,0.0,daylight
3,DR_24526,2012-03-21 09:15:48,7.0,clear,,95.0,124.0,0.0,daylight
4,DR_24526,2012-03-21 09:15:54,7.0,clear,,95.0,124.0,0.0,daylight


In [10]:
train_data.head()

Unnamed: 0,ID,vehicle_len,vehicle_wt,num_axle,DrivingStyle
0,DR_24526,1265,10243,2,2
1,DR_30052,1680,30871,6,2
2,DR_40928,936,3224,4,2
3,DR_66033,1503,3315,4,2
4,DR_45266,929,3022,4,2


## Data Preprocessing

#### Observation
1. ID and Date Time appear to be the common collumns between vehicle_data and weather_data and ID to be between train_data and the other 2 files
2. Expanding the data as aggregation might cause loss of information since same ID have different values multiple columns

#### Combining the files into a single dataframe

In [11]:
vehicle_and_weather_data = pd.merge(vehicle_data, weather_data, how='inner', on=['ID','date_time'])
combined_data = pd.merge(vehicle_and_weather_data,train_data, how='outer',on=['ID'])

In [12]:
# on test data
vehicle_and_weather_data_test = pd.merge(vehicle_data_test, weather_data_test, how='inner', on=['ID','date_time'])
combined_data_test = pd.merge(vehicle_and_weather_data_test,test_data, how='outer',on=['ID'])

In [13]:
combined_data.dtypes

ID                          object
date_time                   object
lane                         int64
vehicle_speed                int64
pre_vehicle_ID               int64
pre_vehicle_speed            int64
pre_vehicle_wt               int64
pre_vehicle_len              int64
pre_vehicle_time_gap       float64
road_condition              object
air_temp                   float64
precipitation_type          object
precipitation_intensity     object
relative_humidity          float64
wind_direction             float64
wind_speed                 float64
lighting_condition          object
vehicle_len                  int64
vehicle_wt                   int64
num_axle                     int64
DrivingStyle                 int64
dtype: object

In [14]:
combined_data_test.dtypes

ID                          object
date_time                   object
lane                         int64
vehicle_speed                int64
pre_vehicle_ID               int64
pre_vehicle_speed            int64
pre_vehicle_wt               int64
pre_vehicle_len              int64
pre_vehicle_time_gap       float64
road_condition              object
air_temp                   float64
precipitation_type          object
precipitation_intensity     object
relative_humidity          float64
wind_direction             float64
wind_speed                 float64
lighting_condition          object
vehicle_len                  int64
vehicle_wt                   int64
num_axle                     int64
dtype: object

In [15]:
combined_data.shape

(162566, 21)

In [16]:
combined_data_test.shape

(61671, 20)

In [17]:
combined_data.isnull().sum()

ID                            0
date_time                     0
lane                          0
vehicle_speed                 0
pre_vehicle_ID                0
pre_vehicle_speed             0
pre_vehicle_wt                0
pre_vehicle_len               0
pre_vehicle_time_gap       2455
road_condition                0
air_temp                   2057
precipitation_type            0
precipitation_intensity       0
relative_humidity          2105
wind_direction             2114
wind_speed                 2464
lighting_condition            0
vehicle_len                   0
vehicle_wt                    0
num_axle                      0
DrivingStyle                  0
dtype: int64

In [18]:
combined_data_test.isnull().sum()

ID                           0
date_time                    0
lane                         0
vehicle_speed                0
pre_vehicle_ID               0
pre_vehicle_speed            0
pre_vehicle_wt               0
pre_vehicle_len              0
pre_vehicle_time_gap       956
road_condition               0
air_temp                   934
precipitation_type           0
precipitation_intensity      0
relative_humidity          943
wind_direction             964
wind_speed                 983
lighting_condition           0
vehicle_len                  0
vehicle_wt                   0
num_axle                     0
dtype: int64

#### Categorical and Numerical Data Split

In [19]:
def split_cat_num_cols(data):
    cat_cols =[]
    num_cols =[]
    for i in data.columns:
        # assuming categorical data has less than 10 unique values
        # for this dataset the assumption holds good
        if len(np.unique(data[i])) < 10:
            cat_cols.append(i)
            data[i] = data[i].astype('category')
        else:           
            num_cols.append(i)
            if i != 'ID' and i != 'date_time' and i != 'pre_vehicle_ID':
                data[i] = data[i].astype('float')
    return cat_cols, num_cols

In [20]:
cat_cols,num_cols = split_cat_num_cols(combined_data)

In [21]:
cat_cols

['lane',
 'road_condition',
 'precipitation_type',
 'precipitation_intensity',
 'lighting_condition',
 'num_axle',
 'DrivingStyle']

In [22]:
num_cols

['ID',
 'date_time',
 'vehicle_speed',
 'pre_vehicle_ID',
 'pre_vehicle_speed',
 'pre_vehicle_wt',
 'pre_vehicle_len',
 'pre_vehicle_time_gap',
 'air_temp',
 'relative_humidity',
 'wind_direction',
 'wind_speed',
 'vehicle_len',
 'vehicle_wt']

In [23]:
combined_data.dtypes

ID                           object
date_time                    object
lane                       category
vehicle_speed               float64
pre_vehicle_ID                int64
pre_vehicle_speed           float64
pre_vehicle_wt              float64
pre_vehicle_len             float64
pre_vehicle_time_gap        float64
road_condition             category
air_temp                    float64
precipitation_type         category
precipitation_intensity    category
relative_humidity           float64
wind_direction              float64
wind_speed                  float64
lighting_condition         category
vehicle_len                 float64
vehicle_wt                  float64
num_axle                   category
DrivingStyle               category
dtype: object

#### Drop columns which have no significance

In [24]:
combined_data = combined_data.drop(['date_time','pre_vehicle_ID'],axis=1)

In [25]:
combined_data.shape

(162566, 19)

In [26]:
combined_data_test = combined_data_test.drop(['date_time','pre_vehicle_ID'],axis=1)

In [27]:
combined_data_test.shape

(61671, 18)

#### Impute NA with 0 for Numerical

In [28]:
num_cols.remove('date_time')

In [29]:
num_cols.remove('pre_vehicle_ID')

In [30]:
num_cols

['ID',
 'vehicle_speed',
 'pre_vehicle_speed',
 'pre_vehicle_wt',
 'pre_vehicle_len',
 'pre_vehicle_time_gap',
 'air_temp',
 'relative_humidity',
 'wind_direction',
 'wind_speed',
 'vehicle_len',
 'vehicle_wt']

In [31]:
combined_data_num = combined_data[num_cols].replace(np.NaN,0)

#### Impute NA with Unknow for Categorcal 

In [32]:
combined_data_cat = combined_data[cat_cols].replace(np.NaN,'Unknown')

In [33]:
combined_data[num_cols] = combined_data_num[num_cols]
combined_data[cat_cols] = combined_data_cat[cat_cols]

#### Imputing for Test

In [34]:
combined_data_test_num = combined_data_test[num_cols].replace(np.NaN,0)

In [35]:
cat_cols_test = cat_cols
cat_cols_test.remove('DrivingStyle')
combined_data_test_cat = combined_data_test[cat_cols_test].replace(np.NaN,'Unknown')

In [36]:
combined_data_test[num_cols] = combined_data_test_num[num_cols]
combined_data_test[cat_cols] = combined_data_test_cat[cat_cols]

In [37]:
combined_data.head()

Unnamed: 0,ID,lane,vehicle_speed,pre_vehicle_speed,pre_vehicle_wt,pre_vehicle_len,pre_vehicle_time_gap,road_condition,air_temp,precipitation_type,precipitation_intensity,relative_humidity,wind_direction,wind_speed,lighting_condition,vehicle_len,vehicle_wt,num_axle,DrivingStyle
0,DR_24526,1,81.0,87.0,16986.0,941.0,94.0,Dry,7.0,clear,,95.0,146.0,1.0,daylight,1265.0,10243.0,2,2
1,DR_24526,1,88.0,81.0,1708.0,551.0,11.0,Dry,7.0,clear,,95.0,124.0,0.0,daylight,1265.0,10243.0,2,2
2,DR_24526,1,88.0,88.0,22892.0,1698.0,4.0,Dry,7.0,clear,,95.0,124.0,0.0,daylight,1265.0,10243.0,2,2
3,DR_24526,2,84.0,89.0,1945.0,544.0,127.0,Dry,7.0,clear,,95.0,124.0,0.0,daylight,1265.0,10243.0,2,2
4,DR_24526,1,89.0,88.0,13787.0,1893.0,42.0,Dry,7.0,clear,,95.0,124.0,0.0,daylight,1265.0,10243.0,2,2


In [38]:
combined_data_test.head()

Unnamed: 0,ID,lane,vehicle_speed,pre_vehicle_speed,pre_vehicle_wt,pre_vehicle_len,pre_vehicle_time_gap,road_condition,air_temp,precipitation_type,precipitation_intensity,relative_humidity,wind_direction,wind_speed,lighting_condition,vehicle_len,vehicle_wt,num_axle
0,DR_79986,1,87,94,2511,547,83.0,Dry,8.0,clear,,95.0,174.0,4.0,daylight,574,1268,2
1,DR_79986,1,85,87,15630,1821,2.0,Dry,8.0,clear,,95.0,174.0,4.0,daylight,574,1268,2
2,DR_79986,2,86,78,14443,2048,175.0,Dry,8.0,clear,,95.0,174.0,4.0,daylight,574,1268,2
3,DR_79986,2,113,86,1615,557,8.0,Dry,8.0,clear,,95.0,174.0,4.0,daylight,574,1268,2
4,DR_79986,1,92,85,8800,1233,88.0,Dry,8.0,clear,,95.0,174.0,4.0,daylight,574,1268,2


In [39]:
combined_data.shape

(162566, 19)

In [40]:
combined_data_test.shape

(61671, 18)

#### Aggregate Numerical and Categorical 
Categorical - Take Mode

In [41]:
combined_data_cat_aggregated = combined_data.groupby(['ID'], sort=False)['ID','lane',
 'road_condition',
 'precipitation_type',
 'precipitation_intensity',
 'lighting_condition',
 'num_axle',
 'DrivingStyle'].max()

In [42]:
combined_data_cat_aggregated.head()

Unnamed: 0_level_0,ID,lane,road_condition,precipitation_type,precipitation_intensity,lighting_condition,num_axle,DrivingStyle
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
DR_24526,DR_24526,2,Dry,clear,,daylight,2,2
DR_30052,DR_30052,2,Dry,clear,,daylight,6,2
DR_40928,DR_40928,2,Dry,rain,Low,daylight,4,2
DR_66033,DR_66033,2,Dry,clear,,daylight,4,2
DR_45266,DR_45266,2,Dry,clear,,daylight,4,2


In [43]:
combined_data_cat_aggregated.columns

Index(['ID', 'lane', 'road_condition', 'precipitation_type',
       'precipitation_intensity', 'lighting_condition', 'num_axle',
       'DrivingStyle'],
      dtype='object')

In [44]:
combined_data_cat_aggregated.count()

ID                         12994
lane                       12994
road_condition             12994
precipitation_type         12994
precipitation_intensity    12994
lighting_condition         12994
num_axle                   12994
DrivingStyle               12994
dtype: int64

#### Numerical - Take Mean

In [45]:
combined_data_num_aggregated = combined_data.groupby(['ID'])[ 
 'vehicle_speed',
 'pre_vehicle_speed',
 'pre_vehicle_wt',
 'pre_vehicle_len',
 'pre_vehicle_time_gap',
 'air_temp',
 'relative_humidity',
 'wind_direction',
 'wind_speed',
 'vehicle_len',
 'vehicle_wt'].mean()

In [46]:
combined_data_num_aggregated.count()

vehicle_speed           12994
pre_vehicle_speed       12994
pre_vehicle_wt          12994
pre_vehicle_len         12994
pre_vehicle_time_gap    12994
air_temp                12994
relative_humidity       12994
wind_direction          12994
wind_speed              12994
vehicle_len             12994
vehicle_wt              12994
dtype: int64

In [47]:
combined_data_num_aggregated.head()

Unnamed: 0_level_0,vehicle_speed,pre_vehicle_speed,pre_vehicle_wt,pre_vehicle_len,pre_vehicle_time_gap,air_temp,relative_humidity,wind_direction,wind_speed,vehicle_len,vehicle_wt
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
DR_10002,84.4,77.0,4200.4,762.0,234.8,2.2,66.6,173.2,5.8,552.0,2027.0
DR_10013,86.555556,85.777778,9517.888889,1184.888889,173.111111,4.0,58.0,163.0,8.0,1709.0,18599.0
DR_10023,83.125,81.625,10180.1875,1022.3125,61.4375,6.0625,34.0625,205.875,3.0625,552.0,1289.0
DR_10024,86.666667,81.5,6366.333333,1069.666667,194.166667,0.0,0.0,0.0,0.0,1768.0,14588.0
DR_10025,81.5,82.5,10157.875,1113.625,156.125,1.0,87.0,11.0,3.0,1734.0,16708.0


#### Aggregate on Test Data

In [48]:
combined_data_cat_test_aggregated = combined_data_test.groupby(['ID'], sort=False)['ID','lane',
 'road_condition',
 'precipitation_type',
 'precipitation_intensity',
 'lighting_condition',
 'num_axle'].max()

In [49]:
combined_data_num_test_aggregated = combined_data_test.groupby(['ID'])[ 
 'vehicle_speed',
 'pre_vehicle_speed',
 'pre_vehicle_wt',
 'pre_vehicle_len',
 'pre_vehicle_time_gap',
 'air_temp',
 'relative_humidity',
 'wind_direction',
 'wind_speed',
 'vehicle_len',
 'vehicle_wt'].mean()

#### Merging Aggregated Data

In [50]:
combined_data_aggregated = combined_data_num_aggregated.copy()
combined_data_aggregated[combined_data_cat_aggregated.columns] = combined_data_cat_aggregated[combined_data_cat_aggregated.columns]


In [51]:
combined_data_aggregated.shape

(12994, 19)

In [52]:
combined_data_aggregated.reset_index(drop=True, inplace=True)

In [53]:
combined_data_aggregated.head()

Unnamed: 0,vehicle_speed,pre_vehicle_speed,pre_vehicle_wt,pre_vehicle_len,pre_vehicle_time_gap,air_temp,relative_humidity,wind_direction,wind_speed,vehicle_len,vehicle_wt,ID,lane,road_condition,precipitation_type,precipitation_intensity,lighting_condition,num_axle,DrivingStyle
0,84.4,77.0,4200.4,762.0,234.8,2.2,66.6,173.2,5.8,552.0,2027.0,DR_10002,2,Dry,clear,,daylight,2,2
1,86.555556,85.777778,9517.888889,1184.888889,173.111111,4.0,58.0,163.0,8.0,1709.0,18599.0,DR_10013,2,Dry,clear,,night,6,3
2,83.125,81.625,10180.1875,1022.3125,61.4375,6.0625,34.0625,205.875,3.0625,552.0,1289.0,DR_10023,2,Dry,clear,,night,2,2
3,86.666667,81.5,6366.333333,1069.666667,194.166667,0.0,0.0,0.0,0.0,1768.0,14588.0,DR_10024,2,Dry,snow,,twilight,5,1
4,81.5,82.5,10157.875,1113.625,156.125,1.0,87.0,11.0,3.0,1734.0,16708.0,DR_10025,2,Wet,clear,,night,5,3


#### Merge aggregated data for test

In [54]:
combined_data_test_aggregated = combined_data_num_test_aggregated.copy()
combined_data_test_aggregated[combined_data_cat_test_aggregated.columns] = combined_data_cat_test_aggregated[combined_data_cat_test_aggregated.columns]


In [55]:
combined_data_test_aggregated.shape

(4880, 18)

In [56]:
combined_data_test_aggregated.reset_index(drop=True, inplace=True)

In [57]:
combined_data_test_aggregated.head()

Unnamed: 0,vehicle_speed,pre_vehicle_speed,pre_vehicle_wt,pre_vehicle_len,pre_vehicle_time_gap,air_temp,relative_humidity,wind_direction,wind_speed,vehicle_len,vehicle_wt,ID,lane,road_condition,precipitation_type,precipitation_intensity,lighting_condition,num_axle
0,81.928571,82.5,5102.714286,725.0,93.642857,1.0,57.5,157.5,5.5,548.0,2467.0,DR_75256,2,Dry,clear,,daylight,2
1,84.76,85.6,4215.36,760.16,50.52,1.88,67.0,17.84,3.24,594.0,1794.0,DR_75257,2,Dry,clear,,night,2
2,90.5,93.5,1702.5,641.75,302.25,2.5,90.5,194.5,5.0,825.0,5885.0,DR_75260,2,Wet,rain,Low,daylight,2
3,91.0,92.272727,3861.545455,792.818182,60.681818,5.5,50.0,146.5,5.5,567.0,1655.0,DR_75272,2,Dry,clear,,daylight,2
4,85.0,86.0,13110.333333,1437.666667,202.333333,5.0,95.0,191.0,5.0,1993.0,19417.0,DR_75274,1,Wet,clear,,night,6


#### check the No of Values in each Level of Target Variable

In [407]:
combined_data_aggregated.DrivingStyle.value_counts()

2    6422
3    3798
1    2774
Name: DrivingStyle, dtype: int64

#### Dummify the Categorical Variables

In [409]:
target = combined_data_aggregated["DrivingStyle"]
combined_data_independant = combined_data_aggregated.drop(["DrivingStyle"],axis=1)

In [410]:
combined_data_dummified = pd.get_dummies(combined_data_independant)

In [411]:
combined_data_test_dummified = pd.get_dummies(combined_data_test_aggregated)

In [412]:
combined_data_dummified.head()

Unnamed: 0,vehicle_speed,pre_vehicle_speed,pre_vehicle_wt,pre_vehicle_len,pre_vehicle_time_gap,air_temp,relative_humidity,wind_direction,wind_speed,vehicle_len,...,precipitation_type_rain,precipitation_type_snow,precipitation_intensity_,precipitation_intensity_High,precipitation_intensity_Low,precipitation_intensity_Moderate,precipitation_intensity_None,lighting_condition_daylight,lighting_condition_night,lighting_condition_twilight
0,84.4,77.0,4200.4,762.0,234.8,2.2,66.6,173.2,5.8,552.0,...,0,0,0,0,0,0,1,1,0,0
1,86.555556,85.777778,9517.888889,1184.888889,173.111111,4.0,58.0,163.0,8.0,1709.0,...,0,0,0,0,0,0,1,0,1,0
2,83.125,81.625,10180.1875,1022.3125,61.4375,6.0625,34.0625,205.875,3.0625,552.0,...,0,0,0,0,0,0,1,0,1,0
3,86.666667,81.5,6366.333333,1069.666667,194.166667,0.0,0.0,0.0,0.0,1768.0,...,0,1,1,0,0,0,0,0,0,1
4,81.5,82.5,10157.875,1113.625,156.125,1.0,87.0,11.0,3.0,1734.0,...,0,0,0,0,0,0,1,0,1,0


In [413]:
combined_data_test_dummified.head()

Unnamed: 0,vehicle_speed,pre_vehicle_speed,pre_vehicle_wt,pre_vehicle_len,pre_vehicle_time_gap,air_temp,relative_humidity,wind_direction,wind_speed,vehicle_len,...,precipitation_type_rain,precipitation_type_snow,precipitation_intensity_,precipitation_intensity_High,precipitation_intensity_Low,precipitation_intensity_Moderate,precipitation_intensity_None,lighting_condition_daylight,lighting_condition_night,lighting_condition_twilight
0,81.928571,82.5,5102.714286,725.0,93.642857,1.0,57.5,157.5,5.5,548.0,...,0,0,0,0,0,0,1,1,0,0
1,84.76,85.6,4215.36,760.16,50.52,1.88,67.0,17.84,3.24,594.0,...,0,0,0,0,0,0,1,0,1,0
2,90.5,93.5,1702.5,641.75,302.25,2.5,90.5,194.5,5.0,825.0,...,1,0,0,0,1,0,0,1,0,0
3,91.0,92.272727,3861.545455,792.818182,60.681818,5.5,50.0,146.5,5.5,567.0,...,0,0,0,0,0,0,1,1,0,0
4,85.0,86.0,13110.333333,1437.666667,202.333333,5.0,95.0,191.0,5.0,1993.0,...,0,0,0,0,0,0,1,0,1,0


## Model Building

#### Train - Test Split
Stratified Split as there is imbalance in Target Variable

In [414]:
x_train, x_validation, y_train, y_validation = train_test_split(combined_data_dummified, target, stratify=target, test_size=0.3)


### Logistic Regression

In [41]:
from sklearn.linear_model import LogisticRegression

In [43]:
lr = LogisticRegression(solver='newton-cg', multi_class='multinomial')
lr.fit(x_train, y_train)


newton-cg failed to converge. Increase the number of iterations.



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False)

### Decision Tree

In [51]:
from sklearn.model_selection import GridSearchCV
from sklearn import tree

In [53]:
parameters = {'max_depth':range(10,50)}

clf = GridSearchCV(tree.DecisionTreeClassifier(), parameters, n_jobs=3,cv=5)
clf.fit(X=combined_data_dummified, y=target)
tree_model = clf.best_estimator_

print (clf.best_score_, clf.best_params_) 

0.655937846836848 {'max_depth': 10}


### Random forest

In [415]:
from sklearn.ensemble import RandomForestClassifier

In [416]:
# rf = RandomForestClassifier(n_estimators = 2000, n_jobs=-1,random_state = 69, max_features='sqrt',max_depth=100,bootstrap=True,min_samples_leaf=2)
rf = RandomForestClassifier(n_estimators = 2000, n_jobs=-1,random_state = 69, max_features='sqrt',max_depth=100)

In [417]:
rf.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=100, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=2000, n_jobs=-1,
            oob_score=False, random_state=69, verbose=0, warm_start=False)

In [None]:
# # Grid Serach RandomForest
# param_grid = { 
#     'n_estimators': [1500,2000],
#     'max_features': ['auto', 'sqrt', 'log2'],
#     'max_depth' : [30,50,100],
#     'criterion' :['gini', 'entropy']
# }

# clf_rf = GridSearchCV(rf, param_grid, n_jobs=4,cv=5)
# clf_rf.fit(X=combined_data_dummified, y=target)
# print (clf_rf.best_score_, clf_rf.best_params_) 

#### Random Search

In [458]:
# from sklearn.model_selection import RandomizedSearchCV
# # Number of trees in random forest
# n_estimators = [int(x) for x in np.linspace(start = 1000, stop = 2000, num = 5)]
# # Number of features to consider at every split
# max_features = ['auto', 'sqrt']
# # Maximum number of levels in tree
# max_depth = [int(x) for x in np.linspace(50, 110, num = 5)]
# max_depth.append(None)
# # Minimum number of samples required to split a node
# min_samples_split = [2, 5, 10]
# # Minimum number of samples required at each leaf node
# min_samples_leaf = [1, 2, 4]
# # Method of selecting samples for training each tree
# bootstrap = [True, False]
# # Create the random grid
# random_grid = {'n_estimators': n_estimators,
#                'max_features': max_features,
#                'max_depth': max_depth,
#                'min_samples_split': min_samples_split,
#                'min_samples_leaf': min_samples_leaf,
#                'bootstrap': bootstrap}
# print(random_grid)

{'n_estimators': [1000, 1250, 1500, 1750, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [50, 65, 80, 95, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [459]:
# # Use the random grid to search for best hyperparameters
# # First create the base model to tune
# rf = RandomForestClassifier()
# # Random search of parameters, using 3 fold cross validation, 
# # search across 100 different combinations, and use all available cores
# rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 5, cv = 3, verbose=2, random_state=69, n_jobs = -1)
# # Fit the random search model
# rf_random.fit(combined_data_dummified, target)


Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] n_estimators=1750, min_samples_split=10, min_samples_leaf=4, max_features=sqrt, max_depth=None, bootstrap=False 
[CV] n_estimators=1750, min_samples_split=10, min_samples_leaf=4, max_features=sqrt, max_depth=None, bootstrap=False 
[CV] n_estimators=1750, min_samples_split=10, min_samples_leaf=4, max_features=sqrt, max_depth=None, bootstrap=False 
[CV] n_estimators=2000, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=50, bootstrap=False 
[CV]  n_estimators=1750, min_samples_split=10, min_samples_leaf=4, max_features=sqrt, max_depth=None, bootstrap=False, total=11.5min
[CV] n_estimators=2000, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=50, bootstrap=False 
[CV]  n_estimators=1750, min_samples_split=10, min_samples_leaf=4, max_features=sqrt, max_depth=None, bootstrap=False, total=11.5min
[CV] n_estimators=2000, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_de

[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed: 45.0min finished


RandomizedSearchCV(cv=3, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid=True, n_iter=5, n_jobs=-1,
          param_distributions={'n_estimators': [1000, 1250, 1500, 1750, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [50, 65, 80, 95, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]},
          pre_dispatch='2*n_jobs', random_state=69, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

## Predicting

#### Predict on Train

In [44]:
lr_pred_train = lr.predict(x_train)

In [45]:
print(accuracy_score(y_train,lr_pred_train))

0.6479813720871236


In [418]:
rf_pred_train = rf.predict(x_train)

In [419]:
print(accuracy_score(y_train,rf_pred_train)) 

0.9894447498625618


#### Predict on Validation

In [47]:
lr_pred_validation = lr.predict(x_validation)

In [48]:
print(accuracy_score(y_validation,lr_pred_validation)) 

0.6491131640699321


In [420]:
rf_pred_validation = rf.predict(x_validation)

In [421]:
print(accuracy_score(y_validation,rf_pred_validation)) 

0.7183893305975891


In [None]:
depth = []
for i in range(3,30):
    # Perform 7-fold cross validation 
    scores = cross_val_score(estimator=rf, X=x_train, y=y_train, cv=5, n_jobs=1)
    depth.append((i,scores.mean()))
print(depth)

#### Predict on Test

In [424]:
# Get missing columns in the training test
missing_cols = set( combined_data_dummified.columns ) - set( combined_data_test_dummified.columns )
# Add a missing column in test set with default value equal to 0
for c in missing_cols:
    combined_data_test_dummified[c] = 0
# Ensure the order of column in the test set is in the same order than in train set
combined_data_test_dummified = combined_data_test_dummified[combined_data_dummified.columns]

In [425]:
rf_pred_test = rf.predict(combined_data_test_dummified)

In [426]:
test_sub_data = pd.DataFrame({'DrivingStyle': rf_pred_test})

In [427]:
test_sub_data.shape

(59744, 1)

In [428]:
test_sub_data.isna().sum()

DrivingStyle    0
dtype: int64

In [429]:
test_sub_data.head()

Unnamed: 0,DrivingStyle
0,2
1,2
2,2
3,2
4,2


In [430]:
ids = list(combined_data_test_no_na_with_ID.ID)

In [431]:
len(ids)

59744

In [432]:
test_sub_data['ID'] = ids

In [433]:
test_sub_data.shape

(59744, 2)

In [434]:
test_sub_data.isna().sum()

DrivingStyle    0
ID              0
dtype: int64

In [435]:
test_sub_data.head()

Unnamed: 0,DrivingStyle,ID
0,2,DR_79986
1,2,DR_79986
2,2,DR_79986
3,2,DR_79986
4,2,DR_79986


In [436]:
test_sub_data.columns = ['DrivingStyle','ID']

In [437]:
test_sub_data = test_sub_data[['ID','DrivingStyle']]

In [438]:
test_sub_data.head()

Unnamed: 0,ID,DrivingStyle
0,DR_79986,2
1,DR_79986,2
2,DR_79986,2
3,DR_79986,2
4,DR_79986,2


In [439]:
test_sub_data.isna().sum()

ID              0
DrivingStyle    0
dtype: int64

In [440]:
test_sub_data_grouped = test_sub_data.groupby(['ID','DrivingStyle']).size().to_frame()
test_sub_data_grouped = test_sub_data_grouped.reset_index()
test_sub_data_grouped.columns = ['ID','DrivingStyle','Count']

In [441]:
test_sub_data_grouped.head()

Unnamed: 0,ID,DrivingStyle,Count
0,DR_75256,2,14
1,DR_75257,3,24
2,DR_75260,2,4
3,DR_75272,2,21
4,DR_75274,1,3


In [442]:
test_sub_data_grouped = test_sub_data_grouped.groupby(['ID'], sort=False)['ID','Count','DrivingStyle'].max()

In [443]:
test_sub_data_grouped = test_sub_data_grouped.drop('Count',axis=1)

In [444]:
test_sub_data_grouped.head()

Unnamed: 0_level_0,ID,DrivingStyle
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
DR_75256,DR_75256,2
DR_75257,DR_75257,3
DR_75260,DR_75260,2
DR_75272,DR_75272,2
DR_75274,DR_75274,1


In [445]:
test_sub_data_grouped.reset_index(drop=True, inplace=True)

In [446]:
test_sub_data_grouped.head()

Unnamed: 0,ID,DrivingStyle
0,DR_75256,2
1,DR_75257,3
2,DR_75260,2
3,DR_75272,2
4,DR_75274,1
