## table 6 - volume training consists of many nan values under vehicle type label. We implement random forest classifier to treat the nan values

## Import section

In [2]:
import pandas as pd
import numpy as np

## load table 6 (volume training table)

In [3]:
df = pd.read_csv("data/phase1_training/volume_training_phase1_table6.csv")

In [4]:
df.head()

Unnamed: 0,time,tollgate_id,direction,vehicle_model,has_etc,vehicle_type
0,2016-09-19 23:09:25,2,0,1,0,
1,2016-09-19 23:11:53,2,0,1,0,
2,2016-09-19 23:13:54,2,0,1,0,
3,2016-09-19 23:17:48,1,0,1,1,
4,2016-09-19 23:16:07,2,0,1,0,


## split nan and non nan values vehicle_type column into two separate dataframes

In [38]:
df_train = df[df.vehicle_type.notnull()]
df_test = df[df.vehicle_type.isnull()]

In [39]:
df_train.count()

time             212710
tollgate_id      212710
direction        212710
vehicle_model    212710
has_etc          212710
vehicle_type     212710
dtype: int64

In [40]:
df_test.count()

time             330989
tollgate_id      330989
direction        330989
vehicle_model    330989
has_etc          330989
vehicle_type          0
dtype: int64

## store time in separate data frame

In [41]:
store_train_time = pd.DataFrame(columns=["time"])
store_train_time = df_train.time
store_train_time.head()

330989    2016-09-19 23:18:06
330990    2016-09-19 22:31:52
330991    2016-09-19 22:21:18
330992    2016-09-19 21:21:37
330993    2016-09-19 23:22:09
Name: time, dtype: object

In [42]:
type(store_train_time)

pandas.core.series.Series

In [43]:
store_test_time = pd.DataFrame(columns=["time"])
store_test_time = df_test.time
store_test_time.head()

0    2016-09-19 23:09:25
1    2016-09-19 23:11:53
2    2016-09-19 23:13:54
3    2016-09-19 23:17:48
4    2016-09-19 23:16:07
Name: time, dtype: object

## split the non nan dataframe into train and test 

In [44]:
from sklearn.model_selection import train_test_split

In [53]:
X = df_train.drop(['time','vehicle_type'], axis=1)
y = df_train.vehicle_type

In [54]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

## implementing random forest classifier to handle missing (nan) values

In [55]:
from sklearn.ensemble import RandomForestClassifier

In [56]:
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [57]:
rfc_pred = rfc.predict(X_test)

## evaluating performance of the model

In [5]:
from sklearn.metrics import classification_report,confusion_matrix

In [59]:
print(classification_report(y_test,rfc_pred))

              precision    recall  f1-score   support

         0.0       0.87      0.99      0.92     49479
         1.0       0.92      0.47      0.62     14334

    accuracy                           0.87     63813
   macro avg       0.89      0.73      0.77     63813
weighted avg       0.88      0.87      0.86     63813



In [60]:
print(confusion_matrix(y_test,rfc_pred))

[[48860   619]
 [ 7582  6752]]


## using the model to predict the nan values 

In [130]:
df_time = df_test.time

In [62]:
test_x = df_test.drop(["time", "vehicle_type"], axis=1)

In [64]:
test_pred = rfc.predict(test_x)

In [65]:
df_test = df_test.drop("vehicle_type", axis=1)

In [66]:
df_test.insert(5, "vehicle_type", test_pred)

In [67]:
df_test.count()

time             330989
tollgate_id      330989
direction        330989
vehicle_model    330989
has_etc          330989
vehicle_type     330989
dtype: int64

In [68]:
df_test.head()

Unnamed: 0,time,tollgate_id,direction,vehicle_model,has_etc,vehicle_type
0,2016-09-19 23:09:25,2,0,1,0,0.0
1,2016-09-19 23:11:53,2,0,1,0,0.0
2,2016-09-19 23:13:54,2,0,1,0,0.0
3,2016-09-19 23:17:48,1,0,1,1,0.0
4,2016-09-19 23:16:07,2,0,1,0,0.0


In [69]:
df_train.head()

Unnamed: 0,time,tollgate_id,direction,vehicle_model,has_etc,vehicle_type
330989,2016-09-19 23:18:06,1,1,1,1,0.0
330990,2016-09-19 22:31:52,3,1,1,1,0.0
330991,2016-09-19 22:21:18,1,1,1,1,0.0
330992,2016-09-19 21:21:37,1,1,2,1,0.0
330993,2016-09-19 23:22:09,1,1,1,1,0.0


## combining the predicted values with prior complete data into a dataframe and writing it onto a csv

In [70]:
result = pd.concat([df_train, df_test])

In [71]:
result.to_csv(path_or_buf="data/phase1_training/volume_training_phase1_table6_nan_fixed.csv")

In [72]:
check = pd.read_csv("data/phase1_training/volume_training_phase1_table6_nan_fixed.csv")

In [73]:
check.count()

Unnamed: 0       543699
time             543699
tollgate_id      543699
direction        543699
vehicle_model    543699
has_etc          543699
vehicle_type     543699
dtype: int64

In [74]:
check.head()

Unnamed: 0.1,Unnamed: 0,time,tollgate_id,direction,vehicle_model,has_etc,vehicle_type
0,330989,2016-09-19 23:18:06,1,1,1,1,0.0
1,330990,2016-09-19 22:31:52,3,1,1,1,0.0
2,330991,2016-09-19 22:21:18,1,1,1,1,0.0
3,330992,2016-09-19 21:21:37,1,1,2,1,0.0
4,330993,2016-09-19 23:22:09,1,1,1,1,0.0
