In [1]:
import numpy as np
import pandas as pd

In [None]:
# Load the raw driver-events Excel report into a DataFrame.
ahmd_traffic = pd.read_excel(
    io="data/Events Report of Drivers in Ahmedabad_16-07 to 31-07.xlsx"
)

In [None]:
# Preview the first five rows of the loaded report.
ahmd_traffic.head(n=5)

#### Dropping "Event Value" and "Event Unit" because they would leak information when predicting "Event Name" (e.g., if the event unit is seconds, the event can only be idling). Also dropping other, less useful columns.

In [None]:
# Remove the columns found at positions 2, 3, 6, 8 and 9 of the current frame.
# The selection is positional, so it depends on the column order of the loaded
# report, and re-running this cell drops a further set of columns.
# NOTE(review): confirm these positions really are "Event Value", "Event Unit"
# and the other columns meant to be discarded.
ahmd_traffic = ahmd_traffic.drop(
    labels=ahmd_traffic.columns[[2, 3, 6, 8, 9]],
    axis='columns',
)
ahmd_traffic.head(n=5)

#### Creating new variables, weekday and hour of the day, based on the timestamp column "Event Time"

In [None]:
# Parse the "Event Time" strings (day.month.year hour:minute:second) into
# datetimes so the .dt accessor can be used below.
ahmd_traffic['Event Time'] = pd.to_datetime(
    ahmd_traffic['Event Time'],
    format='%d.%m.%Y %H:%M:%S',
)

# Derive the two time features: numeric day of week and hour of day.
ahmd_traffic['weekday'] = ahmd_traffic['Event Time'].dt.dayofweek
ahmd_traffic['hour'] = ahmd_traffic['Event Time'].dt.hour

# Drop the column at position 1.
# NOTE(review): presumably this is "Event Time", which is no longer needed once
# the features are extracted — confirm against the actual column order.
ahmd_traffic = ahmd_traffic.drop(
    labels=ahmd_traffic.columns[[1]],
    axis='columns',
)

# Lookup table from day-of-week number (0 -> 'Mon' ... 6 -> 'Sun') to a label.
days = dict(enumerate(['Mon', 'Tues', 'Weds', 'Thurs', 'Fri', 'Sat', 'Sun']))

# Replace the numeric weekday with its label.
ahmd_traffic['weekday'] = ahmd_traffic['weekday'].apply(
    lambda day_index: days[day_index]
)
ahmd_traffic.head(n=5)

#### Converting categorical variables into integers (label encoding before the embedding operation)

In [None]:
import os

from sklearn import preprocessing # Label Encoding: converting text labels into the integers

# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23, so prefer the standalone `joblib` package and fall back to the vendored
# copy only on old installations.
try:
    import joblib #for saving label encoder, we will need it later
except ImportError:
    from sklearn.externals import joblib

# One encoder per categorical column; the names are kept at notebook level so
# the fitted encoders stay available after this cell.
event_name_le = preprocessing.LabelEncoder()
route_name_le = preprocessing.LabelEncoder()
stop_name_le = preprocessing.LabelEncoder()
weekday_le = preprocessing.LabelEncoder()

# joblib.dump does not create missing directories, so make sure the target
# folder exists before saving.
label_encoder_dir = 'data/label_encoders'
os.makedirs(label_encoder_dir, exist_ok=True)

# (column to encode, encoder fitted on it, file the fitted encoder is saved to)
column_encoders = [
    ('Event Name', event_name_le, 'event_name_le.pkl'),
    ('Route Name', route_name_le, 'route_name_le.pkl'),
    ('Stop Name', stop_name_le, 'stop_name_le.pkl'),
    ('weekday', weekday_le, 'weekday_le.pkl'),
]

# Fit each encoder on its column, overwrite the column with the integer codes,
# and persist the fitted encoder.
# Note: the columns are overwritten in place, so re-running this cell would
# refit the encoders on the integer codes and overwrite the saved files.
for column, encoder, filename in column_encoders:
    ahmd_traffic[column] = encoder.fit_transform(ahmd_traffic[column])
    joblib.dump(encoder, label_encoder_dir + '/' + filename)

#### Saving processed dataframe

In [None]:
# Write the processed frame to CSV without the row index, then preview it.
ahmd_traffic.to_csv(
    path_or_buf="data/ahmd_traffic_clean_df.csv",
    index=False,
)
ahmd_traffic.head(n=5)