In [None]:
import pandas as pd
import numpy as np
import pickle
import preprocess_tools as tools
import itertools
from sklearn.metrics import precision_score,recall_score,accuracy_score,confusion_matrix, roc_auc_score

In [None]:
# Load the pretrained model from disk.
# The context manager closes the file handle as soon as unpickling finishes
# (the bare open(...) call previously left it open for the life of the kernel).
# NOTE: pickle.load can execute arbitrary code embedded in the file - only load
# model files that come from a trusted source.
with open('final_model.sav', 'rb') as model_file:
    rf = pickle.load(model_file)

In [None]:
# Read the raw inputs for prediction: one CSV of sensor measurements and one of labels.
# NOTE(review): both paths are absolute and machine-specific; point them at your local
# copy of the data before running.
features_csv = '/Users/grandia/Downloads/safety/features/part-00000-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv'
labels_csv = '/Users/grandia/Downloads/safety/labels/part-00000-e9445087-aa0a-433b-a7f6-7f4c19d78ad6-c000.csv'

df_measurement = pd.read_csv(features_csv)
df_label = pd.read_csv(labels_csv)

In [None]:
# Cast the join key to str in both frames so it has the same dtype on each side.
for frame in (df_measurement, df_label):
    frame['bookingID'] = frame['bookingID'].astype(str)

# Attach the label columns to the measurement rows, matching on bookingID.
df_merge = df_measurement.merge(right=df_label, on='bookingID')

In [None]:
# Drop bookings that have fewer than MIN_READINGS readings.
# NOTE(review): the previous code selected `count_booking > 60`, which discarded every
# booking with MORE than 60 readings and kept only the short ones - the opposite of the
# stated intent ("drop booking ID which has less than 30 readings"). The comparison is
# now `<` and the threshold follows that stated intent; confirm it matches the filter
# used when the model was trained.
MIN_READINGS = 30

# Number of non-null Speed readings recorded for each booking.
count_booking = df_merge.groupby('bookingID').Speed.count()
id_to_be_dropped = list(count_booking[count_booking < MIN_READINGS].index)

# Remove every row that belongs to one of the under-sampled bookings.
df_merge = df_merge.drop(df_merge[df_merge['bookingID'].isin(id_to_be_dropped)].index, axis=0)

In [None]:
# Split the merged frame into one sub-frame per bookingID, keyed by that ID.
grouped = df_merge.groupby('bookingID')
groups = {booking_id: booking_df for booking_id, booking_df in grouped}

In [None]:
# Call the project helper that builds the gravity-adjustment frame (df_g) from the
# per-booking groups. This might take a while depending on how much data there is.
# NOTE(review): create_gravity_adjustment_df is defined in preprocess_tools and is not
# visible here; presumably it estimates the gravity component ("g") of the
# accelerometer readings for each booking - confirm against that module.
df_g = tools.create_gravity_adjustment_df(groups)

In [None]:
# Clean and reorient the merged readings using the gravity-adjustment frame.
# This might take a while depending on how much data there is.
# NOTE: df_merge is rebound to the helper's output, so re-running this cell passes the
# already-processed frame back in; re-run from the merge cell to start from raw data.
df_merge = tools.process_clean_and_reorient(df_merge, df_g)

In [None]:
# Re-split the cleaned, reoriented frame into one sub-frame per bookingID.
grouped_feature = df_merge.groupby('bookingID')
groups_feature = {trip_id: trip_df for trip_id, trip_df in grouped_feature}

In [None]:
# Extract the feature frame and the label frame from the per-booking groups.
# Warnings may be emitted for bookings that only have a few measurements.
# NOTE: this rebinds df_label, replacing the label frame that was loaded from CSV
# earlier in the notebook with the one returned by the helper.
df_features, df_label = tools.extract_features(groups_feature)

In [None]:
# Some extracted features come out as +/-infinity or NaN.
# Step 1: turn both infinities into NaN so that every invalid value looks the same.
df_features.replace(to_replace=[np.inf, -np.inf], value=np.nan, inplace=True)
# Step 2: zero-fill all NaN entries (the frame is modified in place).
df_features.fillna(value=0, inplace=True)

In [None]:
# Assemble the model inputs: the feature frame is used as-is, and the labels are
# flattened into a 1-D array for the metric functions.
X_test = df_features
label_values = df_label.values
y_test = label_values.ravel()

In [None]:
# Score the pretrained model on the prepared data.
# Hard class predictions feed the confusion matrix and the threshold-based metrics.
y_pred = rf.predict(X_test)
print("Confusion Matrix \n", confusion_matrix(y_test,y_pred))
for metric_name, metric_fn in (
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("Accuracy", accuracy_score),
):
    print(metric_name, metric_fn(y_test, y_pred))

# ROC AUC is computed from the scores in column 1 of predict_proba.
# NOTE(review): this assumes column 1 corresponds to the positive class - confirm via rf.classes_.
y_pred_proba = rf.predict_proba(X_test)
positive_scores = y_pred_proba[:,1]
print("ROC AUC", roc_auc_score(y_test, positive_scores))