## Importing Packages

In [1]:
from math import sin, cos
import s2cell

import numpy as np
import pandas as pd
import datetime as dt

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

import scipy.stats as stats
import pingouin as pg

import warnings
warnings.filterwarnings('ignore')

import category_encoders as ce
import sklearn

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost.sklearn import XGBClassifier

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import confusion_matrix, classification_report, f1_score, recall_score, precision_score, plot_confusion_matrix
from sklearn.metrics import roc_auc_score, balanced_accuracy_score
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.pipeline import Pipeline as imbPipeline

import warnings
warnings.filterwarnings('ignore')

sns.set()
pd.set_option('display.max_columns', 500)

In [2]:
samp_df = pd.read_csv('sample_submission.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'sample_submission.csv'

In [None]:
test_df = pd.read_csv('data_test.csv')

In [None]:
train_df = pd.read_csv('data_train.csv')

In [None]:
raw_irre_df = pd.read_csv('irregularities.csv')

In [None]:
raw_alerts_df = pd.read_csv('alerts.csv')

## Checking Raw Dataset

> #### Data Sample Submission CSV

In [None]:
samp_df.head(3)

In [None]:
samp_df.info()

> #### Data Test CSV

In [None]:
test_df.head(3)

In [None]:
test_df.info()

> #### Data Train CSV

In [None]:
train_df.head(3)

In [None]:
train_df['Ids'].min()

In [None]:
train_df['Ids'].max()

In [None]:
train_df.info()

> #### Data Irregularities CSV

In [None]:
raw_irre_df.head(3)

In [None]:
raw_irre_df.info()

In [None]:
raw_irre_df.describe(include='all')

In [None]:
# Checking raw datasets

desc = []
for i in raw_irre_df.columns:
    desc.append([i,
                raw_irre_df[i].dtypes,
                raw_irre_df[i].isna().sum(),
                round((raw_irre_df[i].isna().sum())/len(raw_irre_df)*100,2),
                raw_irre_df[i].nunique(),
                raw_irre_df[i].sample(2).values])
pd.DataFrame(desc, columns = ['dataFeatures', 'dataType',' null', 'nullPct', 'unique', 'uniqueSample'])

In [None]:
a_tes1 = dt.datetime.fromtimestamp(raw_irre_df['detection_date_millis'][0]/1000.0)  # checking

In [None]:
a_tes2 = dt.datetime.fromtimestamp(raw_irre_df['update_date_millis'][0]/1000.0)  # checking

In [None]:
a_tes1, a_tes2

> #### Alerts CSV

In [None]:
raw_alerts_df.head(3)

In [None]:
raw_alerts_df.info()

In [None]:
raw_alerts_df.describe(include='all')

In [None]:
# Checking raw datasets

desc = []
for i in raw_alerts_df.columns:
    desc.append([i,
                raw_alerts_df[i].dtypes,
                raw_alerts_df[i].isna().sum(),
                round((raw_alerts_df[i].isna().sum())/len(raw_alerts_df)*100,2),
                raw_alerts_df[i].nunique(),
                raw_alerts_df[i].sample(2).values])
pd.DataFrame(desc, columns = ['dataFeatures', 'dataType',' null', 'nullPct', 'unique', 'uniqueSample'])

## Preprocessing

> #### Adding "Ids" Column in "Alerts"

In [None]:
alerts_df = raw_alerts_df.copy()

In [None]:
alerts_df.head(3)

In [None]:
alerts_df['dates'] = [dt.datetime.fromtimestamp(i/1000.0) for i in alerts_df['pub_millis']]

In [None]:
alerts_df['dates'][:3]  # checking

In [None]:
alerts_df['date'] = [i.strftime("%Y-%m-%d") for i in alerts_df['dates']]
alerts_df['hour'] = [i.strftime("%H") for i in alerts_df['dates']]
alerts_df['hour'] = alerts_df['hour'].str.replace("0", "", 1)
alerts_df['Ids'] = alerts_df.s2token_15 + '_' + alerts_df.date + '_' + alerts_df.hour

In [None]:
alerts_df[['Ids']]  # checking

> #### Adding "Ids" Column in "Irregularities"

In [None]:
irre_df = raw_irre_df.copy()

In [None]:
irre_df.head(3)

In [None]:
irre_df['dates'] = [dt.datetime.fromtimestamp(i/1000.0) for i in irre_df['detection_date_millis']]

In [None]:
irre_df['dates'][:3]  # checking

In [None]:
irre_df['date'] = [i.strftime("%Y-%m-%d") for i in irre_df['dates']]
irre_df['hour'] = [i.strftime("%H") for i in irre_df['dates']]
irre_df['hour'] = irre_df['hour'].str.replace("0", "", 1)
irre_df['Ids'] = irre_df.s2token_center + '_' + irre_df.date + '_' + irre_df.hour

In [None]:
irre_df[['Ids']]  # checking

> #### Combining "Alerts" and "Irregularities"

In [None]:
comb_df = pd.merge(alerts_df, irre_df, on='Ids')

In [None]:
# comb_df = pd.merge(alerts_df, irre_df, left_on=['Ids', 's2token_15', 's2id_15', 'street'], right_on=['Ids', 's2token_center', 's2id_center', 'street'])  # checking

In [None]:
comb_df = pd.merge(train_df, comb_df, on='Ids')

In [None]:
comb_df

In [None]:
(comb_df.isna().sum()/len(comb_df)*100).nlargest(10)

> #### Removing Features

In [None]:
comb_df.drop(columns=['report_description', 'report_by_municipality_user'], inplace=True)

> #### Converting Labels to Zeros and Ones

In [None]:
comb_df['Labels'] = np.where(comb_df['Labels']==True, 1, 0)

In [None]:
comb_df.head(3)

In [None]:
comb_df.columns

> #### Merging Columns

In [None]:
data = comb_df[['Ids', 'Labels', 's2id_15', 's2token_15', 'road_type', 'street_x', 'city_x', 'magvar', 'reliability', 'report_rating', 'confidence', 'type_x', 'subtype', 'longitude', 'latitude', 'dates_x', 'date_x', 'hour_x', 'is_highway', 'line', 'speed',
       'regular_speed', 'delay_seconds', 'seconds', 'length', 'trend', 'type_y', 'severity', 'jam_level', 'drivers_count', 'alerts_count', 'n_thumbs_up_y']]

In [None]:
data.head(3)

In [None]:
# Checking

print(len(data.columns))
print(comb_df['s2token_15'].nunique())

- __Determining the Labels__

In [None]:
data[data['Labels']==1]['jam_level'].value_counts()

In [None]:
data[data['Labels']==0]['jam_level'].value_counts()

1 is High Jam Reports and 0 is No Congestion 

In [None]:
# Data is highly unbalanced, balancing method needs to be implemented

data['Labels'].value_counts(normalize=True).plot(kind='pie')
plt.title('Ratio Between High Jam Reports (1) and No Congestion (0)')

In [None]:
data_ml = data[['latitude', 'longitude', 'date_x', 'hour_x', 'Labels']]
data_ml.rename(columns={'date_x': 'date', 'hour_x': 'hour'}, inplace=True)

In [None]:
data_ml['date'] = data_ml['date'].str.split('-')

In [None]:
data_ml['year'] = data_ml['date'].apply(lambda x: int(x[0]))
data_ml['month'] = data_ml['date'].apply(lambda x: int(x[1]))
data_ml['day'] = data_ml['date'].apply(lambda x: int(x[2]))
data_ml['hour'] = data_ml['hour'].astype('int')

In [None]:
data_ml['x'] = data_ml['latitude'].apply(cos) * data_ml['longitude'].apply(cos)
data_ml['y'] = data_ml['latitude'].apply(cos) * data_ml['longitude'].apply(sin)
data_ml['z'] = data_ml['latitude'].apply(sin)

In [None]:
data_ml = data_ml[['x', 'y', 'z', 'year', 'month', 'day', 'hour', 'Labels']]

In [None]:
data_ml.head(3)

> #### Selecting Feature and Target

In [None]:
X = data_ml[['x', 'y', 'z', 'year', 'month', 'day', 'hour']]
y = data_ml['Labels']

In [None]:
X.head()

In [None]:
y[:5]

> #### Splitting Training-validation Set and Testing Set

In [None]:
# Splitting training-validation set dan testing set

X_train_val, X_test, y_train_val, y_test= train_test_split(X,y,stratify=y, random_state=2021,test_size=0.2)
skfold = StratifiedKFold(n_splits = 5, random_state=2021, shuffle=True)

## Creating Benchmark Models

In [63]:
sklearn.metrics.SCORERS.keys() 

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_weighted'])

> #### Creating Logistic Regression Benchmark 

In [64]:
logreg = LogisticRegression(random_state=2021)

transformer_logreg_bench = ColumnTransformer([ 
    ('ordinal', ce.OrdinalEncoder(), ['year', 'month', 'day', 'hour'])
], remainder='passthrough')

pipe_logreg_bench = imbPipeline([
    ('transformer', transformer_logreg_bench),
    ('logreg', logreg)
])

In [65]:
logreg_bench_cv = cross_val_score(pipe_logreg_bench, X_train_val, y_train_val, cv=skfold, scoring='f1')
print('CV Logreg Benchmark:', logreg_bench_cv)
print('CV Logreg Benchmark Mean:', logreg_bench_cv.mean())

CV Logreg Benchmark: [0.98398971 0.98398161 0.98401905 0.98396944 0.98398563]
CV Logreg Benchmark Mean: 0.9839890860990159


> #### Creating Decision Tree Classifier Benchmark 

In [66]:
dtc = DecisionTreeClassifier(random_state=2021)

transformer_dtc_bench = ColumnTransformer([ 
    ('ordinal', ce.OrdinalEncoder(), ['year', 'month', 'day', 'hour'])
], remainder='passthrough')

pipe_dtc_bench = Pipeline([
    ('transformer', transformer_dtc_bench),
    ('dtc', dtc)
])

In [67]:
dtc_bench_cv = cross_val_score(pipe_dtc_bench, X_train_val, y_train_val, cv=skfold, scoring='f1')
print('CV DTC Benchmark:', dtc_bench_cv)
print('CV DTC Benchmark Mean:', dtc_bench_cv.mean())

CV DTC Benchmark: [1.         1.         1.         0.99999899 1.        ]
CV DTC Benchmark Mean: 0.9999997976566618


> #### Creating XGBoost Classifier Benchmark 

In [68]:
xgbc = XGBClassifier(random_state=2021)

transformer_xgbc_bench = ColumnTransformer([ 
    ('ordinal', ce.OrdinalEncoder(), ['year', 'month', 'day', 'hour'])
], remainder='passthrough')

pipe_xgbc_bench = Pipeline([
    ('transformer', transformer_xgbc_bench),
    ('xgbc', xgbc)
])

In [69]:
xgbc_bench_cv = cross_val_score(pipe_xgbc_bench, X_train_val, y_train_val, cv=skfold, scoring='f1')
print('CV XGBC Benchmark:', xgbc_bench_cv)
print('CV XGBC Benchmark Mean:', xgbc_bench_cv.mean())

CV XGBC Benchmark: [0.999913   0.99996864 0.99994031 0.99992918 0.99994638]
CV XGBC Benchmark Mean: 0.9999395031292432


> #### Creating Ada Boost Classifier Benchmark 

In [70]:
abc = AdaBoostClassifier(random_state=2021)

transformer_abc_bench = ColumnTransformer([ 
    ('ordinal', ce.OrdinalEncoder(), ['year', 'month', 'day', 'hour'])
], remainder='passthrough')

pipe_abc_bench = Pipeline([
    ('transformer', transformer_abc_bench),
    ('abc', abc)
])

In [71]:
abc_bench_cv = cross_val_score(pipe_abc_bench, X_train_val, y_train_val, cv=skfold, scoring='f1')
print('CV ABC Benchmark:', abc_bench_cv)
print('CV ABC Benchmark Mean:', abc_bench_cv.mean())

CV ABC Benchmark: [0.99326289 0.99317968 0.99320714 0.99313316 0.99310733]
CV ABC Benchmark Mean: 0.9931780380522739


> #### Test Iseng

In [72]:
samp_df.head()

Unnamed: 0,Ids,Labels
0,2e69c8bc4_2020-11-29_18,False
1,2e6992f24_2020-11-27_18,True
2,2e69e8dfc_2020-11-28_15,True
3,2e69c47f4_2020-11-26_19,True
4,2e68e64d4_2020-11-29_11,True


In [73]:
test_df.head()

Unnamed: 0,Ids
0,2e6992a84_2020-11-25_18
1,2e68e62f4_2020-11-29_20
2,2e68e81a4_2020-11-27_10
3,2e69eec04_2020-11-24_7
4,2e698e4a4_2020-11-27_8


In [74]:
# Creating function to convert Ids Table into Proper Independent DF Table for Testing

def ids_to_df(df):
    test_df2 = pd.DataFrame()
    test_df2['s2cell_token'] = df['Ids'].str.split('_').apply(lambda x: x[0])
    test_df2['date'] = df['Ids'].str.split('_').apply(lambda x: x[1])
    test_df2['date'] = test_df2['date'].str.split('-')
    test_df2['hour'] = df['Ids'].str.split('_').apply(lambda x: x[2])
    
    test_df2['year'] = test_df2['date'].apply(lambda x: int(x[0]))
    test_df2['month'] = test_df2['date'].apply(lambda x: int(x[1]))
    test_df2['day'] = test_df2['date'].apply(lambda x: int(x[2]))
    test_df2['hour'] = test_df2['hour'].astype('int')
    
    test_df2['latitude'] = test_df2['s2cell_token'].apply(s2cell.token_to_lat_lon).apply(lambda x: x[0])
    test_df2['longitude'] = test_df2['s2cell_token'].apply(s2cell.token_to_lat_lon).apply(lambda x: x[1])
    
    test_df2['x'] = test_df2['latitude'].apply(cos) * test_df2['longitude'].apply(cos)
    test_df2['y'] = test_df2['latitude'].apply(cos) * test_df2['longitude'].apply(sin)
    test_df2['z'] = test_df2['latitude'].apply(sin)
    
    test_df2 = test_df2[['x', 'y', 'z', 'year', 'month', 'day', 'hour']]
    
    return test_df2

In [75]:
ids_to_df(test_df)

Unnamed: 0,x,y,z,year,month,day,hour
0,0.994031,0.108207,-0.013921,2020,11,25,18
1,0.561143,0.574345,-0.596025,2020,11,29,20
2,0.525794,0.591784,-0.611009,2020,11,27,10
3,0.996602,-0.021315,-0.079557,2020,11,24,7
4,0.972344,0.233176,0.013294,2020,11,27,8
...,...,...,...,...,...,...,...
13836,0.520341,0.621773,-0.585357,2020,11,26,5
13837,0.953392,0.301451,0.013105,2020,11,24,22
13838,0.991486,-0.047556,-0.121218,2020,11,24,10
13839,0.951855,0.294791,-0.084091,2020,11,24,18


> #### DTC Bench

In [76]:
pipe_dtc_bench.fit(X, y)

Pipeline(steps=[('transformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('ordinal', OrdinalEncoder(),
                                                  ['year', 'month', 'day',
                                                   'hour'])])),
                ('dtc', DecisionTreeClassifier(random_state=2021))])

In [77]:
y_pred_dtc_bench = pipe_dtc_bench.predict(ids_to_df(test_df))

In [78]:
result_dtc_bench = test_df.copy()
result_dtc_bench['Labels'] = y_pred_dtc_bench
result_dtc_bench['Labels'] = np.where(result_dtc_bench['Labels']==1, True, False)

In [79]:
result_dtc_bench

Unnamed: 0,Ids,Labels
0,2e6992a84_2020-11-25_18,True
1,2e68e62f4_2020-11-29_20,True
2,2e68e81a4_2020-11-27_10,True
3,2e69eec04_2020-11-24_7,True
4,2e698e4a4_2020-11-27_8,True
...,...,...
13836,2e68dd414_2020-11-26_5,True
13837,2e698541c_2020-11-24_22,True
13838,2e69e8e0c_2020-11-24_10,True
13839,2e699a1cc_2020-11-24_18,False


In [80]:
# result_dtc_bench.to_csv('TruePositive_1.csv', index=False)