In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV, train_test_split, StratifiedKFold
from sklearn.calibration import CalibratedClassifierCV
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, plot_precision_recall_curve, plot_roc_curve, roc_curve, confusion_matrix
from sklearn.feature_selection import RFE

from sklearn.preprocessing import StandardScaler, PolynomialFeatures, KBinsDiscretizer
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import classification_report

import datetime as dt

# Raise pandas' display limits so frames are rendered with up to 500 rows
# and 500 columns before being truncated.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

## Continue modelling

In [None]:
# Load the prepared training table and drop the raw 'lat'/'lon' columns.
ready = pd.read_csv('new_ready.csv').drop(['lat', 'lon'],axis=1)
# Bare name as the last expression: the notebook displays the frame.
ready

In [None]:
# Load the prepared test table (all columns are kept as read from the CSV).
test = pd.read_csv('new_test.csv')
test

In [None]:
# Columns fed to the models and the name of the target column.
features = ['x', 'y', 'z', 'day', 'month', 'hour_y']
label = 'Labels'

X = ready[features]
y = ready[label]

# Hold out 20% of the rows, stratified on the label, with a fixed seed.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=.2,
    stratify=y,
    random_state=24,
)

In [None]:
# Discretiser producing 3 equal-width, ordinal-encoded bins per column.
kbins = KBinsDiscretizer(encode='ordinal', strategy='uniform', n_bins=3)

# x/y/z get a degree-2 polynomial expansion without a bias column,
# day/hour_y are binned, every other column is passed through unchanged.
transformer = ColumnTransformer(
    transformers=[
        ('poly', PolynomialFeatures(include_bias=False, degree=2), ['x', 'y', 'z']),
        ('kbins', kbins, ['day', 'hour_y']),
    ],
    remainder='passthrough',
)

# Preprocess -> SMOTE oversampling -> logistic regression, both seeded.
logit_smo = Pipeline(steps=[
    ('trans', transformer),
    ('smote', SMOTE(random_state=2020)),
    ('clf', LogisticRegression(random_state=2020)),
])

In [None]:
# Grid for the SMOTE + logistic-regression pipeline. Keys address nested
# parameters as <step>__<parameter>.
param_space = dict(
    trans__poly__degree=[2, 3],
    smote__k_neighbors=[3, 4, 5],
    clf__C=[0.01, 0.1, 1],
    clf__solver=['lbfgs', 'liblinear'],
)

In [None]:
# Stratified K-fold splitter with default settings; also passed to the
# voting-classifier grid search further down.
skf = StratifiedKFold()

# Exhaustive search over param_space, scored by F1, on all available cores.
logit_grid = GridSearchCV(
    estimator=logit_smo,
    param_grid=param_space,
    cv=skf,
    scoring='f1',
    verbose=2,
    n_jobs=-1,
)

In [None]:
# Run the grid search. The next cells read logit_grid.best_params_ and
# logit_grid.best_estimator_, which only exist on a fitted search, so this
# call must execute for the notebook to run top to bottom.
logit_grid.fit(X_train_val, y_train_val)

In [None]:
# Show the best parameter combination found by the grid search
# (available only after logit_grid has been fitted).
logit_grid.best_params_

In [None]:
# Keep the best pipeline from the grid search under its own name.
logit_tuned = logit_grid.best_estimator_

In [None]:
# Fit the selected pipeline on the train/validation split.
# NOTE(review): logit_grid is created without a `refit` argument; with
# GridSearchCV's documented default the best estimator is already refitted on
# the data given to fit(), so this call may be redundant - confirm.
logit_tuned.fit(X_train_val, y_train_val)

In [None]:
# Fit the untuned pipeline on the same split so it can be compared below.
logit_smo.fit(X_train_val, y_train_val)

In [None]:
# Hold-out predictions and per-class report for the tuned pipeline.
logit_tuned_pred = logit_tuned.predict(X_test)
print(classification_report(y_test, logit_tuned_pred))

In [None]:
# Hold-out predictions and per-class report for the untuned pipeline.
logit_pred = logit_smo.predict(X_test)
print(classification_report(y_test, logit_pred))

In [None]:
# ROC curve of the tuned pipeline on the hold-out split.
plot_roc_curve(logit_tuned, X_test, y_test)

In [None]:
# Second column of predict_proba (taken here as the score for label 1) on
# the hold-out split, and the ROC points computed from those scores.
logit_prob1 = logit_tuned.predict_proba(X_test)[:,1]
fpr, tpr, thr = roc_curve(y_test, logit_prob1)

In [None]:
# Tabulate the ROC points and list the thresholds whose true-positive rate
# lies between 0.8 and 0.9 (bounds inclusive).
roc_df = pd.DataFrame({'fpr': fpr, 'tpr': tpr, 'thr': thr})
roc_df[roc_df['tpr'].between(0.8, 0.9)]

In [None]:
# Re-label the hold-out split with a hand-picked probability cut-off
# (hard-coded value) instead of the default decision rule.
new_thr = 0.371134
y_pred037 = (logit_tuned.predict_proba(X_test)[:, 1] > new_thr).astype(int)
print(classification_report(y_test, y_pred037))

In [None]:
# Confusion matrix for the thresholded predictions of the tuned pipeline.
confusion_matrix(y_test, y_pred037)

## VotingClassifier

In [None]:
# DecisionTreeClassifier
# random_state is pinned (same seed as the other estimators in this notebook):
# scikit-learn permutes the candidate features at every split, so an unseeded
# tree can differ from run to run.
dtc = DecisionTreeClassifier(max_depth=10, min_samples_split=100, random_state=2020)
# dtc_cal = CalibratedClassifierCV(dtc, method='isotonic')

# GaussianNB
gnb = GaussianNB()

# VotingClassifier
# Soft voting over the tuned logistic pipeline, the tree and naive Bayes.
vclf = VotingClassifier(
    [('logit', logit_tuned), ('dtc', dtc), ('gaussian', gnb)],
    voting='soft',
    n_jobs=-1,
)

In [None]:
# Display the decision tree that is part of the voting ensemble.
# `dtc_cal` is only assigned in commented-out code, so evaluating it here
# would raise NameError on a fresh kernel.
dtc

In [None]:
# Grid for the voting classifier: only the decision-tree member ('dtc') is
# tuned, addressed as <estimator name>__<parameter>.
param_space_vclf = dict(
    dtc__max_depth=[10, 20],
    dtc__min_samples_split=[100, 200],
)

In [None]:
# Grid search over the tree parameters of the voting classifier, scored by
# F1 with the shared stratified splitter, then fitted on the training split.
vclf_grid = GridSearchCV(
    estimator=vclf,
    param_grid=param_space_vclf,
    cv=skf,
    scoring='f1',
    verbose=2,
    n_jobs=-1,
)

vclf_grid.fit(X_train_val, y_train_val)

In [None]:
# Show the best tree parameters found for the voting classifier.
vclf_grid.best_params_

In [None]:
# DecisionTreeClassifier
# Tree settings are written out by hand (not read from vclf_grid.best_params_)
# and add class_weight='balanced', which was not part of the searched grid.
# random_state is pinned (same seed as the other estimators in this notebook):
# scikit-learn permutes the candidate features at every split, so an unseeded
# tree can differ from run to run.
dtc_tuned = DecisionTreeClassifier(
    max_depth=20,
    min_samples_split=100,
    class_weight='balanced',
    random_state=2020,
)
# dtc_cal = CalibratedClassifierCV(dtc, method='isotonic')

# GaussianNB
gnb = GaussianNB()

# VotingClassifier
# Soft voting over the tuned logistic pipeline, the tuned tree and naive Bayes.
vclf_tuned = VotingClassifier(
    [('logit', logit_tuned), ('dtc', dtc_tuned), ('gaussian', gnb)],
    voting='soft',
    n_jobs=-1,
)

In [None]:
# Fit the hand-configured ensemble on the train/validation split and print a
# marker when the call returns.
vclf_tuned.fit(X_train_val, y_train_val)
print('done fitting!')

In [None]:
# Hold-out predictions of the ensemble with its default decision rule,
# followed by the per-class report and the confusion matrix.
vclf_pred_tuned = vclf_tuned.predict(X_test)
print(classification_report(y_test, vclf_pred_tuned))
display(confusion_matrix(y_test, vclf_pred_tuned))

In [None]:
# ROC curve of the ensemble on the hold-out split.
plot_roc_curve(vclf_tuned, X_test, y_test)

In [None]:
# ROC points for the ensemble's second predict_proba column. Among the
# thresholds with a true-positive rate in [0.95, 1.0], show the five with
# the lowest false-positive rate.
# Note: this rebinds fpr/tpr/thr/roc_df from the logistic-regression cells.
fpr, tpr, thr = roc_curve(y_test, vclf_tuned.predict_proba(X_test)[:,1])
roc_df = pd.DataFrame({'fpr': fpr, 'tpr': tpr, 'thr': thr})
roc_df[roc_df['tpr'].between(0.95, 1.0)].sort_values('fpr').head()

In [None]:
# Re-label the hold-out split with a hand-picked probability cut-off
# (hard-coded value) for the ensemble, then report the result.
new_thr = 0.567104
y_pred0567 = (vclf_tuned.predict_proba(X_test)[:, 1] > new_thr).astype(int)

print(classification_report(y_test, y_pred0567))
display(confusion_matrix(y_test, y_pred0567))

In [None]:
# Refit the ensemble on every labelled row before predicting the submission.
vclf_tuned.fit(X, y)
# Predict on exactly the columns the model was fitted on (X = ready[features]),
# in the same order. `test` is the whole CSV as read, so any additional column
# in it would otherwise be passed on to the estimators.
# NOTE(review): predict() applies the default decision rule, not the custom
# threshold explored above - confirm that this is intended.
submit = vclf_tuned.predict(test[features])

In [None]:
# Number of predictions produced for the submission.
len(submit)

In [None]:
# Attach the predictions to the original test file as a 'Labels' column.
# The assignment is positional, so `submit` must have one entry per row of
# data_test.csv, in the same order.
test_submit = pd.read_csv('data_test.csv')
test_submit['Labels'] = submit

test_submit.head()

In [None]:
# Write the submission file without the index column.
test_submit.to_csv('submit2.csv', index=False)

## Try another technique

In [2]:
# Load the cleaned training table plus three lookup tables that are merged
# into the test features near the end of the notebook.
ready_clean = pd.read_csv('ready_clean.csv')
db_street = pd.read_csv('db_street.csv')
db_road_type = pd.read_csv('db_road_type.csv')
db_jam_level = pd.read_csv('db_jam_level.csv')
ready_clean

Unnamed: 0,s2token_15,lat,lon,day,month,Labels,hour_y,street_y,road_type,jam_level
0,2e698e484,-6.270088,107.043609,25,9,True,10,Lambang Jaya,2,4
1,2e698e484,-6.270088,107.043609,25,9,True,10,Lambang Jaya,2,4
2,2e698e484,-6.270088,107.043609,25,9,True,10,Lambang Jaya,2,4
3,2e698e484,-6.270088,107.043609,25,9,True,10,Lambang Jaya,2,3
4,2e698e484,-6.270088,107.043609,25,9,True,10,Lambang Jaya,2,4
...,...,...,...,...,...,...,...,...,...,...
3674059,2e68e6874,-6.886813,107.581583,22,11,True,17,Prof Dr Surya Sumantri,2,4
3674060,2e68e6874,-6.886813,107.581583,22,11,True,17,Prof Dr Surya Sumantri,2,4
3674061,2e68e6874,-6.886813,107.581583,22,11,True,17,Prof Dr Surya Sumantri,2,4
3674062,2e68e6874,-6.886813,107.581583,22,11,True,17,Prof Dr Surya Sumantri,2,4


In [3]:
def ids_to_df(df):
    """Turn an 'Ids' column into model features.

    Each 'Ids' value is split on '_' into an S2 cell token, a date written
    as '<year>-<month>-<day>' and an hour. The token is converted to a
    (lat, lon) pair with ``s2cell.token_to_lat_lon``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column 'Ids'.

    Returns
    -------
    pandas.DataFrame
        Columns 'x', 'y', 'z', 'day', 'month', 'hour', indexed like ``df``.

    Notes
    -----
    * ``s2cell`` must be imported before this function is called; in this
      notebook the import sits in a later cell.
    * A later cell defines ``ids_to_df`` again (with 'hour_y' and
      'token_hour' columns); that definition replaces this one.
    * NOTE(review): np.cos/np.sin interpret their argument as radians. If
      ``token_to_lat_lon`` yields degrees, x/y/z are not true unit-sphere
      coordinates. The training features in this notebook are built the same
      way, so change both together or not at all.
    """
    # Split every id once and reuse the pieces.
    id_parts = df['Ids'].str.split('_')
    tokens = id_parts.apply(lambda parts: parts[0])
    date_parts = id_parts.apply(lambda parts: parts[1]).str.split('-')
    hour_text = id_parts.apply(lambda parts: parts[2])

    # One token -> (lat, lon) lookup per row; both coordinates come from it.
    lat_lon = tokens.apply(s2cell.token_to_lat_lon)
    lat = lat_lon.apply(lambda pair: pair[0])
    lon = lat_lon.apply(lambda pair: pair[1])
    cos_lat = np.cos(lat)

    return pd.DataFrame({
        'x': cos_lat * np.cos(lon),
        'y': cos_lat * np.sin(lon),
        'z': np.sin(lat),
        'day': date_parts.apply(lambda ymd: int(ymd[2])),
        'month': date_parts.apply(lambda ymd: int(ymd[1])),
        'hour': hour_text.astype('int'),
    })

In [4]:
# Derive x/y/z from lat/lon and then discard the raw coordinates.
# NOTE(review): np.cos/np.sin take radians, while the lat/lon values shown in
# the table above look like degrees. ids_to_df builds the test features the
# same way, so any correction has to be applied to both places together.
ready_clean = ready_clean.assign(
    x=np.cos(ready_clean['lat']) * np.cos(ready_clean['lon']),
    y=np.cos(ready_clean['lat']) * np.sin(ready_clean['lon']),
    z=np.sin(ready_clean['lat']),
).drop(columns=['lat', 'lon'])

In [5]:
# Remove the S2 token column in place.
# Re-running this cell raises KeyError because the column is already gone.
ready_clean.drop('s2token_15', axis=1, inplace=True)

In [6]:
# Preview the frame after the coordinate and token columns were replaced.
ready_clean.head()

Unnamed: 0,day,month,Labels,hour_y,street_y,road_type,jam_level,x,y,z
0,25,9,True,10,Lambang Jaya,2,4,0.973706,0.227431,0.013097
1,25,9,True,10,Lambang Jaya,2,4,0.973706,0.227431,0.013097
2,25,9,True,10,Lambang Jaya,2,4,0.973706,0.227431,0.013097
3,25,9,True,10,Lambang Jaya,2,3,0.973706,0.227431,0.013097
4,25,9,True,10,Lambang Jaya,2,4,0.973706,0.227431,0.013097


In [7]:
# Count missing values per column.
ready_clean.isna().sum()

day          0
month        0
Labels       0
hour_y       0
street_y     0
road_type    0
jam_level    0
x            0
y            0
z            0
dtype: int64

In [8]:
# Number of distinct street names.
# The original note here read "binary" - presumably a reminder that this
# column is binary-encoded in the pipeline below; confirm.
ready_clean['street_y'].nunique()

277

In [9]:
import category_encoders as ce

In [15]:
# street_y: fill missing values with the most frequent one, then encode with
# category_encoders' BinaryEncoder.
pipe_street = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('binary', ce.BinaryEncoder()),
])

# Column-wise preprocessing; columns not listed here are passed through.
transformer = ColumnTransformer(
    transformers=[
        ('binary', pipe_street, ['street_y']),
        ('poly', PolynomialFeatures(degree=3), ['x', 'y', 'z']),
        ('kbins', KBinsDiscretizer(encode='ordinal', strategy='uniform', n_bins=3), ['day', 'hour_y']),
        ('imputer', SimpleImputer(strategy='median'), ['road_type', 'jam_level']),
    ],
    remainder='passthrough',
)

# Preprocess -> recursive feature elimination -> classifier. The selector and
# the final classifier use identically configured logistic regressions.
logit_new_pipe = Pipeline(steps=[
    ('trans', transformer),
    ('fs', RFE(estimator=LogisticRegression(
        C=0.01,
        solver='liblinear',
        class_weight='balanced',
        random_state=2020,
        verbose=1,
    ))),
    ('clf', LogisticRegression(
        C=0.01,
        solver='liblinear',
        class_weight='balanced',
        random_state=2020,
        verbose=1,
    )),
])

In [11]:
# Feature matrix and 0/1 target for the second modelling attempt.
# (Rebinds X and y from the first part of the notebook.)
X = ready_clean[[
    'x', 'y', 'z',
    'day', 'month', 'hour_y',
    'street_y', 'road_type', 'jam_level',
]]
y = (ready_clean['Labels'] == True).astype(int).to_numpy()

In [12]:
# Stratified 80/20 hold-out split with a fixed seed.
# (Rebinds the split variables from the first part of the notebook.)
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, stratify=y, test_size=.2, random_state=2020)

In [13]:
# Preview the training features.
X_train_val.head()

Unnamed: 0,x,y,z,day,month,hour_y,street_y,road_type,jam_level
2508340,0.990932,0.099001,-0.090839,7,11,14,Trans Yogi,7,4
2892768,0.99176,0.087273,-0.093783,4,10,12,Trans Yogi,7,4
3350205,0.596603,0.596351,-0.537057,17,10,14,Dr Setiabudi,7,4
2457883,0.976888,0.213017,0.017714,2,11,19,Lambang Jaya,2,3
210920,0.969386,0.24461,0.021347,17,11,10,N1 Sultan Hasanudin,6,4


In [14]:
# Show the training target array.
y_train_val

array([1, 1, 1, ..., 1, 1, 1])

In [None]:
# Fit the preprocessing + RFE + logistic-regression pipeline on the training split.
logit_new_pipe.fit(X_train_val, y_train_val)

In [None]:
# Hold-out predictions with the default decision rule, and the per-class report.
y_pred = logit_new_pipe.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix for the default-rule predictions.
confusion_matrix(y_test, y_pred)

In [None]:
# ROC curve of the new pipeline on the hold-out split.
plot_roc_curve(logit_new_pipe, X_test, y_test)

In [None]:
# Persist the fitted pipeline to the file 'logit_new' in the working directory.
import joblib
joblib.dump(logit_new_pipe, 'logit_new')

In [None]:
# Second predict_proba column (taken here as the score for label 1) on the hold-out split.
y_prob1 = logit_new_pipe.predict_proba(X_test)[:,1]

In [None]:
# ROC points for the new pipeline; list the thresholds whose true-positive
# rate lies between 0.95 and 1 (bounds inclusive).
fpr, tpr, thr = roc_curve(y_test, y_prob1)
roc_df = pd.DataFrame({'fpr': fpr, 'tpr': tpr, 'thr': thr})
roc_df[roc_df['tpr'].between(0.95,1)]

In [None]:
# Author's log of thresholds tried, each with the "macro" score recorded for it:
# 0.889415 macro 0.67
# 0.875739 macro 0.67
# 0.797893 macro 0.74

# Cut-off currently in use (the last entry of the log above).
new_thr = 0.797893

# Label 1 only when the second predict_proba column exceeds the cut-off.
y_pred088 = (logit_new_pipe.predict_proba(X_test)[:, 1] > new_thr).astype(int)
print(classification_report(y_test, y_pred088))
display(confusion_matrix(y_test, y_pred088))

In [None]:
# final_logit = logit_new_pipe.fit(X, y)

In [None]:
# Load the raw test file. This rebinds `test`, which earlier in the notebook
# held the contents of new_test.csv.
test = pd.read_csv('data_test.csv')
test

In [None]:
# Look at the training columns again, as a reference for building the test features.
X_train_val.head()

In [None]:
import s2cell

In [None]:
def ids_to_df(df):
    """Turn an 'Ids' column into model features plus a lookup key.

    Each 'Ids' value is split on '_' into an S2 cell token, a date written
    as '<year>-<month>-<day>' and an hour. The token is converted to a
    (lat, lon) pair with ``s2cell.token_to_lat_lon``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column 'Ids'.

    Returns
    -------
    pandas.DataFrame
        Columns 'token_hour', 'x', 'y', 'z', 'day', 'month', 'hour_y',
        indexed like ``df``. 'token_hour' is '<token>_<hour>' with the hour
        exactly as written in the id; 'hour_y' is the same hour as an integer.

    Notes
    -----
    NOTE(review): np.cos/np.sin interpret their argument as radians. If
    ``token_to_lat_lon`` yields degrees, x/y/z are not true unit-sphere
    coordinates. The training features in this notebook are built the same
    way, so change both together or not at all.
    """
    # Split every id once and reuse the pieces.
    id_parts = df['Ids'].str.split('_')
    tokens = id_parts.apply(lambda parts: parts[0])
    date_parts = id_parts.apply(lambda parts: parts[1]).str.split('-')
    hour_text = id_parts.apply(lambda parts: parts[2])

    # One token -> (lat, lon) lookup per row; both coordinates come from it.
    lat_lon = tokens.apply(s2cell.token_to_lat_lon)
    lat = lat_lon.apply(lambda pair: pair[0])
    lon = lat_lon.apply(lambda pair: pair[1])
    cos_lat = np.cos(lat)

    return pd.DataFrame({
        # Built from the textual hour, before the integer cast.
        'token_hour': tokens + '_' + hour_text,
        'x': cos_lat * np.cos(lon),
        'y': cos_lat * np.sin(lon),
        'z': np.sin(lat),
        'day': date_parts.apply(lambda ymd: int(ymd[2])),
        'month': date_parts.apply(lambda ymd: int(ymd[1])),
        'hour_y': hour_text.astype('int'),
    })

In [None]:
# Build the feature frame for the raw test ids.
new_test = ids_to_df(test)

In [None]:
# Row count before the lookup merges below.
len(new_test)

In [None]:
# db_street
# db_road_type
# db_jam_level

In [None]:
# Left-join the street lookup onto the test features.
# No `on=` is given, so pandas joins on the column names the two frames share.
# NOTE(review): if db_street holds several rows per key the result has more
# rows than new_test - compare the lengths.
merge_street = pd.merge(new_test, db_street, how='left')

In [None]:
# Left-join the road-type lookup, again on the shared column names.
merge_road = pd.merge(merge_street, db_road_type, how='left')
merge_road

In [None]:
# Left-join the jam-level lookup, again on the shared column names.
merge_jam_level = pd.merge(merge_road, db_jam_level, how='left')
merge_jam_level

In [None]:
# Missing values per column after the three left joins
# (rows without a match in a lookup table show up here).
merge_jam_level.isna().sum()