In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV, train_test_split, StratifiedKFold
from sklearn.calibration import CalibratedClassifierCV
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, plot_precision_recall_curve, plot_roc_curve, roc_curve, confusion_matrix
from sklearn.feature_selection import RFE

from sklearn.preprocessing import StandardScaler, PolynomialFeatures, KBinsDiscretizer
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import classification_report

import datetime as dt

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

## Continue modelling

In [None]:
ready = pd.read_csv('new_ready.csv').drop(['lat', 'lon'],axis=1)
ready

In [None]:
test = pd.read_csv('new_test.csv')
test

In [None]:
# Columns fed to the model and the target column.
features = ['x', 'y', 'z', 'day', 'month', 'hour_y']
label = 'Labels'

X, y = ready[features], ready[label]

# Stratified 80/20 hold-out; the 20% part is kept aside as X_test / y_test.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=.2,
    stratify=y,
    random_state=24,
)

In [None]:
# Ordinal, equal-width, 3-bin discretiser used for the day and hour columns.
kbins = KBinsDiscretizer(strategy='uniform', encode='ordinal', n_bins=3)

# Column-wise preprocessing; columns not named here are passed through as-is.
transformer = ColumnTransformer(
    transformers=[
        ('poly', PolynomialFeatures(degree=2, include_bias=False), ['x', 'y', 'z']),
        ('kbins', kbins, ['day', 'hour_y']),
    ],
    remainder='passthrough',
)

# Preprocess -> SMOTE oversampling -> logistic regression, chained with the
# imblearn Pipeline imported at the top of the notebook.
logit_smo = Pipeline(
    steps=[
        ('trans', transformer),
        ('smote', SMOTE(random_state=2020)),
        ('clf', LogisticRegression(random_state=2020)),
    ]
)

In [None]:
param_space = {
    'trans__poly__degree': [2,3],
    'smote__k_neighbors': [3,4,5],
    'clf__C': [0.01,0.1,1],
    'clf__solver': ['lbfgs', 'liblinear']
}

In [None]:
skf = StratifiedKFold()

logit_grid = GridSearchCV(
    logit_smo,
    param_grid=param_space,
    scoring='f1',
    cv = skf,
    n_jobs=-1,
    verbose=2
 )

In [None]:
# Run the search. The following cells read `best_params_` and `best_estimator_`,
# which only exist once the GridSearchCV has been fitted, so leaving this
# commented out breaks Restart Kernel -> Run All.
logit_grid.fit(X_train_val, y_train_val)

In [None]:
logit_grid.best_params_

In [None]:
logit_tuned = logit_grid.best_estimator_

In [None]:
logit_tuned.fit(X_train_val, y_train_val)

In [None]:
logit_smo.fit(X_train_val, y_train_val)

In [None]:
logit_tuned_pred = logit_tuned.predict(X_test)
print(classification_report(y_test, logit_tuned_pred))

In [None]:
logit_pred = logit_smo.predict(X_test)
print(classification_report(y_test, logit_pred))

In [None]:
plot_roc_curve(logit_tuned, X_test, y_test)

In [None]:
logit_prob1 = logit_tuned.predict_proba(X_test)[:,1]
fpr, tpr, thr = roc_curve(y_test, logit_prob1)

In [None]:
roc_df = pd.DataFrame({'fpr': fpr, 'tpr': tpr, 'thr': thr})
roc_df[roc_df['tpr'].between(0.8, 0.9)]

In [None]:
# Classify with a custom cut-off on the positive-class probability instead of
# the default decision rule of `predict`.
new_thr = 0.371134
y_pred037 = (logit_tuned.predict_proba(X_test)[:, 1] > new_thr).astype(int)
print(classification_report(y_test, y_pred037))

In [None]:
confusion_matrix(y_test, y_pred037)

## VotingClassifier

In [None]:
# DecisionTreeClassifier
# Seeded like the other estimators in this notebook so that repeated runs of
# the grid search below compare the same trees.
dtc = DecisionTreeClassifier(max_depth=10, min_samples_split=100, random_state=2020)
# dtc_cal = CalibratedClassifierCV(dtc, method='isotonic')

# GaussianNB
gnb = GaussianNB()

# VotingClassifier: soft vote over the tuned logit pipeline, the tree and NB.
vclf = VotingClassifier(
    [('logit', logit_tuned), ('dtc', dtc), ('gaussian', gnb)],
    voting='soft',
    n_jobs=-1,
)

In [None]:
# `dtc_cal` is only ever assigned in commented-out lines, so evaluating it
# raised NameError on a fresh kernel. Show the ensemble that is actually built.
vclf

In [None]:
param_space_vclf = {
    'dtc__max_depth': [10,20],
    'dtc__min_samples_split': [100, 200],
}

In [None]:
vclf_grid = GridSearchCV(
    vclf,
    param_grid=param_space_vclf,
    scoring='f1',
    cv = skf,
    n_jobs=-1,
    verbose=2
 )

vclf_grid.fit(X_train_val, y_train_val)

In [None]:
vclf_grid.best_params_

In [None]:
# DecisionTreeClassifier
# Seeded like the other estimators in this notebook so the reported metrics
# and the submission can be reproduced.
dtc_tuned = DecisionTreeClassifier(
    max_depth=20,
    min_samples_split=100,
    class_weight='balanced',
    random_state=2020,
)
# dtc_cal = CalibratedClassifierCV(dtc, method='isotonic')

# GaussianNB
gnb = GaussianNB()

# VotingClassifier: soft vote over the tuned logit pipeline, the tree and NB.
vclf_tuned = VotingClassifier(
    [('logit', logit_tuned), ('dtc', dtc_tuned), ('gaussian', gnb)],
    voting='soft',
    n_jobs=-1,
)

In [None]:
vclf_tuned.fit(X_train_val, y_train_val)
print('done fitting!')

In [None]:
vclf_pred_tuned = vclf_tuned.predict(X_test)
print(classification_report(y_test, vclf_pred_tuned))
display(confusion_matrix(y_test, vclf_pred_tuned))

In [None]:
plot_roc_curve(vclf_tuned, X_test, y_test)

In [None]:
fpr, tpr, thr = roc_curve(y_test, vclf_tuned.predict_proba(X_test)[:,1])
roc_df = pd.DataFrame({'fpr': fpr, 'tpr': tpr, 'thr': thr})
roc_df[roc_df['tpr'].between(0.95, 1.0)].sort_values('fpr').head()

In [None]:
new_thr = 0.567104
y_pred0567 = np.where(vclf_tuned.predict_proba(X_test)[:,1]>new_thr,1,0)


print(classification_report(y_test, y_pred0567))
display(confusion_matrix(y_test, y_pred0567))

In [None]:
# Refit on every labelled row before scoring the held-out competition file.
vclf_tuned.fit(X, y)

# Pass exactly the training columns, in training order. `test` is later used
# with its `Ids` column, and the tree / naive-Bayes members receive the frame
# untransformed, so any extra column would reach them directly.
# NOTE(review): this uses the default decision rule of `predict`, not the
# `new_thr` cut-off explored in the previous cell — confirm that is intended.
submit = vclf_tuned.predict(test[features])

In [None]:
len(submit)

In [None]:
test_submit = pd.read_csv('data_test.csv')
test_submit['Labels'] = submit

test_submit.head()

In [None]:
test_submit.to_csv('submit2.csv', index=False)

## Try another technique

In [None]:
ready_clean = pd.read_csv('ready_clean.csv')
db_street = pd.read_csv('db_street.csv')
db_road_type = pd.read_csv('db_road_type.csv')
db_jam_level = pd.read_csv('db_jam_level.csv')
ready_clean

In [None]:
ready = pd.read_csv('ready.csv')

In [None]:
def ids_to_df(df):
    """Build the model feature frame from the ``Ids`` column of ``df``.

    Each id is read as ``<s2 token>_<YYYY-MM-DD>_<hour>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column ``Ids``.

    Returns
    -------
    pandas.DataFrame
        Columns ``x, y, z, day, month, hour`` (in that order), indexed like
        ``df``.

    Notes
    -----
    A later cell of this notebook redefines ``ids_to_df`` (it returns
    ``token_hour`` and ``hour_y`` instead of ``hour``); once that cell has run
    it replaces this definition.
    """
    # Imported locally: the notebook-level `import s2cell` sits in a later
    # cell, so on a fresh kernel the module is not yet available here.
    import s2cell

    # Split every id once; the three pieces are reused below.
    id_parts = df['Ids'].str.split('_')
    token = id_parts.apply(lambda parts: parts[0])
    date_parts = id_parts.apply(lambda parts: parts[1]).str.split('-')
    hour_text = id_parts.apply(lambda parts: parts[2])

    # One cell lookup per row, then unpack the (lat, lon) pair.
    lat_lon = token.apply(s2cell.token_to_lat_lon)
    lat = lat_lon.apply(lambda pair: pair[0])
    lon = lat_lon.apply(lambda pair: pair[1])

    # NOTE(review): np.cos / np.sin expect radians. If token_to_lat_lon yields
    # degrees these are not true unit-sphere coordinates. Left unchanged because
    # the training frames in this notebook are built with the same expressions.
    out = pd.DataFrame(index=df.index)
    out['x'] = np.cos(lat) * np.cos(lon)
    out['y'] = np.cos(lat) * np.sin(lon)
    out['z'] = np.sin(lat)
    out['day'] = date_parts.apply(lambda ymd: int(ymd[2]))
    out['month'] = date_parts.apply(lambda ymd: int(ymd[1]))
    out['hour'] = hour_text.astype('int')
    return out

In [None]:
# Derive x / y / z from lat / lon, then drop the raw coordinates.
# Guarded and non-mutating so that re-running the cell is a no-op instead of a
# KeyError on the already-dropped 'lat' / 'lon' columns.
if {'lat', 'lon'}.issubset(ready_clean.columns):
    ready_clean = ready_clean.assign(
        x=np.cos(ready_clean['lat']) * np.cos(ready_clean['lon']),
        y=np.cos(ready_clean['lat']) * np.sin(ready_clean['lon']),
        z=np.sin(ready_clean['lat']),
    ).drop(columns=['lat', 'lon'])

In [None]:
# Non-mutating drop that tolerates a re-run (the column is already gone then).
ready_clean = ready_clean.drop(columns='s2token_15', errors='ignore')

In [None]:
ready_clean.head()

In [None]:
ready_clean.isna().sum()

In [None]:
# binary
ready_clean['street_y'].nunique()

In [None]:
import category_encoders as ce

In [None]:
# Column-wise preprocessing; columns not named here are passed through as-is.
transformer = ColumnTransformer(
    transformers=[
        ('binary', ce.BinaryEncoder(), ['street_y']),
        ('poly', PolynomialFeatures(degree=3), ['x', 'y', 'z']),
        (
            'kbins',
            KBinsDiscretizer(strategy='uniform', encode='ordinal', n_bins=3),
            ['day', 'hour_y'],
        ),
    ],
    remainder='passthrough',
)

# The RFE selector and the final classifier use the same logistic settings.
logit_settings = dict(
    C=0.01,
    solver='liblinear',
    random_state=2020,
    verbose=1,
    class_weight='balanced',
)

# Preprocess -> recursive feature elimination -> logistic regression.
logit_new_pipe = Pipeline(
    steps=[
        ('trans', transformer),
        ('fs', RFE(LogisticRegression(**logit_settings))),
        ('clf', LogisticRegression(**logit_settings)),
    ]
)

In [None]:
X = ready_clean['x y z day month hour_y street_y road_type jam_level'.split()]
y = np.where(ready_clean['Labels']==True,1,0)

In [None]:
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, stratify=y, test_size=.2, random_state=2020)

In [None]:
X_train_val.head()

In [None]:
y_train_val

In [None]:
logit_new_pipe.fit(X_train_val, y_train_val)

In [None]:
import joblib
joblib.dump(logit_new_pipe, 'logit_new')

In [None]:
logit_new_pipe

In [None]:
y_pred = logit_new_pipe.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
confusion_matrix(y_test, y_pred)

In [None]:
plot_roc_curve(logit_new_pipe, X_test, y_test)

In [None]:
import joblib
# joblib.dump(logit_new_pipe, 'logit_new')

In [None]:
logit_new = joblib.load('logit_new')

In [None]:
logit_new

In [None]:
y_prob1 = logit_new.predict_proba(X_test)[:,1]

In [None]:
fpr, tpr, thr = roc_curve(y_test, y_prob1)
roc_df = pd.DataFrame({'fpr': fpr, 'tpr': tpr, 'thr': thr})
roc_df[roc_df['tpr'].between(0.95,1)]

In [None]:
# 0.889415 macro 0.67
# 0.875739 macro 0.67
# 0.797893 macro 0.74

new_thr = 0.206125

y_pred088 = np.where(logit_new.predict_proba(X_test)[:,1]>new_thr,1,0)
print(classification_report(y_test, y_pred088))
display(confusion_matrix(y_test, y_pred088))

In [None]:
final_logit = logit_new.fit(X, y)

In [None]:
logit_new

In [None]:
final_logit

In [None]:
new_test = pd.read_csv('test_set.csv')
new_test

In [None]:
X_train_val.head()

In [None]:
import s2cell

In [None]:
def ids_to_df(df):
    """Turn the ``Ids`` column of ``df`` into the feature frame used for scoring.

    Each id is read as ``<s2 token>_<YYYY-MM-DD>_<hour>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column ``Ids``.

    Returns
    -------
    pandas.DataFrame
        Columns ``token_hour, x, y, z, day, month, hour_y`` (in that order),
        indexed like ``df``. ``token_hour`` is ``"<s2 token>_<hour>"`` built
        from the hour text exactly as it appears in the id; ``hour_y`` is that
        hour cast to int.

    Notes
    -----
    This redefinition replaces the earlier ``ids_to_df`` in this notebook.
    """
    # Split every id once; the three pieces are reused below.
    id_parts = df['Ids'].str.split('_')
    token = id_parts.apply(lambda parts: parts[0])
    date_parts = id_parts.apply(lambda parts: parts[1]).str.split('-')
    hour_text = id_parts.apply(lambda parts: parts[2])

    # One cell lookup per row, then unpack the (lat, lon) pair.
    lat_lon = token.apply(s2cell.token_to_lat_lon)
    lat = lat_lon.apply(lambda pair: pair[0])
    lon = lat_lon.apply(lambda pair: pair[1])

    # NOTE(review): np.cos / np.sin expect radians. If token_to_lat_lon yields
    # degrees these are not true unit-sphere coordinates. Left unchanged because
    # the training frames in this notebook are built with the same expressions.
    out = pd.DataFrame(index=df.index)
    out['token_hour'] = token + '_' + hour_text
    out['x'] = np.cos(lat) * np.cos(lon)
    out['y'] = np.cos(lat) * np.sin(lon)
    out['z'] = np.sin(lat)
    out['day'] = date_parts.apply(lambda ymd: int(ymd[2]))
    out['month'] = date_parts.apply(lambda ymd: int(ymd[1]))
    out['hour_y'] = hour_text.astype('int')
    return out

In [None]:
new_test = ids_to_df(test)

In [None]:
len(new_test)

In [None]:
# db_street
# db_road_type
# db_jam_level

In [None]:
# Left-join the street lookup onto the test features so every test row is kept.
# NOTE(review): no `on=` is given, so the join keys are whatever column names
# the two frames share — presumably `token_hour`; confirm against db_street.
merge_street = pd.merge(new_test, db_street, how='left')

In [None]:
merge_road = pd.merge(merge_street, db_road_type, how='left')
merge_road

In [None]:
merge_jam_level = pd.merge(merge_road, db_jam_level, how='left')
merge_jam_level

In [None]:
merge_jam_level.isna().sum()/len(merge_jam_level)*100

In [None]:
merge_jam_level[['street_y', 'road_type', 'jam_level']].describe(include='all')

In [None]:
# Fill lookup misses with the fixed fallback values.
# Calling `fillna(..., inplace=True)` on a column selection is chained
# assignment: it warns on recent pandas and leaves the frame untouched under
# Copy-on-Write, so fill on the frame itself and rebind the name.
merge_jam_level = merge_jam_level.fillna(
    {'street_y': 'Cibarusah Raya', 'road_type': 2, 'jam_level': 3}
)

In [None]:
bismillah = merge_jam_level.drop(['Unnamed: 0', 's2token_15'], axis=1)

In [None]:
bismillah

In [None]:
X_train_val.head()

In [None]:
new_thr = 0.206125

# Score only the columns the pipeline was fitted on, in the fitted order.
# `bismillah` also carries `token_hour` (and `Labels` once this cell has run),
# so passing the whole frame depends on the transformer tolerating extra
# columns and makes the cell fragile on re-run.
model_cols = 'x y z day month hour_y street_y road_type jam_level'.split()

bismillah['Labels'] = logit_new.predict_proba(bismillah[model_cols])[:, 1] > new_thr

In [None]:
bismillah

In [None]:
data_test = pd.read_csv('data_test.csv')

In [None]:
# Both frames must describe the same rows in the same order. Fail loudly rather
# than let index alignment silently misplace labels if one of the left-merges
# above changed the row count.
assert len(bismillah) == len(data_test), 'prediction rows do not match data_test rows'
data_test['Labels'] = bismillah['Labels'].to_numpy()

In [None]:
data_test.to_csv('submit3.csv', index=False)

In [None]:
data_test

## Using XGBoost

In [None]:
from xgboost.sklearn import XGBClassifier
from sklearn.feature_selection import SequentialFeatureSelector

In [None]:
import sklearn

In [None]:
sklearn.__version__

In [None]:
# Column-wise preprocessing; columns not named here are passed through as-is.
transformer = ColumnTransformer(
    transformers=[
        ('poly', PolynomialFeatures(degree=3), ['x', 'y', 'z', 'speed_decrease_%']),
        (
            'kbins',
            KBinsDiscretizer(strategy='uniform', encode='ordinal', n_bins=3),
            ['day', 'hour_y'],
        ),
    ],
    remainder='passthrough',
)

# Preprocess -> tree-driven recursive feature elimination -> XGBoost classifier.
xgb_pipe = Pipeline(
    steps=[
        ('trans', transformer),
        (
            'fs',
            RFE(
                DecisionTreeClassifier(
                    max_depth=200,
                    min_samples_split=100,
                    max_features='sqrt',
                    random_state=2020,
                )
            ),
        ),
        (
            'clf',
            XGBClassifier(
                n_estimators=100,
                use_label_encoder=False,
                max_depth=200,
                learning_rate=0.1,
                verbosity=2,
                booster='gbtree',
                n_jobs=-1,
            ),
        ),
    ]
)

In [None]:
# Derive x / y / z from lat / lon, then drop the raw coordinates.
# Guarded and non-mutating so that re-running the cell is a no-op instead of a
# KeyError on the already-dropped 'lat' / 'lon' columns.
if {'lat', 'lon'}.issubset(ready.columns):
    ready = ready.assign(
        x=np.cos(ready['lat']) * np.cos(ready['lon']),
        y=np.cos(ready['lat']) * np.sin(ready['lon']),
        z=np.sin(ready['lat']),
    ).drop(columns=['lat', 'lon'])

In [None]:
ready

In [None]:
ready.rename(columns={'speed_decreasement_%':'speed_decrease_%'}, inplace=True)

In [None]:
ready.duplicated('s2token_15').sum()

In [93]:
ready.to_csv('ready_speed.csv')

In [None]:
X = ready['x y z day month hour_y road_type jam_level speed_decrease_%'.split()]
y = np.where(ready['Labels']==True,1,0)

In [None]:
X_train_val, X_test, y_train_val, y_test = train_test_split(X,y,test_size=.2,stratify=y,random_state=2021)

In [None]:
xgb_pipe.fit(X_train_val, y_train_val)