In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV, train_test_split

from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from imblearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import classification_report

import datetime as dt

# Raise pandas' display limits so wide/long previews are not truncated.
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, 500)

In [2]:
# Load the three input tables (training labels, alerts, irregularities) from CSV.
train, alerts, irreg = (
    pd.read_csv(csv_path)
    for csv_path in ('data_train.csv', 'alerts.csv', 'irregularities.csv')
)

# Preprocessing: Alerts

In [3]:
# Preview the first rows of the training labels (columns: Unnamed: 0, Ids, Labels).
train.head()

Unnamed: 0,Ids,Labels
0,2e69e9384_2020-10-06_13,True
1,2e6992c7c_2020-10-02_17,True
2,2e69ef474_2020-09-13_19,True
3,2e69c5fd4_2020-10-10_15,True
4,2e6992134_2020-09-12_11,True


In [4]:
# Preview the last rows. `Ids` look like <token>_<YYYY-MM-DD>_<hour>, and the
# hour is not zero-padded (e.g. `..._2020-09-23_9`).
train.tail()

Unnamed: 0,Ids,Labels
71331,2e69eea5c_2020-11-09_10,False
71332,2e69c5944_2020-10-27_12,True
71333,2e69f2cd4_2020-11-07_14,True
71334,2e68e64e4_2020-09-23_9,False
71335,2e68e6084_2020-10-18_12,False


In [5]:
# Preview the raw alerts; `pub_millis` and `s2token_15` are the columns used
# further down to build the join key.
alerts.head()

Unnamed: 0,id,pub_millis,s2id_15,s2token_15,road_type,street,city,magvar,reliability,report_description,report_rating,confidence,type,subtype,report_by_municipality_user,n_thumbs_up,longitude,latitude
0,177876895,1603331480000,3344466888162803712,2e69eeea4,1,,Depok,0,9,,0,1,ROAD_CLOSED,ROAD_CLOSED_EVENT,,,106.788545,-6.359846
1,179156987,1604487892000,3344463130066419712,2e69eb7f4,6,N8 Jalan Raya Bogor,Depok,170,7,,1,1,JAM,JAM_HEAVY_TRAFFIC,,0.0,106.867141,-6.383855
2,181688703,1605666614000,3344367648648462336,2e6994a84,7,,Bekasi,0,8,,0,1,ROAD_CLOSED,ROAD_CLOSED_EVENT,,,106.921974,-6.379087
3,173055165,1601895721000,3344374458319110144,2e699ad9c,2,Flyover Tegal Gede,Cikarang,319,10,,5,0,WEATHERHAZARD,HAZARD_ON_ROAD_POT_HOLE,,,107.143656,-6.300441
4,173802602,1602464394000,3344466709921660928,2e69eec0c,2,Tanjakan Kembar,Depok,310,5,,3,0,JAM,JAM_HEAVY_TRAFFIC,,,106.79395,-6.365677


## Convert Milliseconds to Date and Time

In [6]:
# `pub_millis` is milliseconds since the Unix epoch. Convert it explicitly to
# Jakarta wall-clock time (UTC+7) instead of relying on the machine's local
# time zone, so the derived date/hour are the same wherever the notebook runs.
# The result is kept timezone-naive (datetime64[ns]).
alerts['pub_date'] = (
    pd.to_datetime(alerts['pub_millis'], unit='ms', utc=True)
    .dt.tz_convert('Asia/Jakarta')
    .dt.tz_localize(None)
)
alerts['pub_date'].head()

0   2020-10-22 08:51:20
1   2020-11-04 18:04:52
2   2020-11-18 09:30:14
3   2020-10-05 18:02:01
4   2020-10-12 07:59:54
Name: pub_date, dtype: datetime64[ns]

In [7]:
# Confirm that `pub_date` was appended as the last column.
alerts.head()

Unnamed: 0,id,pub_millis,s2id_15,s2token_15,road_type,street,city,magvar,reliability,report_description,report_rating,confidence,type,subtype,report_by_municipality_user,n_thumbs_up,longitude,latitude,pub_date
0,177876895,1603331480000,3344466888162803712,2e69eeea4,1,,Depok,0,9,,0,1,ROAD_CLOSED,ROAD_CLOSED_EVENT,,,106.788545,-6.359846,2020-10-22 08:51:20
1,179156987,1604487892000,3344463130066419712,2e69eb7f4,6,N8 Jalan Raya Bogor,Depok,170,7,,1,1,JAM,JAM_HEAVY_TRAFFIC,,0.0,106.867141,-6.383855,2020-11-04 18:04:52
2,181688703,1605666614000,3344367648648462336,2e6994a84,7,,Bekasi,0,8,,0,1,ROAD_CLOSED,ROAD_CLOSED_EVENT,,,106.921974,-6.379087,2020-11-18 09:30:14
3,173055165,1601895721000,3344374458319110144,2e699ad9c,2,Flyover Tegal Gede,Cikarang,319,10,,5,0,WEATHERHAZARD,HAZARD_ON_ROAD_POT_HOLE,,,107.143656,-6.300441,2020-10-05 18:02:01
4,173802602,1602464394000,3344466709921660928,2e69eec0c,2,Tanjakan Kembar,Depok,310,5,,3,0,JAM,JAM_HEAVY_TRAFFIC,,,106.79395,-6.365677,2020-10-12 07:59:54


In [8]:
# Spot-check the first three converted timestamps.
alerts['pub_date'].head(3)

0   2020-10-22 08:51:20
1   2020-11-04 18:04:52
2   2020-11-18 09:30:14
Name: pub_date, dtype: datetime64[ns]

## Split `dates` and `hour` into Two New Columns

In [9]:
# Calendar date of each alert as a 'YYYY-MM-DD' string.
alerts['dates'] = alerts['pub_date'].dt.strftime('%Y-%m-%d')

In [10]:
# Hour of day as a string without zero padding (8 -> '8').
alerts['hour'] = alerts['pub_date'].dt.hour.astype(str)

In [11]:
# Confirm that the `dates` and `hour` columns were added.
alerts.head()

Unnamed: 0,id,pub_millis,s2id_15,s2token_15,road_type,street,city,magvar,reliability,report_description,report_rating,confidence,type,subtype,report_by_municipality_user,n_thumbs_up,longitude,latitude,pub_date,dates,hour
0,177876895,1603331480000,3344466888162803712,2e69eeea4,1,,Depok,0,9,,0,1,ROAD_CLOSED,ROAD_CLOSED_EVENT,,,106.788545,-6.359846,2020-10-22 08:51:20,2020-10-22,8
1,179156987,1604487892000,3344463130066419712,2e69eb7f4,6,N8 Jalan Raya Bogor,Depok,170,7,,1,1,JAM,JAM_HEAVY_TRAFFIC,,0.0,106.867141,-6.383855,2020-11-04 18:04:52,2020-11-04,18
2,181688703,1605666614000,3344367648648462336,2e6994a84,7,,Bekasi,0,8,,0,1,ROAD_CLOSED,ROAD_CLOSED_EVENT,,,106.921974,-6.379087,2020-11-18 09:30:14,2020-11-18,9
3,173055165,1601895721000,3344374458319110144,2e699ad9c,2,Flyover Tegal Gede,Cikarang,319,10,,5,0,WEATHERHAZARD,HAZARD_ON_ROAD_POT_HOLE,,,107.143656,-6.300441,2020-10-05 18:02:01,2020-10-05,18
4,173802602,1602464394000,3344466709921660928,2e69eec0c,2,Tanjakan Kembar,Depok,310,5,,3,0,JAM,JAM_HEAVY_TRAFFIC,,,106.79395,-6.365677,2020-10-12 07:59:54,2020-10-12,7


## Combine `s2token_15`, `dates` and `hour` into ids for Merging with Train DF

In [12]:
# Join key in the form <s2token_15>_<dates>_<hour>.
alerts['ids'] = alerts['s2token_15'].str.cat(
    [alerts['dates'], alerts['hour']],
    sep='_',
)

In [16]:
# Confirm that the combined `ids` column was added.
alerts.head()

Unnamed: 0,id,pub_millis,s2id_15,s2token_15,road_type,street,city,magvar,reliability,report_description,report_rating,confidence,type,subtype,report_by_municipality_user,n_thumbs_up,longitude,latitude,pub_date,dates,hour,ids
0,177876895,1603331480000,3344466888162803712,2e69eeea4,1,,Depok,0,9,,0,1,ROAD_CLOSED,ROAD_CLOSED_EVENT,,,106.788545,-6.359846,2020-10-22 08:51:20,2020-10-22,8,2e69eeea4_2020-10-22_8
1,179156987,1604487892000,3344463130066419712,2e69eb7f4,6,N8 Jalan Raya Bogor,Depok,170,7,,1,1,JAM,JAM_HEAVY_TRAFFIC,,0.0,106.867141,-6.383855,2020-11-04 18:04:52,2020-11-04,18,2e69eb7f4_2020-11-04_18
2,181688703,1605666614000,3344367648648462336,2e6994a84,7,,Bekasi,0,8,,0,1,ROAD_CLOSED,ROAD_CLOSED_EVENT,,,106.921974,-6.379087,2020-11-18 09:30:14,2020-11-18,9,2e6994a84_2020-11-18_9
3,173055165,1601895721000,3344374458319110144,2e699ad9c,2,Flyover Tegal Gede,Cikarang,319,10,,5,0,WEATHERHAZARD,HAZARD_ON_ROAD_POT_HOLE,,,107.143656,-6.300441,2020-10-05 18:02:01,2020-10-05,18,2e699ad9c_2020-10-05_18
4,173802602,1602464394000,3344466709921660928,2e69eec0c,2,Tanjakan Kembar,Depok,310,5,,3,0,JAM,JAM_HEAVY_TRAFFIC,,,106.79395,-6.365677,2020-10-12 07:59:54,2020-10-12,7,2e69eec0c_2020-10-12_7


## Make a Function to Simplify the Process

In [20]:
def make_ids(df, col_millis, col_token, tz=None):
    """Build `<token>_<YYYY-MM-DD>_<hour>` ids from an epoch-millisecond column.

    The work is done on a copy of `df`, so the caller's frame is left
    untouched. Calling this several times on the same frame (e.g. once per
    timestamp column) therefore returns independent results rather than
    several names for one repeatedly overwritten object.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the timestamp and token columns.
    col_millis : str
        Name of the column with timestamps in milliseconds since the Unix epoch.
    col_token : str
        Name of the string column used as the id prefix.
    tz : datetime.tzinfo, optional
        Time zone used to turn each timestamp into a wall-clock date and hour.
        ``None`` (the default) uses the machine's local time zone, so the ids
        depend on where the notebook runs; pass an explicit zone such as
        ``dt.timezone(dt.timedelta(hours=7))`` for reproducible ids. When a
        zone is given, `pub_date` is timezone-aware.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` with `pub_date`, `dates`, `hour` and `ids` columns added.
    """
    out = df.copy()
    stamps = [dt.datetime.fromtimestamp(ms / 1000.0, tz=tz) for ms in out[col_millis]]
    out['pub_date'] = stamps
    out['dates'] = [stamp.strftime('%Y-%m-%d') for stamp in stamps]
    # The hour is intentionally not zero-padded (9 -> '9').
    out['hour'] = [str(stamp.hour) for stamp in stamps]
    out['ids'] = out[col_token] + '_' + out['dates'] + '_' + out['hour']
    return out

# Preprocessing `irregularities`

In [13]:
# Preview the raw irregularities; note the two timestamp columns
# (`detection_date_millis`, `update_date_millis`) and `s2token_center`.
irreg.head()

Unnamed: 0,id,detection_date_millis,update_date_millis,street,city,is_highway,line,s2id_center,s2token_center,speed,regular_speed,delay_seconds,seconds,length,trend,type,severity,jam_level,drivers_count,alerts_count,n_thumbs_up
0,12868069,1604733149024,1604735467276,Jatiwaringin Raya,Bekasi,t,"{""line"": [{""x"": 106.91014, ""y"": -6.258107}, {""...",3344471185277583360,2e69f2d2c,13.03,17.15,299,432,1566,0,Small,5,3,13,0,0
1,12420463,1599906813144,1599909295834,Putri Tunggal,Depok,f,"{""line"": [{""x"": 106.887821, ""y"": -6.377016}, {...",3344462996922433536,2e69eb604,6.56,18.0,399,539,984,0,Small,5,3,5,0,0
2,12497533,1601728355356,1601734996933,Ir Haji Juanda,Bandung,f,"{""line"": [{""x"": 107.618629, ""y"": -6.87556}, {""...",3344176694402482176,2e68e6fc4,3.36,19.65,1185,1294,1212,1,Large,5,4,21,2,0
3,12536831,1602312860279,1602315706305,KH Muchtar Tabrani,Bekasi,f,"{""line"": [{""x"": 107.002934, ""y"": -6.216088}, {...",3344358143885836288,2e698c034,4.36,8.04,467,543,659,-1,Small,5,4,3,0,0
4,12327151,1598956623240,1598957378934,N1 Pangeran Diponegoro,Tambun Selatan,t,"{""line"": [{""x"": 107.035652, ""y"": -6.255471}, {...",3344360723013697536,2e698e5bc,4.74,16.71,423,474,625,0,Small,5,4,11,0,0


In [22]:
# Ids keyed on the time each irregularity was last updated.
irreg_up = make_ids(
    df=irreg,
    col_millis='update_date_millis',
    col_token='s2token_center',
)

In [23]:
# Ids keyed on the time each irregularity was first detected.
irreg_det = make_ids(
    df=irreg,
    col_millis='detection_date_millis',
    col_token='s2token_center',
)

# Joining