# memprediksi lalu lintas (total record)

### Import Library

In [21]:
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

### Import Data

In [22]:
df = pd.read_csv("aggregate_alerts_Kota Bogor.csv")
df.head()

Unnamed: 0,id,time,kemendagri_kabupaten_kode,kemendagri_kabupaten_nama,street,type,avg_location,total_records,date
0,5930177,2022-07-06 00:00:00.000,32.71,KOTA BOGOR,KH Soleh Iskandar (Jalur Lambat),ROAD_CLOSED,"[106.80051600000002, -6.561997000000004]",61,2022-07-06
1,5930178,2022-07-06 00:00:00.000,32.71,KOTA BOGOR,,WEATHERHAZARD,"[106.80159300000008, -6.555780000000009]",61,2022-07-06
2,5930179,2022-07-06 01:00:00.000,32.71,KOTA BOGOR,KH Soleh Iskandar (Jalur Lambat),ROAD_CLOSED,"[106.80051600000002, -6.561997000000004]",60,2022-07-06
3,5930180,2022-07-06 01:00:00.000,32.71,KOTA BOGOR,,WEATHERHAZARD,"[106.80159300000008, -6.555780000000009]",60,2022-07-06
4,5930181,2022-07-06 02:00:00.000,32.71,KOTA BOGOR,KH Soleh Iskandar (Jalur Lambat),ROAD_CLOSED,"[106.80051600000002, -6.561997000000004]",60,2022-07-06


### Data Understanding

Variabel -  variabel yang terdapat pada dataset ini sebagai berikut :
1. id : ID Baris
2. time : waktu (tiap jam)
3. kemendagri_kabupaten_kode : kode kota kemendagri
4. kemendagri_kabupaten_nama : nama kota kemendagri
5. street : nama jalan
6. type : tipe kemacetan
7. avg_location : Tipe data geometri (data spasial)
8. total_records : Total data yang direkam dalam waktu tertentu



In [26]:
df.info ()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9762 entries, 0 to 9761
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         9762 non-null   int64  
 1   time                       9762 non-null   object 
 2   kemendagri_kabupaten_kode  9762 non-null   float64
 3   kemendagri_kabupaten_nama  9762 non-null   object 
 4   street                     8827 non-null   object 
 5   type                       9762 non-null   object 
 6   avg_location               9762 non-null   object 
 7   total_records              9762 non-null   int64  
 8   date                       9762 non-null   object 
dtypes: float64(1), int64(2), object(6)
memory usage: 686.5+ KB


### Data Cleansing

In [25]:
total = df.isnull().sum().sort_values(ascending=False)
percent = (df.isnull().sum()/df.shape[0]).sort_values(ascending=False)
missing_data = pd.concat([total,percent], axis=1, keys=['Total','Percent'])
missing_data

Unnamed: 0,Total,Percent
street,935,0.09578
id,0,0.0
time,0,0.0
kemendagri_kabupaten_kode,0,0.0
kemendagri_kabupaten_nama,0,0.0
type,0,0.0
avg_location,0,0.0
total_records,0,0.0
date,0,0.0


In [6]:
# mengecek street kosong dari kelompok avg_location
groupby_street = df.groupby('avg_location')['street'].unique().reset_index(name='street_name')
groupby_street.to_csv('groupby_street.csv', index=False)

Dikarenakan `avg_location` pada kolom `street` yang kosong tidak memiliki hubungan kesamaan `avg_location` lain yang kolom `street` terisi, maka row yang berisi `street` kosong didrop.

In [7]:
# drop missing values
df = df.dropna()

## Data Preprocessing

In [8]:
# time processing
df['time'] =  pd.to_datetime(df['time'])
days = {0:'Monday', 1:'Tuesday', 2:'Wednesday', 3:'Thursday', 4:'Friday', 5:'Saturday', 6:'Sunday'}

df['hour'] = df['time'].dt.hour
df['date'] = df['time'].dt.day
df['day_of_week'] = df['time'].dt.dayofweek
df.head(5)

Unnamed: 0,id,time,kemendagri_kabupaten_kode,kemendagri_kabupaten_nama,street,type,avg_location,total_records,date,hour,day_of_week
0,5930177,2022-07-06 00:00:00,32.71,KOTA BOGOR,KH Soleh Iskandar (Jalur Lambat),ROAD_CLOSED,"[106.80051600000002, -6.561997000000004]",61,6,0,2
2,5930179,2022-07-06 01:00:00,32.71,KOTA BOGOR,KH Soleh Iskandar (Jalur Lambat),ROAD_CLOSED,"[106.80051600000002, -6.561997000000004]",60,6,1,2
4,5930181,2022-07-06 02:00:00,32.71,KOTA BOGOR,KH Soleh Iskandar (Jalur Lambat),ROAD_CLOSED,"[106.80051600000002, -6.561997000000004]",60,6,2,2
6,5930183,2022-07-06 03:00:00,32.71,KOTA BOGOR,KH Soleh Iskandar (Jalur Lambat),ROAD_CLOSED,"[106.80051600000002, -6.561997000000004]",60,6,3,2
9,5930186,2022-07-06 04:00:00,32.71,KOTA BOGOR,Tol Jagorawi,WEATHERHAZARD,"[106.83445399999998, -6.590205999999997]",26,6,4,2


In [9]:
# standardscaler untuk street dan type
le_street = preprocessing.LabelEncoder()
df['street'] = le_street.fit_transform(df.street.values)

le_type = preprocessing.LabelEncoder()
df['type'] = le_street.fit_transform(df.type.values)
df.head()

Unnamed: 0,id,time,kemendagri_kabupaten_kode,kemendagri_kabupaten_nama,street,type,avg_location,total_records,date,hour,day_of_week
0,5930177,2022-07-06 00:00:00,32.71,KOTA BOGOR,107,2,"[106.80051600000002, -6.561997000000004]",61,6,0,2
2,5930179,2022-07-06 01:00:00,32.71,KOTA BOGOR,107,2,"[106.80051600000002, -6.561997000000004]",60,6,1,2
4,5930181,2022-07-06 02:00:00,32.71,KOTA BOGOR,107,2,"[106.80051600000002, -6.561997000000004]",60,6,2,2
6,5930183,2022-07-06 03:00:00,32.71,KOTA BOGOR,107,2,"[106.80051600000002, -6.561997000000004]",60,6,3,2
9,5930186,2022-07-06 04:00:00,32.71,KOTA BOGOR,223,3,"[106.83445399999998, -6.590205999999997]",26,6,4,2


In [10]:
# feature (input) : kemendagri_kabupaten_kode, street, hour, day_of_week
# output prediksi : tipe kondisi jalan (type)
X = df[["kemendagri_kabupaten_kode","street","hour","day_of_week"]]
y = df["type"]

In [11]:
# split data latih dan data uji
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Modelling
*supervised learning*, *classification*

In [12]:
## Hyperparameter Tuning for RandomForestClassifier
RF = RandomForestClassifier()

grid_param = {
    "n_estimators" : [90,100,120,130],
    'criterion' : ['gini','entropy'],
    'max_depth' : range(10,20,2),
    'max_features' : ['auto','log2']
}
grid_search = GridSearchCV(estimator=RF,param_grid=grid_param,cv=5,n_jobs=-1,verbose=3)
grid_search.fit(X_train,y_train)

Fitting 5 folds for each of 80 candidates, totalling 400 fits


In [13]:
grid_search.best_params_

{'criterion': 'gini',
 'max_depth': 10,
 'max_features': 'log2',
 'n_estimators': 120}

In [20]:
# Training model
RF = RandomForestClassifier(criterion='gini',max_depth=10,
                            max_features='auto',n_estimators=100)
RF.fit(X_train,y_train)

  warn(


In [15]:
# Performa model
print("Accuracy score training set:",accuracy_score(y_train,RF.predict(X_train)))

Accuracy score training set: 0.8678657413964028


### Evaluation

In [18]:
print("Accuracy score testing set:",accuracy_score(y_test,RF.predict(X_test)))

Accuracy score testing set: 0.8329558323895809


In [19]:
print("Classification report:")
print(classification_report(y_test, RF.predict(X_test)))

Classification report:
              precision    recall  f1-score   support

           0       0.50      0.04      0.07        27
           1       0.83      0.93      0.88      1080
           2       0.97      0.98      0.97       270
           3       0.71      0.53      0.61       389

    accuracy                           0.83      1766
   macro avg       0.75      0.62      0.63      1766
weighted avg       0.82      0.83      0.82      1766

