In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the source table into `data`. The path is relative, so it is resolved
# against the notebook's current working directory.
CSV_PATH = 'jumlah_kendaraan_bermotor.csv'
data = pd.read_csv(CSV_PATH)

In [4]:
# Preview 8 randomly drawn rows. No random_state is passed, so a different
# subset is shown on every run.
data.sample(8)

Unnamed: 0,id,kode_provinsi,nama_provinsi,kode_kabupaten_kota,nama_kabupaten_kota,cabang_pelayanan,jumlah_kendaraan,satuan,tahun
63,64,32,JAWA BARAT,3276,KOTA DEPOK,KOTA DEPOK I,440241,UNIT,2014
78,79,32,JAWA BARAT,3208,KABUPATEN KUNINGAN,KABUPATEN KUNINGAN,207374,UNIT,2015
231,232,32,JAWA BARAT,3274,KOTA CIREBON,KOTA CIREBON,189435,UNIT,2019
119,120,32,JAWA BARAT,3213,KABUPATEN SUBANG,KABUPATEN SUBANG,243474,UNIT,2016
207,208,32,JAWA BARAT,3203,KABUPATEN CIANJUR,KABUPATEN CIANJUR,465861,UNIT,2019
96,97,32,JAWA BARAT,3275,KOTA BEKASI,KOTA BEKASI,947892,UNIT,2015
114,115,32,JAWA BARAT,3209,KABUPATEN CIREBON,KABUPATEN CIREBON II CLDG,144764,UNIT,2016
115,116,32,JAWA BARAT,3210,KABUPATEN MAJALENGKA,KABUPATEN MAJALENGKA,247004,UNIT,2016


In [5]:
# Print the column names, non-null counts, dtypes and memory usage of `data`.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 272 entries, 0 to 271
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   id                   272 non-null    int64 
 1   kode_provinsi        272 non-null    int64 
 2   nama_provinsi        272 non-null    object
 3   kode_kabupaten_kota  272 non-null    int64 
 4   nama_kabupaten_kota  272 non-null    object
 5   cabang_pelayanan     272 non-null    object
 6   jumlah_kendaraan     272 non-null    int64 
 7   satuan               272 non-null    object
 8   tahun                272 non-null    int64 
dtypes: int64(5), object(4)
memory usage: 19.2+ KB


In [6]:
# Cast the year column to strings so the years are handled as discrete labels
# (the single-row query built further down also supplies the year as a string).
data['tahun'] = data['tahun'].astype(str)

In [7]:
# List the distinct year labels currently stored in the column.
data['tahun'].unique()

array(['2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020'],
      dtype=object)

In [8]:
from sklearn.preprocessing import LabelEncoder
# Fit the year encoder on the string labels, then overwrite the column with
# the integer codes it assigns. The fitted encoder stays in `LE_tahun` so the
# same label -> code mapping can be applied to new inputs later.
LE_tahun = LabelEncoder().fit(data['tahun'])
data['tahun'] = LE_tahun.transform(data['tahun'])
data['tahun'].unique()

array([0, 1, 2, 3, 4, 5, 6, 7])

In [9]:
# List the distinct regency/city names before they are label-encoded.
data['nama_kabupaten_kota'].unique()

array(['KABUPATEN BOGOR', 'KABUPATEN SUKABUMI', 'KABUPATEN CIANJUR',
       'KABUPATEN BANDUNG', 'KABUPATEN GARUT', 'KABUPATEN TASIKMALAYA',
       'KABUPATEN CIAMIS', 'KABUPATEN KUNINGAN', 'KABUPATEN CIREBON',
       'KABUPATEN MAJALENGKA', 'KABUPATEN SUMEDANG',
       'KABUPATEN INDRAMAYU', 'KABUPATEN SUBANG', 'KABUPATEN PURWAKARTA',
       'KABUPATEN KARAWANG', 'KABUPATEN BEKASI',
       'KABUPATEN BANDUNG BARAT', 'KOTA BOGOR', 'KOTA SUKABUMI',
       'KOTA BANDUNG', 'KOTA CIREBON', 'KOTA BEKASI', 'KOTA DEPOK',
       'KOTA CIMAHI', 'KOTA TASIKMALAYA', 'KOTA BANJAR',
       'KABUPATEN PANGANDARAN'], dtype=object)

In [10]:
# Learn one integer code per regency/city name and replace the text column
# with those codes. `LE_kabupaten` keeps the mapping for later reuse.
LE_kabupaten = LabelEncoder()
kabupaten_codes = LE_kabupaten.fit_transform(data['nama_kabupaten_kota'])
data['nama_kabupaten_kota'] = kabupaten_codes
data['nama_kabupaten_kota'].unique()

array([ 3, 15,  5,  0,  7, 17,  4, 10,  6, 11, 16,  8, 14, 13,  9,  2,  1,
       21, 25, 18, 23, 20, 24, 22, 26, 19, 12])

In [11]:
# Model inputs: encoded regency/city and encoded year. Target: vehicle count.
# NOTE(review): `cabang_pelayanan` is not part of the features, so rows that
# differ only by service branch end up with identical (kabupaten, tahun)
# inputs but different targets -- confirm this is intended.
FEATURE_COLUMNS = ['nama_kabupaten_kota', 'tahun']
TARGET_COLUMN = 'jumlah_kendaraan'

x = data[FEATURE_COLUMNS]
y = data[TARGET_COLUMN]

In [12]:
from sklearn.linear_model import LinearRegression
# Baseline: ordinary least squares on the two encoded columns.
# NOTE(review): `nama_kabupaten_kota` enters as an integer label code, so the
# linear model treats those codes as an ordered numeric quantity.
linreg = LinearRegression()
linreg.fit(X=x, y=y)

LinearRegression()

In [13]:
# In-sample predictions: `x` is the same frame the linear model was fitted on.
y_pred = linreg.predict(X=x)

In [14]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
# Root-mean-squared error between the targets and the current predictions.
mse = mean_squared_error(y_true=y, y_pred=y_pred)
error = np.sqrt(mse)

In [15]:
# Display the root-mean-squared error currently stored in `error`.
error

277732.7730310014

In [16]:
from sklearn.tree import DecisionTreeRegressor
# Decision tree with default hyperparameters; the seed is fixed so repeated
# fits build the same tree.
DTR = DecisionTreeRegressor(random_state=0)
DTR.fit(X=x, y=y)

DecisionTreeRegressor(random_state=0)

In [17]:
# In-sample predictions from the decision tree (same rows it was fitted on).
y_pred = DTR.predict(X=x)

In [18]:
# Root-mean-squared error of the current `y_pred` against `y`.
# NOTE(review): the printed label reads "prediksi jumlah kendaraan", but the
# number shown is this RMSE, not a predicted vehicle count.
mse = mean_squared_error(y_true=y, y_pred=y_pred)
error = np.sqrt(mse)
rmse_template = "prediksi jumlah kendaraan : {:,.02f}"
print(rmse_template.format(error))

prediksi jumlah kendaraan : 47,418.27


In [19]:
from sklearn.ensemble import RandomForestRegressor
# Random forest with default hyperparameters and a fixed seed.
RFR = RandomForestRegressor(random_state=0)
RFR.fit(X=x, y=y)

RandomForestRegressor(random_state=0)

In [20]:
# In-sample predictions from the random forest (same rows it was fitted on).
y_pred = RFR.predict(X=x)

In [21]:
# Root-mean-squared error of the current `y_pred` against `y`.
# NOTE(review): the printed label reads "prediksi jumlah kendaraan", but the
# number shown is this RMSE, not a predicted vehicle count.
mse = mean_squared_error(y_true=y, y_pred=y_pred)
error = np.sqrt(mse)
rmse_template = "prediksi jumlah kendaraan : {:,.02f}"
print(rmse_template.format(error))

prediksi jumlah kendaraan : 59,683.67


In [22]:
from sklearn.model_selection import GridSearchCV

# Candidate depths: unlimited (None) plus every even depth from 2 through 12.
max_depth = [None, *range(2, 13, 2)]
parameters = {"max_depth": max_depth}

# Grid-search the tree depth, ranking candidates by negative mean squared
# error. No `cv` argument is given, so GridSearchCV's default splitting is used.
regressor = DecisionTreeRegressor(random_state=0)
gs = GridSearchCV(
    estimator=regressor,
    param_grid=parameters,
    scoring='neg_mean_squared_error',
)
gs.fit(X=x, y=y)

GridSearchCV(estimator=DecisionTreeRegressor(random_state=0),
             param_grid={'max_depth': [None, 2, 4, 6, 8, 10, 12]},
             scoring='neg_mean_squared_error')

In [23]:
# `gs` was created with GridSearchCV's default refit=True, so best_estimator_
# has already been refitted on the full (x, y) with the winning max_depth.
# Calling .fit(x, y) on it again would only rebuild the same tree (its
# random_state is fixed), so the estimator is used as returned.
regressor = gs.best_estimator_

# In-sample RMSE of the tuned tree: it is scored on the rows it was fitted on.
# NOTE(review): the printed label reads "prediksi jumlah kendaraan", but the
# number shown is this RMSE, not a predicted vehicle count.
y_pred = regressor.predict(x)
error = np.sqrt(mean_squared_error(y, y_pred))
print("prediksi jumlah kendaraan : {:,.02f}".format(error))

prediksi jumlah kendaraan : 47,418.27


In [24]:
# Display the feature frame (encoded regency/city and encoded year).
x

Unnamed: 0,nama_kabupaten_kota,tahun
0,3,0
1,15,0
2,15,0
3,5,0
4,0,0
...,...,...
267,24,7
268,24,7
269,22,7
270,26,7


In [25]:
# Build a one-row query (regency/city name, year) as a 2-D string array.
# NOTE: this rebinds `x`, which until now held the training feature frame.
kabupaten_query = 'KABUPATEN BOGOR'
tahun_query = '2018'
x = np.array([[kabupaten_query, tahun_query]])
x

array([['KABUPATEN BOGOR', '2018']], dtype='<U15')

In [26]:
# Encode each query column with the encoder that was fitted for it, then
# assemble a fresh float array from the two code columns.
#
# The codes are deliberately NOT written back into the string array: that
# array has a fixed character width, so any code longer than that width would
# be silently truncated, and a failure in the second transform would leave
# `x` half-encoded. Building the result separately means `x` is only rebound
# once both columns have been encoded successfully.
kabupaten_code = LE_kabupaten.transform(x[:, 0])
tahun_code = LE_tahun.transform(x[:, 1])
x = np.column_stack((kabupaten_code, tahun_code)).astype(float)
x

array([[3., 5.]])

In [27]:
# Predict the vehicle count for the encoded single-row query.
y_pred = regressor.predict(X=x)
y_pred

array([93088.])

In [28]:
import pickle

In [29]:
# Bundle the tuned model together with both fitted encoders and pickle the
# bundle to 'saved_steps.pkl' in the current working directory.
datas = {
    "model": regressor,
    "nama_kabupaten_kota": LE_kabupaten,
    "tahun": LE_tahun,
}
with open('saved_steps.pkl', mode='wb') as file:
    pickle.dump(datas, file)

In [30]:
# Read the bundle back from disk.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# load pickle files that come from a trusted source.
with open('saved_steps.pkl', mode='rb') as file:
    datas = pickle.load(file)

# Unpack the model and the two encoders. This rebinds `LE_kabupaten` and
# `LE_tahun` to the unpickled copies.
regressor_loaded, LE_kabupaten, LE_tahun = (
    datas[key] for key in ("model", "nama_kabupaten_kota", "tahun")
)

In [31]:
# Check the round trip: predict the same query with the reloaded model.
y_pred = regressor_loaded.predict(X=x)
y_pred

array([93088.])