In [46]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

## Load dataset

In [47]:
# Read the semicolon-separated rice-harvest file with explicit column names.
# Because `names` is supplied without `header=0`, the file's own header line
# is read as data row 0.
df = pd.read_csv(
    'datapadi.csv',
    sep=";",
    names=['bulan', 'luas', 'curah_hujan', 'hama', 'hasil'],
)
df.head()

Unnamed: 0,bulan,luas,curah_hujan,hama,hasil
0,Bulan Tanam,Luas Panen (X1),Curah Hujan (X2),Hama (X3),Hasil Panen
1,Jan – Apr 2009,14210,Baik (10),10.70%,91862
2,Mei – Ags 2009,10863,Kurang (-20),12%,68244
3,Sep – Des 2009,3185,Kurang (-20),11.90%,22737
4,Jan – Apr 2010,12254,Baik (10),5.90%,81121


In [48]:
# (rows, columns) of the frame as loaded, including the header row read as data.
df.shape

(28, 5)

## Pre-processing

In [49]:
# Row 0 holds the CSV's own header line (read as data because explicit column
# names were supplied), so remove it. Rebinding instead of `inplace=True`, plus
# `errors="ignore"`, keeps this cell safe to re-run: a second execution is a
# no-op instead of raising KeyError for the already-missing label.
df = df.drop(index=0, errors="ignore")

In [50]:
# Clean the rainfall column: keep only the (optionally negative) integer found
# in labels such as "Baik (10)" or "Kurang (-20)".
# The pattern is a raw string so `\d` is not treated as an invalid string escape,
# and expand=False returns a Series rather than a one-column DataFrame.
df['curah_hujan'] = df['curah_hujan'].str.extract(r'(-?\d+)', expand=False)

In [51]:
# Clean the pest column: remove the trailing percent sign ("10.70%" -> "10.70").
# The vectorized .str.rstrip only removes '%' characters, so values without a
# percent sign are left intact and re-running the cell does not chop digits
# (the previous row-wise `[:-1]` slice removed whatever the last character was).
df['hama'] = df['hama'].str.rstrip('%')

In [52]:
# Check column dtypes.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27 entries, 1 to 27
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   bulan        27 non-null     object
 1   luas         27 non-null     object
 2   curah_hujan  27 non-null     object
 3   hama         27 non-null     object
 4   hasil        27 non-null     object
dtypes: object(5)
memory usage: 1.3+ KB
None


In [53]:
# Cast the cleaned string columns to numeric types in a single pass:
# integers for area, rainfall score and harvest; float for the pest percentage.
df = df.astype({
    "luas": int,
    "curah_hujan": int,
    "hama": float,
    "hasil": int,
})

In [54]:
# Preview the first rows after cleaning and type conversion.
df.head()

Unnamed: 0,bulan,luas,curah_hujan,hama,hasil
1,Jan – Apr 2009,14210,10,10.7,91862
2,Mei – Ags 2009,10863,-20,12.0,68244
3,Sep – Des 2009,3185,-20,11.9,22737
4,Jan – Apr 2010,12254,10,5.9,81121
5,Mei – Ags 2010,11851,-20,75.0,63034


In [55]:
# Re-inspect the column dtypes after the numeric conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27 entries, 1 to 27
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   bulan        27 non-null     object 
 1   luas         27 non-null     int64  
 2   curah_hujan  27 non-null     int64  
 3   hama         27 non-null     float64
 4   hasil        27 non-null     int64  
dtypes: float64(1), int64(3), object(1)
memory usage: 1.3+ KB


## Split dataset

In [56]:
# Predictors: harvested area, rainfall score and pest percentage.
X = df.loc[:, ['luas', 'curah_hujan', 'hama']]
# Target: harvest yield.
y = df.loc[:, 'hasil']

In [57]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [58]:
# Shapes of the training and test feature sets, one per line.
print(X_train.shape, X_test.shape, sep="\n")

(21, 3)
(6, 3)


In [59]:
# Shapes of the training and test targets, one per line.
print(y_train.shape, y_test.shape, sep="\n")

(21,)
(6,)


## Model regresi

In [60]:
# Fit an ordinary least-squares model on the raw (unscaled) training features.
# fit() returns the estimator itself, so it can be chained onto construction.
regr = LinearRegression().fit(X_train, y_train)
# Bare expression so the cell still displays the fitted estimator.
regr

LinearRegression()

In [61]:
# Show the fitted intercept and the coefficients of the baseline model.
print('Intercept: \n', regr.intercept_)
print('Coefficients: \n', regr.coef_)

Intercept: 
 8307.492737664026
Coefficients: 
 [   5.92956964  118.24757043 -175.73647293]


In [62]:
# Predict the target for the held-out test features and display the predictions.
pred = regr.predict(X_test)
print(pred)

[33321.76898217 74015.01452205 88345.47169718 76959.16171269
 91868.77277834 34637.45483865]


## Evaluasi model

In [63]:
from sklearn import metrics

In [64]:
# Root-mean-squared error of the baseline model on the held-out rows.
rmse = np.sqrt(
    metrics.mean_squared_error(y_test.to_numpy(), pred)
)

In [65]:
# Display the baseline model's RMSE.
rmse

4.39425333515643

## Komparasi dengan normalisasi data

In [66]:
# Menggunakan MinMax
from sklearn.preprocessing import MinMaxScaler

In [67]:
# Create a min-max scaler with its default settings.
scaler = MinMaxScaler()

In [71]:
# Learn the per-column min/max from the training rows only, so statistics of
# the held-out test rows do not leak into preprocessing.
minmax = scaler.fit(X_train)
# Scale every row with those training statistics. X1 keeps the row order of X,
# and the later re-split uses the same test_size and random_state as the first
# split, so it is expected to select the same train/test rows.
# Test rows may fall slightly outside [0, 1]; that is expected.
X1 = scaler.transform(X)

In [72]:
# Display the scaled feature matrix.
print(X1)

[[1.         0.75       0.13108108]
 [0.69641723 0.         0.14864865]
 [0.         0.         0.1472973 ]
 [0.82258503 0.75       0.06621622]
 [0.78603175 0.         1.        ]
 [0.31165533 0.75       0.18243243]
 [0.92544218 0.75       0.08783784]
 [0.86376417 0.         0.30675676]
 [0.12861678 0.75       0.2527027 ]
 [0.92244898 0.75       0.01216216]
 [0.82594104 1.         0.08378378]
 [0.12462585 0.5        0.04054054]
 [0.89950113 0.75       0.12837838]
 [0.83519274 0.         0.4027027 ]
 [0.36390023 0.5        0.06081081]
 [0.94430839 0.75       0.35945946]
 [0.72399093 0.75       0.09324324]
 [0.2030839  0.         0.35810811]
 [0.89079365 0.75       0.        ]
 [0.72417234 0.         0.14189189]
 [0.20698413 1.         0.02702703]
 [0.81306122 0.75       0.33783784]
 [0.83020408 0.         0.72972973]
 [0.21197279 0.5        0.31216216]
 [0.89823129 1.         0.        ]
 [0.82603175 0.         0.80945946]
 [0.1324263  0.         0.53378378]]


In [73]:
# Split again, this time on the scaled features, with the same test fraction
# and seed as the first split.
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1,
    y,
    test_size=0.2,
    random_state=42,
)

In [74]:
# Fit a second least-squares model, this time on the min-max-scaled training features.
# fit() returns the estimator itself, so it can be chained onto construction.
regr1 = LinearRegression().fit(X1_train, y1_train)
# Bare expression so the cell still displays the fitted estimator.
regr1

LinearRegression()

In [75]:
# Show the fitted intercept and the coefficients of the model trained on scaled features.
print('Intercept: \n', regr1.intercept_)
print('Coefficients: \n', regr1.coef_)

Intercept: 
 24652.48416226328
Coefficients: 
 [ 65373.50529055   4729.90281726 -13004.4989968 ]


In [77]:
# Predict the target for the scaled held-out features and display the predictions.
pred1 = regr1.predict(X1_test)
print(pred1)

[33321.76898217 74015.01452205 88345.47169718 76959.16171269
 91868.77277834 34637.45483865]


In [78]:
# RMSE of the model trained on min-max-scaled features. It must be scored
# against that model's own predictions (pred1); using the baseline's `pred`
# would just re-measure the first model and make the comparison meaningless.
rmse1 = np.sqrt(metrics.mean_squared_error(np.array(y1_test), pred1))
rmse1

4.394253335146202