In [23]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

# 1. Memuat dan membersihkan data
data = pd.read_csv('readyForModeling.csv')
data = data.replace(8888, np.nan)
data = data.dropna()

In [24]:
# 2. Menentukan fitur dan target
features = ['Tn', 'Tx', 'Tavg', 'RH_avg', 'ss', 'ff_x', 'ddd_x', 'ff_avg', 'ddd_car']
X = data[features]
y = data['RR']

In [25]:
# 3. Membagi data menjadi set pelatihan dan pengujian
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Membuat column transformer untuk preprocessing
categorical_features = ['ddd_car']
numerical_features = [col for col in features if col != 'ddd_car']
column_transformer = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_features),
        ('num', 'passthrough', numerical_features)
    ]
)


In [26]:
# 5. Membuat pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', column_transformer),
    ('regressor', RandomForestRegressor(random_state=42))
])

# 6. Menentukan grid parameter untuk penalaan hiperparameter
param_grid = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__max_depth': [None, 5, 10]
}

# 7. Mengatur GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# 8. Melatih model dengan GridSearchCV
grid_search.fit(X_train, y_train)




In [28]:
# 9. Mendapatkan model terbaik
best_model = grid_search.best_estimator_

# 10. Memprediksi pada data pengujian
y_pred = best_model.predict(X_test)

# 11. Mengevaluasi model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)

# Menampilkan hasil
print(f"Parameter terbaik: {grid_search.best_params_}")
print(grid_search.score(X_train, y_train), grid_search.best_score_, grid_search.score(X_test, y_test))
print(f"MSE: {mse:.3f}, RMSE: {rmse:.3f}, MAE: {mae:.3f}")

Parameter terbaik: {'regressor__max_depth': None, 'regressor__n_estimators': 100}
-28.257457179487215 nan -196.58105601449276
MSE: 196.581, RMSE: 14.021, MAE: 8.570


In [21]:
data.sample(10)

Unnamed: 0,Tn,Tx,Tavg,RH_avg,RR,ss,ff_x,ddd_x,ff_avg,ddd_car
244,24.9,33.3,27.9,79.0,0.0,8.7,4.0,30.0,1.0,C
8,24.7,33.2,28.2,86.0,0.5,4.7,4.0,310.0,1.0,C
64,23.4,32.2,27.5,90.0,0.1,4.9,2.0,260.0,1.0,C
289,24.4,32.8,27.4,88.0,0.0,3.5,4.0,160.0,1.0,SE
120,23.5,30.5,25.6,94.0,0.3,4.4,3.0,150.0,0.0,C
7,23.5,33.3,27.9,83.0,3.5,1.5,5.0,170.0,1.0,C
246,24.3,32.5,27.2,88.0,0.0,9.9,4.0,290.0,1.0,C
335,23.7,33.4,28.0,84.0,0.0,1.9,4.0,320.0,1.0,C
111,23.5,30.0,25.9,92.0,12.1,8.7,2.0,170.0,0.0,C
134,24.0,30.6,26.3,91.0,7.3,1.2,4.0,250.0,1.0,C


In [22]:
#2. Menyiapkan data baru
# Pastikan fitur dan urutannya sama dengan data saat pelatihan
data_baru = {
    'Tn': 24.0,       # Suhu minimum
    'Tx': 30.6,       # Suhu maksimum
    'Tavg': 26.3,     # Suhu rata-rata
    'RH_avg': 91.0,   # Kelembapan rata-rata
    'ss': 1.2,        # Lama penyinaran matahari
    'ff_x': 4.0,      # Kecepatan angin maksimum
    'ddd_x': 250.0,   # Arah angin maksimum
    'ff_avg': 1.0,    # Kecepatan angin rata-rata
    'ddd_car': 'C'   # Arah angin dominan (kategori, misalnya 'SE' untuk Tenggara)
}

# Mengubah data baru menjadi DataFrame
data_baru_df = pd.DataFrame([data_baru])

# 3. Membuat prediksi
# Model akan menangani preprocessing seperti encoding 'ddd_car' jika sudah ada di pipeline
prediksi_rr = model.predict(data_baru_df)

# 4. Menampilkan hasil prediksi
print(f"Prediksi Curah Hujan (RR): {prediksi_rr[0]:.2f} mm")

Prediksi Curah Hujan (RR): 11.07 mm


