# Pemodelan Data Penjualan

Notebook ini berisi pemodelan sederhana untuk memprediksi revenue berdasarkan data demografis.

In [None]:
# Import library yang diperlukan
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Setting untuk visualisasi
%matplotlib inline

## 1. Membaca Data yang Sudah Dipersiapkan

In [None]:
# Load the prepared feature tables for the train and test splits.
X_train, X_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/X_train.csv', '../data/X_test.csv')
)
# For the targets, keep only the first column of each file so that each
# target is a Series.
y_train, y_test = (
    pd.read_csv(csv_path).iloc[:, 0]
    for csv_path in ('../data/y_train.csv', '../data/y_test.csv')
)

# Report the (rows, columns) shape of both feature tables.
for label, table in (("Ukuran data training:", X_train),
                     ("Ukuran data testing:", X_test)):
    print(label, table.shape)

## 2. Model Regresi Linear

In [None]:
# Build and train the linear-regression model on the training split.
# fit() returns the estimator itself, so construction and training chain.
lr_model = LinearRegression().fit(X_train, y_train)

# Predict the target for the test split.
y_pred_lr = lr_model.predict(X_test)

In [None]:
# Score the linear-regression predictions against the test targets.
mse_lr, mae_lr, r2_lr = (
    metric(y_test, y_pred_lr)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)
# RMSE is the square root of the MSE.
rmse_lr = np.sqrt(mse_lr)

# Print the report: a header line followed by one line per metric.
print("Hasil Evaluasi Model Regresi Linear:")
print(
    f"Mean Squared Error (MSE): {mse_lr:.2f}",
    f"Root Mean Squared Error (RMSE): {rmse_lr:.2f}",
    f"Mean Absolute Error (MAE): {mae_lr:.2f}",
    f"R² Score: {r2_lr:.4f}",
    sep="\n",
)

In [None]:
# Scatter the linear-regression predictions against the actual test targets.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred_lr, alpha=0.7)

# Dashed red diagonal across the range of actual values; a point on this
# line is a prediction equal to the actual value.
lo, hi = y_test.min(), y_test.max()
ax.plot([lo, hi], [lo, hi], 'r--')

ax.set_xlabel('Revenue Aktual (Rp)')
ax.set_ylabel('Revenue Prediksi (Rp)')
ax.set_title('Prediksi vs Aktual - Model Regresi Linear')
plt.show()

In [None]:
# Pair each linear-regression coefficient with its feature name and sort
# from largest to smallest. `features` stays a notebook-level name because
# the feature-importance cells further down reuse it.
features = X_train.columns
coefficients = pd.DataFrame(lr_model.coef_, index=features, columns=['Coefficient'])
coefficients = coefficients.sort_values(by='Coefficient', ascending=False)

# DataFrame.plot() opens its own figure when no `ax` is passed, so a
# preceding plt.figure(figsize=...) call only leaves an empty extra figure
# and the chart itself ignores the requested size. Pass the size to plot()
# directly and label the Axes it returns.
ax = coefficients.plot(kind='bar', figsize=(10, 6))
ax.set_title('Koefisien Model Regresi Linear')
ax.set_xlabel('Feature')
ax.set_ylabel('Koefisien')
ax.grid(axis='y')
plt.show()

## 3. Model Decision Tree

In [None]:
# Build and train a depth-limited decision-tree regressor; random_state is
# fixed so repeated runs give the same tree.
# fit() returns the estimator itself, so construction and training chain.
dt_model = DecisionTreeRegressor(max_depth=5, random_state=42).fit(X_train, y_train)

# Predict the target for the test split.
y_pred_dt = dt_model.predict(X_test)

In [None]:
# Score the decision-tree predictions against the test targets.
mse_dt, mae_dt, r2_dt = (
    metric(y_test, y_pred_dt)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)
# RMSE is the square root of the MSE.
rmse_dt = np.sqrt(mse_dt)

# Print the report: a header line followed by one line per metric.
print("Hasil Evaluasi Model Decision Tree:")
print(
    f"Mean Squared Error (MSE): {mse_dt:.2f}",
    f"Root Mean Squared Error (RMSE): {rmse_dt:.2f}",
    f"Mean Absolute Error (MAE): {mae_dt:.2f}",
    f"R² Score: {r2_dt:.4f}",
    sep="\n",
)

In [None]:
# Scatter the decision-tree predictions against the actual test targets.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred_dt, alpha=0.7)

# Dashed red diagonal across the range of actual values; a point on this
# line is a prediction equal to the actual value.
lo, hi = y_test.min(), y_test.max()
ax.plot([lo, hi], [lo, hi], 'r--')

ax.set_xlabel('Revenue Aktual (Rp)')
ax.set_ylabel('Revenue Prediksi (Rp)')
ax.set_title('Prediksi vs Aktual - Model Decision Tree')
plt.show()

In [None]:
# Tabulate the decision tree's feature importances by feature name
# (`features` is defined in the coefficient cell above), largest first.
feature_importance = pd.DataFrame(dt_model.feature_importances_, index=features, columns=['Importance'])
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

# DataFrame.plot() opens its own figure when no `ax` is passed, so a
# preceding plt.figure(figsize=...) call only leaves an empty extra figure
# and the chart itself ignores the requested size. Pass the size to plot()
# directly and label the Axes it returns.
ax = feature_importance.plot(kind='bar', figsize=(10, 6))
ax.set_title('Feature Importance - Model Decision Tree')
ax.set_xlabel('Feature')
ax.set_ylabel('Importance')
ax.grid(axis='y')
plt.show()

## 4. Model Random Forest

In [None]:
# Build and train a random forest of 100 depth-limited trees; random_state
# is fixed so repeated runs give the same forest.
# fit() returns the estimator itself, so construction and training chain.
rf_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=5,
    random_state=42,
).fit(X_train, y_train)

# Predict the target for the test split.
y_pred_rf = rf_model.predict(X_test)

In [None]:
# Score the random-forest predictions against the test targets.
mse_rf, mae_rf, r2_rf = (
    metric(y_test, y_pred_rf)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)
# RMSE is the square root of the MSE.
rmse_rf = np.sqrt(mse_rf)

# Print the report: a header line followed by one line per metric.
print("Hasil Evaluasi Model Random Forest:")
print(
    f"Mean Squared Error (MSE): {mse_rf:.2f}",
    f"Root Mean Squared Error (RMSE): {rmse_rf:.2f}",
    f"Mean Absolute Error (MAE): {mae_rf:.2f}",
    f"R² Score: {r2_rf:.4f}",
    sep="\n",
)

In [None]:
# Scatter the random-forest predictions against the actual test targets.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred_rf, alpha=0.7)

# Dashed red diagonal across the range of actual values; a point on this
# line is a prediction equal to the actual value.
lo, hi = y_test.min(), y_test.max()
ax.plot([lo, hi], [lo, hi], 'r--')

ax.set_xlabel('Revenue Aktual (Rp)')
ax.set_ylabel('Revenue Prediksi (Rp)')
ax.set_title('Prediksi vs Aktual - Model Random Forest')
plt.show()

In [None]:
# Tabulate the random forest's feature importances by feature name
# (`features` is defined in the coefficient cell above), largest first.
feature_importance_rf = pd.DataFrame(rf_model.feature_importances_, index=features, columns=['Importance'])
feature_importance_rf = feature_importance_rf.sort_values(by='Importance', ascending=False)

# DataFrame.plot() opens its own figure when no `ax` is passed, so a
# preceding plt.figure(figsize=...) call only leaves an empty extra figure
# and the chart itself ignores the requested size. Pass the size to plot()
# directly and label the Axes it returns.
ax = feature_importance_rf.plot(kind='bar', figsize=(10, 6))
ax.set_title('Feature Importance - Model Random Forest')
ax.set_xlabel('Feature')
ax.set_ylabel('Importance')
ax.grid(axis='y')
plt.show()

## 5. Perbandingan Model

In [None]:
# Collect every model's test-set scores: one (MSE, RMSE, MAE, R²) tuple per
# model, in the same order as the names in `models`.
models = ['Linear Regression', 'Decision Tree', 'Random Forest']
per_model_scores = [
    (mse_lr, rmse_lr, mae_lr, r2_lr),
    (mse_dt, rmse_dt, mae_dt, r2_dt),
    (mse_rf, rmse_rf, mae_rf, r2_rf),
]
# Transpose into one plain list per metric; later cells use these lists.
mse_values, rmse_values, mae_values, r2_values = map(list, zip(*per_model_scores))

# One row per model, one column per metric.
column_names = ['Model', 'MSE', 'RMSE', 'MAE', 'R²']
column_data = [models, mse_values, rmse_values, mae_values, r2_values]
comparison = pd.DataFrame(dict(zip(column_names, column_data)))

print("Perbandingan Model:")
# A bare trailing expression makes the notebook render the table.
comparison

In [None]:
# Side-by-side bar charts of the three error metrics, one panel per metric.
metrics = ['MSE', 'RMSE', 'MAE']
metric_values = [mse_values, rmse_values, mae_values]

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Walk the panels in lockstep with the metric names and their values.
for ax, metric, values in zip(axes, metrics, metric_values):
    ax.bar(models, values)
    ax.set_title(f'Perbandingan {metric}')
    ax.set_ylabel(metric)
    ax.grid(axis='y')

plt.tight_layout()
plt.show()

# Bar chart of the R² score per model.
plt.figure(figsize=(10, 5))
plt.bar(models, r2_values)
plt.title('Perbandingan R² Score')
plt.ylabel('R² Score')
# R² never exceeds 1 but is negative for a model that does worse than
# always predicting the mean. A fixed lower limit of 0 would hide such a
# bar, so extend the axis down to the smallest score when needed.
plt.ylim(min(0, min(r2_values)), 1)
plt.grid(axis='y')
plt.show()

## 6. Kesimpulan dan Model Terbaik

In [None]:
import os
import pickle

# Pick the model with the highest test-set R²; list.index() returns the
# first match, so the earliest model in `models` wins a tie.
best_r2 = max(r2_values)
best_model_index = r2_values.index(best_r2)
best_model = models[best_model_index]

print(f"Model terbaik berdasarkan R² Score adalah: {best_model}")
print(f"Dengan R² Score: {best_r2:.4f}")

# Map the winning display name to its fitted estimator. Any name not
# listed here falls back to the random forest.
fitted_models = {
    'Linear Regression': lr_model,
    'Decision Tree': dt_model,
}
model_to_save = fitted_models.get(best_model, rf_model)

# Create the output directory if it is missing. exist_ok=True makes this a
# single call with no separate existence check that could race.
os.makedirs('../models', exist_ok=True)

# Serialize the chosen estimator in pickle format.
with open('../models/best_model.pkl', 'wb') as file:
    pickle.dump(model_to_save, file)

print("Model terbaik telah disimpan sebagai 'best_model.pkl'")