In [93]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, AdaBoostRegressor

In [94]:
# Load the preprocessed dataset; the relative path is resolved against the
# kernel's current working directory.
df = pd.read_csv(filepath_or_buffer='../../data/preprocessed/main-data.csv')

In [95]:
# Encode the `day` column as integer codes.
# NOTE(review): LabelEncoder numbers classes in sorted order, so if `day` holds
# weekday names the codes are alphabetical rather than chronological -- confirm
# this is acceptable for the model.
le = LabelEncoder()
df['day'] = le.fit_transform(df['day'])

# Remove the columns that are excluded from the feature matrix. Rebinding `df`
# (instead of inplace=True) together with errors='ignore' means re-executing
# this cell no longer raises KeyError for columns that were already dropped.
df = df.drop(columns=['category_names', 'packet_date'], errors='ignore')

# Cast the VIP flag to integers.
df['is_vip'] = df['is_vip'].astype(int)

# Define features and target variable.
# Index.difference returns the remaining column labels sorted (when sortable),
# so the feature columns of X are in sorted order rather than file order.
X = df[df.columns.difference(['collection_duration'])]
y = df['collection_duration']

In [96]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [97]:
# Bagging ensemble of 100 AdaBoost regressors (AdaBoost left at its default
# settings), seeded via random_state. `fit` returns the fitted estimator, so
# construction and training are chained into one expression.
bagging_model = BaggingRegressor(
    AdaBoostRegressor(),
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Predict the target for the held-out test features.
y_pred = bagging_model.predict(X_test)

In [98]:
# Score the test-set predictions with three regression metrics.
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)

# A single print call; sep="\n" places each metric on its own line.
print(
    f"Root Mean Squared Error on Test Set: {rmse:.4f}",
    f"Mean Absolute Error on Test Set: {mae:.4f}",
    f"R^2 Score on Test Set: {r2:.4f}",
    sep="\n",
)


Root Mean Squared Error on Test Set: 2.0635
Mean Absolute Error on Test Set: 1.6688
R^2 Score on Test Set: 0.3118


In [99]:
# Per-sample "prediction accuracy": 100 * (1 - |pred - actual| / |actual|),
# i.e. 100% minus the absolute percentage error of each prediction.
#
# Computed with vectorised numpy operations instead of a Python-level loop.
# The denominator is |actual| so a negative target cannot flip the sign of the
# error term, and samples whose actual value is 0 (percentage error undefined)
# are stored as NaN rather than -inf/NaN from a division by zero.
actual_values = np.asarray(y_test, dtype=float)
predicted_values = np.asarray(y_pred, dtype=float)
abs_actual = np.abs(actual_values)

# np.where evaluates both branches, so silence the divide-by-zero warnings for
# the rows that are replaced by NaN anyway.
with np.errstate(divide='ignore', invalid='ignore'):
    p_accuracy = np.where(
        abs_actual > 0,
        (1 - np.abs(predicted_values - actual_values) / abs_actual) * 100,
        np.nan,
    )

# nanmean averages only the samples for which the accuracy is defined.
print(f"Prediction Accuracy: {np.nanmean(p_accuracy):.2f}%")

Prediction Accuracy: 65.48%
