In [None]:
# Import libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler


In [None]:
# Load the pre-split train / validation / test tables.
# Paths are relative to the notebook's working directory.

train_data, valid_data, test_data = map(
    pd.read_csv,
    ("train_data.csv", "valid_data.csv", "test_data.csv"),
)


In [None]:
# For each split, the feature matrix (X_*) is every column except "busy_ratio",
# and the target (y_*) is the "busy_ratio" column itself.

X_train, y_train = train_data.drop(columns=["busy_ratio"]), train_data["busy_ratio"]
X_valid, y_valid = valid_data.drop(columns=["busy_ratio"]), valid_data["busy_ratio"]
X_test, y_test = test_data.drop(columns=["busy_ratio"]), test_data["busy_ratio"]


In [None]:
# Sanity-check every feature split (train, valid AND test) for values that would
# break imputation or scaling. Instead of echoing a zero for every feature column,
# print one summary line per split and list only the columns that actually contain
# offending values. numpy/pandas are already imported in the notebook's first cell.
for split_name, split_features in (("train", X_train), ("valid", X_valid), ("test", X_test)):
    # np.isinf raises on object columns, so restrict the check to numeric ones.
    numeric = pd.DataFrame(split_features).select_dtypes(include="number")
    offenders = pd.DataFrame(
        {
            "n_infinite": np.isinf(numeric).sum(),         # +/-inf entries per column
            "n_abs_gt_1e10": (numeric.abs() > 1e10).sum(),  # |x| above the 1e10 threshold
        }
    )
    # Keep only the columns where at least one of the two counts is non-zero.
    offenders = offenders[(offenders > 0).any(axis=1)]
    print(f"{split_name}: {len(offenders)} of {numeric.shape[1]} numeric columns flagged")
    if not offenders.empty:
        print(offenders)


total_items                          0
subtotal                             0
num_distinct_items                   0
min_item_price                       0
promo_item                           0
                                    ..
store_primary_category_vietnamese    0
SMA                                  0
EMA                                  0
SMA_diff                             0
EMA_diff                             0
Length: 87, dtype: int64
total_items                          0
subtotal                             0
num_distinct_items                   0
min_item_price                       0
promo_item                           0
                                    ..
store_primary_category_vietnamese    0
SMA                                  0
EMA                                  0
SMA_diff                             0
EMA_diff                             0
Length: 87, dtype: int64
total_items                          0
subtotal                             0
num_distinct_i

In [None]:
from sklearn.impute import SimpleImputer

# Mask +/-inf as NaN so the imputer below treats them as missing values.
# np.where returns plain arrays, so X_train / X_valid are ndarrays from here on.
X_train, X_valid = (
    np.where(np.isinf(features), np.nan, features) for features in (X_train, X_valid)
)

# Fill NaNs with per-column means learned on the training split only
# ('median' is the alternative strategy), then apply them to both splits.
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_train)
X_train, X_valid = imputer.transform(X_train), imputer.transform(X_valid)


In [None]:
# Standardise the features. The scaler's statistics are learned from the training
# split only and then re-used for the validation split, so no validation
# information leaks into the preprocessing.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_valid_scaled = scaler.transform(X_valid)


In [None]:
# Largest and smallest standardised value in each split (max first, then min).
# Values far from 0 point at heavy-tailed features / outliers.
print(X_train_scaled.max(), X_train_scaled.min())
print(X_valid_scaled.max(), X_valid_scaled.min())


371.74319092620914 -30.624953282556728
371.7431909262091 -29.10332211300415


In [None]:
# Lasso (L1-regularised linear regression) fitted on the standardised features.
# Lasso, mean_squared_error and r2_score are already imported in the notebook's
# first cell, so they are not re-imported here.
lasso = Lasso(alpha=0.01)  # alpha is the regularization strength, adjust as needed
lasso.fit(X_train_scaled, y_train)

# Predictions on the data the model was fitted on and on the held-out validation split
y_pred_train = lasso.predict(X_train_scaled)
y_pred_valid = lasso.predict(X_valid_scaled)

# RMSE is computed as sqrt(MSE) rather than `mean_squared_error(..., squared=False)`:
# the `squared` argument was deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas sqrt(MSE) gives the same value on every version.
train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
valid_rmse = np.sqrt(mean_squared_error(y_valid, y_pred_valid))
train_r2 = r2_score(y_train, y_pred_train)
valid_r2 = r2_score(y_valid, y_pred_valid)

print("Lasso Regression:")
print(f"Train RMSE: {train_rmse:.4f}, Validation RMSE: {valid_rmse:.4f}")
print(f"Train R²: {train_r2:.4f}, Validation R²: {valid_r2:.4f}")


Lasso Regression:
Train RMSE: 0.0133, Validation RMSE: 0.0145
Train R²: 0.9988, Validation R²: 0.9989




In [None]:
# Depth-limited decision tree fitted on the standardised features.
# DecisionTreeRegressor, mean_squared_error and r2_score are already imported in
# the notebook's first cell.
# random_state is fixed because the tree permutes features when searching for
# splits, so tie-breaking (and therefore the scores) can vary between runs otherwise.
dtree = DecisionTreeRegressor(max_depth=10, random_state=42)  # Adjust max_depth or other hyperparameters
dtree.fit(X_train_scaled, y_train)

# Predictions on the training and validation splits
y_pred_train = dtree.predict(X_train_scaled)
y_pred_valid = dtree.predict(X_valid_scaled)

# RMSE is computed as sqrt(MSE) rather than `mean_squared_error(..., squared=False)`:
# the `squared` argument was deprecated in scikit-learn 1.4 and removed in 1.6.
train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
valid_rmse = np.sqrt(mean_squared_error(y_valid, y_pred_valid))
train_r2 = r2_score(y_train, y_pred_train)
valid_r2 = r2_score(y_valid, y_pred_valid)

print("Decision Tree Regressor:")
print(f"Train RMSE: {train_rmse:.4f}, Validation RMSE: {valid_rmse:.4f}")
print(f"Train R²: {train_r2:.4f}, Validation R²: {valid_r2:.4f}")


Decision Tree Regressor:
Train RMSE: 0.0296, Validation RMSE: 0.0521
Train R²: 0.9942, Validation R²: 0.9851




In [None]:
# Random forest of 100 depth-limited trees fitted on the standardised features.
# RandomForestRegressor, mean_squared_error and r2_score are already imported in
# the notebook's first cell. n_jobs=-1 trains the trees on all available cores;
# random_state pins the bootstrap/feature sampling for reproducibility.
rf = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)  # Hyperparameters to adjust
rf.fit(X_train_scaled, y_train)

# Predictions on the training and validation splits
y_pred_train = rf.predict(X_train_scaled)
y_pred_valid = rf.predict(X_valid_scaled)

# RMSE is computed as sqrt(MSE) rather than `mean_squared_error(..., squared=False)`:
# the `squared` argument was deprecated in scikit-learn 1.4 and removed in 1.6.
train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
valid_rmse = np.sqrt(mean_squared_error(y_valid, y_pred_valid))
train_r2 = r2_score(y_train, y_pred_train)
valid_r2 = r2_score(y_valid, y_pred_valid)

print("Random Forest Regressor:")
print(f"Train RMSE: {train_rmse:.4f}, Validation RMSE: {valid_rmse:.4f}")
print(f"Train R²: {train_r2:.4f}, Validation R²: {valid_r2:.4f}")


Random Forest Regressor:
Train RMSE: 0.0175, Validation RMSE: 0.0500
Train R²: 0.9980, Validation R²: 0.9863




In [None]:
# Gradient-boosted trees (XGBoost) fitted on the standardised features.
# `xgb`, mean_squared_error and r2_score are already imported in the notebook's
# first cell.
# NOTE(review): the recorded run shows a large train/validation gap for this model;
# max_depth=10 may be overfitting — consider a shallower depth or early stopping.
xgboost = xgb.XGBRegressor(n_estimators=100, max_depth=10, learning_rate=0.1, random_state=42)  # Hyperparameters to adjust
xgboost.fit(X_train_scaled, y_train)

# Predictions on the training and validation splits
y_pred_train = xgboost.predict(X_train_scaled)
y_pred_valid = xgboost.predict(X_valid_scaled)

# RMSE is computed as sqrt(MSE) rather than `mean_squared_error(..., squared=False)`:
# the `squared` argument was deprecated in scikit-learn 1.4 and removed in 1.6.
train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
valid_rmse = np.sqrt(mean_squared_error(y_valid, y_pred_valid))
train_r2 = r2_score(y_train, y_pred_train)
valid_r2 = r2_score(y_valid, y_pred_valid)

print("XGBoost Regressor:")
print(f"Train RMSE: {train_rmse:.4f}, Validation RMSE: {valid_rmse:.4f}")
print(f"Train R²: {train_r2:.4f}, Validation R²: {valid_r2:.4f}")


XGBoost Regressor:
Train RMSE: 0.0078, Validation RMSE: 0.1884
Train R²: 0.9996, Validation R²: 0.8055




In [None]:
# AdaBoostRegressor is not imported in the notebook's first cell, so it is imported
# here; DecisionTreeRegressor, mean_squared_error and r2_score already are.
from sklearn.ensemble import AdaBoostRegressor

# AdaBoost over shallow (depth-3) regression trees, fitted on the standardised features.
adaboost = AdaBoostRegressor(
    estimator=DecisionTreeRegressor(max_depth=3),  # Base learner
    n_estimators=100,  # Number of weak learners
    learning_rate=0.1,  # Shrinks contribution of each tree
    random_state=42
)
adaboost.fit(X_train_scaled, y_train)

# Predictions on the training and validation splits
y_pred_train = adaboost.predict(X_train_scaled)
y_pred_valid = adaboost.predict(X_valid_scaled)

# RMSE is computed as sqrt(MSE) rather than `mean_squared_error(..., squared=False)`:
# the `squared` argument was deprecated in scikit-learn 1.4 and removed in 1.6.
train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
valid_rmse = np.sqrt(mean_squared_error(y_valid, y_pred_valid))
train_r2 = r2_score(y_train, y_pred_train)
valid_r2 = r2_score(y_valid, y_pred_valid)

print("AdaBoost Regressor:")
print(f"Train RMSE: {train_rmse:.4f}, Validation RMSE: {valid_rmse:.4f}")
print(f"Train R²: {train_r2:.4f}, Validation R²: {valid_r2:.4f}")


AdaBoost Regressor:
Train RMSE: 0.2397, Validation RMSE: 0.2525
Train R²: 0.6193, Validation R²: 0.6507




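Lasso — Best Model

Of the five models compared above, Lasso has the lowest validation RMSE (0.0145) and the highest validation R² (0.9989), so it is refit below as the selected model.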

In [None]:
# Refit of the selected Lasso model, kept self-contained: the scaler is re-fitted
# here so this cell only needs X_train, X_valid, y_train and y_valid to exist.
# Lasso, StandardScaler, mean_squared_error and r2_score are already imported in
# the notebook's first cell.
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)  # Fit and transform on training data
X_valid_scaled = scaler.transform(X_valid)      # Only transform on validation data

# Define and train Lasso model with alpha (regularization strength)
lasso = Lasso(alpha=0.01)
lasso.fit(X_train_scaled, y_train)

# Predictions (both made on the scaled matrices the model was trained with)
y_train_pred = lasso.predict(X_train_scaled)
y_valid_pred = lasso.predict(X_valid_scaled)

# RMSE is computed as sqrt(MSE) rather than `mean_squared_error(..., squared=False)`:
# the `squared` argument was deprecated in scikit-learn 1.4 and removed in 1.6.
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
valid_rmse = np.sqrt(mean_squared_error(y_valid, y_valid_pred))
train_r2 = r2_score(y_train, y_train_pred)
valid_r2 = r2_score(y_valid, y_valid_pred)

# Print results
print("Lasso Regression:")
print(f"Train RMSE: {train_rmse:.4f}, Validation RMSE: {valid_rmse:.4f}")
print(f"Train R²: {train_r2:.4f}, Validation R²: {valid_r2:.4f}")


Lasso Regression:
Train RMSE: 0.0133, Validation RMSE: 0.0145
Train R²: 0.9988, Validation R²: 0.9989


