In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
#  Machine Learning
!pip install scikit-learn xgboost lightgbm



In [4]:
#  Basic Utilities
import os
import sys
import math
import random
import warnings
from google.colab import drive
warnings.filterwarnings("ignore")

#  Data Handling

import pandas as pd
import numpy as np

#  Visualization

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
import plotly.graph_objects as go
import plotly.express as px
from mpl_toolkits.mplot3d import Axes3D

#  Statistics & Hypothesis Testing

from scipy import stats
from scipy.special import softmax

#  Feature Selection

from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression, chi2, RFE

#  Preprocessing

from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

#  Classical Machine Learning Models
from sklearn.model_selection import train_test_split
import xgboost as xgb

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

#  Model Evaluation

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, TimeSeriesSplit, RepeatedKFold
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, mean_absolute_percentage_error,
    r2_score, classification_report, confusion_matrix
)

# Time Series Utilities
from pandas.tseries.offsets import DateOffset
import datetime



#  Save & Load

import joblib
import pickle

#Load Dataset

In [5]:
import pandas as pd
df = pd.read_csv('/content/drive/MyDrive/MRP/Current/Deliverables/Methodology_&_Experiments/Report/Coding/Data_files/Data_Co.csv', encoding='latin-1')

In [6]:
df['order_date'] = pd.to_datetime(df['order_date'])
df = df.set_index('order_date')

In [7]:
# Drop irrelevant/leaky columns
columns_to_drop = [
        'Product Description', 'Product Image', 'Order Zipcode', 'Customer Email', 'Customer Password',
        'Customer Zipcode', 'Customer Lname', 'Customer Fname', 'Days for shipping (real)',
        'Benefit per order', 'Order Item Profit Ratio', 'Order Profit Per Order', 'Sales per customer',
        'Order Item Total', 'shipping date (DateOrders)', 'shipping_hour', 'shipping_week',
        'shipping_month', 'shipping_dayofweek', 'shipping_weekend', 'Category Id',
        'Order Item Cardprod Id', 'Order Customer Id', 'Order Item Id', 'Order Item Product Price',
        'Product Card Id', 'Order Item Discount Rate', 'Product Status', 'Order City', 'Order State',
        'Customer City', 'Customer State', 'Customer Street', 'Late_delivery_risk', 'date'

]

In [8]:
df.drop(columns=columns_to_drop, errors='ignore', inplace=True)

In [9]:
# Encode non-numeric features
non_numeric_cols = df.select_dtypes(exclude=np.number).columns
processed_cols = []

for col in non_numeric_cols:
    if df[col].nunique() < 50000:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str).fillna("missing"))
        processed_cols.append(col)

print("Processed non-numeric columns:", processed_cols)

Processed non-numeric columns: ['Type', 'Delivery Status', 'Category Name', 'Customer Country', 'Customer Segment', 'Department Name', 'Market', 'Order Country', 'Order Region', 'Order Status', 'Product Name', 'Shipping Mode', 'order_time', 'order_weekend', 'shipping_date', 'shipping_time']


In [10]:
# Resample 'Sales' to daily frequency
daily_sales = df['Sales'].resample('D').sum()

In [11]:
# Prepare X and y
X = df.loc[:, ~df.columns.isin(["Sales"])]
y = daily_sales

In [12]:
# Scale features
scaler = MinMaxScaler()
scaled_X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

In [13]:
# Resample to daily frequency
daily_scaled_X = scaled_X.resample('D').sum()

In [14]:
 # Feature selection
rfr = RandomForestRegressor(random_state=42)
rfr.fit(daily_scaled_X, y)

In [15]:
importances = pd.Series(rfr.feature_importances_, index=daily_scaled_X.columns)
selected_features = importances.sort_values(ascending=False).head(20).index.tolist()

In [16]:
print("Top selected features:", selected_features)

Top selected features: ['Order Item Discount', 'Product Price', 'Department Id', 'Latitude', 'Order Item Quantity', 'Product Name', 'Product Category Id', 'Order Region', 'Order Status', 'Category Name', 'Department Name', 'order_time', 'Longitude', 'Order Id', 'shipping_date', 'order_hour', 'Order Country', 'Customer Id', 'shipping_time', 'order_month']


In [17]:
# Return selected daily data
df_selected = daily_scaled_X[selected_features]
df_selected["Sales"] = y

In [18]:
# Sales' is the target variable and the rest are features
X = df_selected.drop("Sales", axis=1)
y = df_selected["Sales"]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train XGBoost Regressor
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train, y_train)

# Predict and evaluate
y_pred = xgb_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"XGBoost Performance:")
print(f"  Mean Squared Error: {mse:.4f}")
print(f"  Root Mean Squared Error: {rmse:.4f}")
print(f"  Mean Absolute Error: {mae:.4f}")
print(f"  R-squared: {r2:.4f}")

XGBoost Performance:
  Mean Squared Error: 809197.8183
  Root Mean Squared Error: 899.5542
  Mean Absolute Error: 591.5325
  R-squared: 0.9848


In [19]:
!pip install catboost

from catboost import CatBoostRegressor

# Initialize and train CatBoost Regressor
catboost_model = CatBoostRegressor(iterations=100, learning_rate=0.1, depth=6, random_state=42, verbose=0)
catboost_model.fit(X_train, y_train)

# Predict and evaluate
y_pred_catboost = catboost_model.predict(X_test)
mse_catboost = mean_squared_error(y_test, y_pred_catboost)
rmse_catboost = np.sqrt(mse_catboost)
mae_catboost = mean_absolute_error(y_test, y_pred_catboost)
r2_catboost = r2_score(y_test, y_pred_catboost)

print(f"\nCatBoost Performance:")
print(f"  Mean Squared Error: {mse_catboost:.4f}")
print(f"  Root Mean Squared Error: {rmse_catboost:.4f}")
print(f"  Mean Absolute Error: {mae_catboost:.4f}")
print(f"  R-squared: {r2_catboost:.4f}")


Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8

CatBoost Performance:
  Mean Squared Error: 928198.7539
  Root Mean Squared Error: 963.4307
  Mean Absolute Error: 695.2706
  R-squared: 0.9825
