In [None]:
#import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error



In [None]:
# Read the Superstore sample CSV (latin1-encoded) via a path relative to the working directory
csv_path = r"Sample - Superstore.csv"
data = pd.read_csv(csv_path, encoding='latin1')

In [None]:
# Preview the first five rows (five is the default row count for head())
data.head()

In [None]:
# Print a concise summary of the frame: column names, non-null counts and dtypes
data.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) using pandas' defaults
data.describe()

In [None]:
# Count missing values per column
data.isna().sum()

Data Preprocessing

In [None]:

# Remove outliers: keep only rows whose Profit lies strictly between the 1st and
# 99th percentiles. `.copy()` makes the filtered result an independent frame so
# the column assignments below do not raise SettingWithCopyWarning.
profit_low = data['Profit'].quantile(0.01)
profit_high = data['Profit'].quantile(0.99)
data = data[(data['Profit'] > profit_low) & (data['Profit'] < profit_high)].copy()

# Convert to datetime
data["Order Date"] = pd.to_datetime(data["Order Date"])
data["Ship Date"] = pd.to_datetime(data["Ship Date"])

# Extract calendar features from the order date, plus the order-to-ship delay in days
data["Order_Year"] = data["Order Date"].dt.year
data["Order_Month"] = data["Order Date"].dt.month
data["Order_Day"] = data["Order Date"].dt.day
data["Order_DayOfWeek"] = data["Order Date"].dt.dayofweek
data["Order_Quarter"] = data["Order Date"].dt.quarter
data["Shipping_Days"] = (data["Ship Date"] - data["Order Date"]).dt.days

# Cyclical encoding: project month (period 12) and weekday (period 7) onto the
# unit circle so that December/January and Sunday/Monday end up adjacent.
data["Month_sin"] = np.sin(2 * np.pi * data["Order_Month"] / 12)
data["Month_cos"] = np.cos(2 * np.pi * data["Order_Month"] / 12)
data["DayOfWeek_sin"] = np.sin(2 * np.pi * data["Order_DayOfWeek"] / 7)
data["DayOfWeek_cos"] = np.cos(2 * np.pi * data["Order_DayOfWeek"] / 7)

# Drop unused columns
data = data.drop(['Row ID', 'Order ID', 'Customer ID', 'Customer Name',
                  'Postal Code', 'Product ID'], axis=1)

# --- Encoding ---
# One-hot for small categories (does not look at the target, so it is safe
# to apply before the train/test split)
data = pd.get_dummies(data, columns=["Category", "Sub-Category",
                                     "Country", "Region",
                                     "Ship Mode", "Segment"], dtype=int)

# Split data BEFORE any transform that learns from the target or from feature
# statistics; otherwise information from the test rows leaks into training.
# Note: X still holds the raw (string) State / City / Product Name columns here.
y = data["Profit"]
X = data.drop(["Profit", "Order Date", "Ship Date"], axis=1)

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Independent copies so the in-place column replacement below is warning-free
x_train = x_train.copy()
x_test = x_test.copy()

# Target encoding for high cardinality features, learned from training rows only.
# Categories that never occur in the training split map to NaN and are filled
# with the overall training-set mean profit.
train_profit_mean = y_train.mean()
for col in ["State", "City", "Product Name"]:
    mean_profit = y_train.groupby(x_train[col]).mean()
    x_test[col] = x_test[col].map(mean_profit).fillna(train_profit_mean)
    x_train[col] = x_train[col].map(mean_profit)

# Scaling: fit on the training split only, then apply the same transform to test.
# Both results are NumPy arrays; y_train / y_test remain pandas Series.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

Models and Results

In [None]:
# Candidate regressors, keyed by the label used in the results table and plot titles
models = {
    "RandomForestRegressor": RandomForestRegressor(random_state=42),
    "GradientBoostingRegressor": GradientBoostingRegressor(random_state=42),
    "LinearRegression": LinearRegression(),
    "DecisionTreeRegressor": DecisionTreeRegressor(random_state=42),
    "KNeighborsRegressor": KNeighborsRegressor()
}

results = []


def _score_model(label, estimator):
    """Fit `estimator` on the training split; return (metrics row, test predictions).

    The row holds R2 on train and test, plus MAE and RMSE on the test split.
    """
    estimator.fit(x_train, y_train)
    fitted_train = estimator.predict(x_train)
    fitted_test = estimator.predict(x_test)
    row = {
        "Model": label,
        "Train R2": r2_score(y_train, fitted_train),
        "Test R2": r2_score(y_test, fitted_test),
        "MAE": mean_absolute_error(y_test, fitted_test),
        "RMSE": np.sqrt(mean_squared_error(y_test, fitted_test)),
    }
    return row, fitted_test


def _plot_actual_vs_predicted(label, predicted):
    """Scatter test-set predictions against actual profit, with a dashed y = x reference line."""
    plt.figure(figsize=(6, 4))
    plt.scatter(y_test, predicted, alpha=0.5)
    plt.xlabel("Actual Profit")
    plt.ylabel("Predicted Profit")
    plt.title(f"{label} - Actual vs Predicted")
    lowest, highest = y_test.min(), y_test.max()
    plt.plot([lowest, highest], [lowest, highest], 'r--')
    plt.show()


# Training + Evaluation + Visualization, one model at a time
for name, model in models.items():
    metrics_row, y_pred_test = _score_model(name, model)
    results.append(metrics_row)
    _plot_actual_vs_predicted(name, y_pred_test)

# Results table, best test R2 first
results_data = pd.DataFrame(results)
ranked = results_data.sort_values(by="Test R2", ascending=False)
print(ranked)

In [None]:
import joblib

# Persist the already-fitted estimator with the highest test R2. Constructing a
# fresh RandomForestRegressor() here would pickle an unfitted model that cannot
# predict after loading.
best_name = results_data.loc[results_data["Test R2"].idxmax(), "Model"]
best_model = models[best_name]
joblib.dump(best_model, "super_store_model.pkl")

# Persist the scaler used on the features so new inputs can be scaled the same way.
# NOTE(review): inference also needs the same feature engineering (dummy columns
# and target-encoding maps), which this cell does not persist.
joblib.dump(scaler, "scaler.pkl")

