In [None]:
#import

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor



from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error


In [None]:
# Read the Superstore sample CSV from the working directory.
# The file is decoded as latin1.
data = pd.read_csv(
    "Sample - Superstore.csv",
    encoding="latin1",
)

In [None]:
# Show the dtype pandas assigned to each column of the loaded frame.
data.dtypes

In [None]:
# Bar chart: total Sales summed within each Category.
sales_by_category = data.groupby('Category')['Sales'].sum()

fig, ax = plt.subplots(figsize=(6, 4))
sales_by_category.plot(kind='bar', ax=ax)
ax.set_title('Total Sales by Category')
ax.set_xlabel('Category')
ax.set_ylabel('Sales($)')
plt.show()


In [None]:
# Remove outliers: keep only the rows whose Profit lies strictly between the
# 1st and 99th percentiles of Profit.
# NOTE: `data` is overwritten, so every later cell sees the trimmed frame and
# re-running this cell trims the already-trimmed frame again.
profit_low = data['Profit'].quantile(0.01)
profit_high = data['Profit'].quantile(0.99)

# .copy() gives the filtered frame its own data; later cells assign new columns
# to `data`, which would otherwise trigger pandas' SettingWithCopyWarning.
data = data[(data['Profit'] > profit_low) &
            (data['Profit'] < profit_high)].copy()


In [None]:
# Pie chart: each Region's share of total Sales, labelled with one-decimal percentages.
sales_by_region = data.groupby('Region')['Sales'].sum()

fig, ax = plt.subplots(figsize=(6, 6))
sales_by_region.plot(kind="pie", autopct='%1.1f%%', ax=ax)
ax.set_title('Total Sales by Region')
ax.set_ylabel('')  # blank out the y-axis label on the pie
plt.show()

In [None]:
# Parse the order dates so they can be bucketed by calendar month.
data['Order Date'] = pd.to_datetime(data['Order Date'])

# Sum Profit per month, then convert the period index to timestamps.
order_month = data['Order Date'].dt.to_period('M')
monthly_profit = data.groupby(order_month)['Profit'].sum()
monthly_profit.index = monthly_profit.index.to_timestamp()

# Line chart of the monthly totals.
fig, ax = plt.subplots(figsize=(10, 5))
monthly_profit.plot(ax=ax)
ax.set_title('Monthly Profit Over Time')
ax.set_xlabel('Month')
ax.set_ylabel('Profit')
ax.grid(True)
plt.show()

In [None]:
# Cross-tabulate summed Profit: one row per Category, one column per Region.
pivot_table = data.pivot_table(
    values='Profit',
    index='Category',
    columns='Region',
    aggfunc='sum',
)

# Annotated heatmap of the table; cell values are shown without decimals.
ax = sns.heatmap(pivot_table, annot=True, fmt='.0f', cmap='YlGnBu')
ax.set_title('Profit by Category and Region')
plt.show()


In [None]:
# EDA: print the summary produced by DataFrame.info() for the current frame.
data.info()

In [None]:
# Count the missing values in each column and print the per-column totals.
missing_per_column = data.isna().sum()
print(missing_per_column)

In [None]:
# Data cleaning: list the column labels currently present in the frame.
data.columns

In [None]:

# Columns removed from the frame before encoding and modelling.
columns_to_drop = [
    'Row ID',
    'Order ID',
    'Customer ID',
    'Customer Name',
    'Postal Code',
    'Product ID',
]
data = data.drop(columns=columns_to_drop)

In [None]:
# Display the current state of the frame as the cell output.
data

In [None]:

# Columns to one-hot encode. The list order is kept as-is because it
# determines the order of the generated indicator columns.
categorical_columns = [
    "Category",
    "Sub-Category",
    'Country',
    'Region',
    'State',
    'City',
    'Ship Mode',
    'Segment',
    "Product Name",
]

# Replace each listed column with integer 0/1 indicator columns.
data = pd.get_dummies(data, columns=categorical_columns, dtype=int)

data.head()


In [None]:

# Parse both date columns so the .dt accessor is available below.
for date_column in ("Order Date", "Ship Date"):
    data[date_column] = pd.to_datetime(data[date_column])

# Calendar parts of the order date.
order_dt = data["Order Date"].dt
data["Order_Year"] = order_dt.year
data["Order_Month"] = order_dt.month
data["Order_Day"] = order_dt.day
data["Order_DayOfWeek"] = order_dt.dayofweek  # 0 = Monday
data["Order_Quarter"] = order_dt.quarter

# Whole days elapsed between placing the order and shipping it.
shipping_delta = data["Ship Date"] - data["Order Date"]
data["Shipping_Days"] = shipping_delta.dt.days

# Cyclical (sin/cos) encoding: map month (period 12) and day of week
# (period 7) onto the unit circle.
month_angle = 2 * np.pi * data["Order_Month"] / 12
data["Month_sin"] = np.sin(month_angle)
data["Month_cos"] = np.cos(month_angle)

weekday_angle = 2 * np.pi * data["Order_DayOfWeek"] / 7
data["DayOfWeek_sin"] = np.sin(weekday_angle)
data["DayOfWeek_cos"] = np.cos(weekday_angle)



In [None]:
# Preview the first rows of the frame (DataFrame.head() default row count).
data.head()

In [None]:
# Target: the Profit column.
y = data["Profit"]

# Features: every column except the target and the two raw date columns.
# "Order Date" and "Ship Date" are datetime64 after the feature-engineering
# cell; StandardScaler cannot convert them to floats and raises when they are
# left in. The Order_* / Shipping_Days / *_sin / *_cos columns derived from
# them stay in the feature matrix.
x = data.drop(columns=["Profit", "Order Date", "Ship Date"])



In [None]:

# Hold out 20% of the rows for testing; random_state fixes the shuffle.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply that same transform to
# the test rows. Fitting on the full dataset before splitting would let the
# test rows' mean/variance influence the training features (data leakage).
# NOTE: `x` itself is left unscaled; use x_train / x_test for scaled features.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train_raw)
x_test = scaler.transform(x_test_raw)

# Report the shapes of all four pieces of the split.
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)


# Candidate regressors, in the order they will be fitted. The estimators that
# accept a seed are given random_state=42.
candidate_estimators = [
    RandomForestRegressor(random_state=42),
    GradientBoostingRegressor(random_state=42),
    LinearRegression(),
    DecisionTreeRegressor(random_state=42),
    KNeighborsRegressor(),
]

# Key each estimator by its class name, e.g. "LinearRegression".
models = {type(estimator).__name__: estimator for estimator in candidate_estimators}


# Fit each candidate on the training split and record one row of scores per model.
results = []

for name, model in models.items():
    model.fit(x_train, y_train)

    # Predictions on both splits: the train score is kept alongside the test
    # score in the results table.
    y_pred_train, y_pred_test = model.predict(x_train), model.predict(x_test)

    train_r2 = r2_score(y_train, y_pred_train)
    test_r2 = r2_score(y_test, y_pred_test)
    mae = mean_absolute_error(y_test, y_pred_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))  # RMSE = sqrt(MSE)

    row = {"Model": name}
    row["Train R2"] = train_r2
    row["Test R2"] = test_r2
    row["MAE"] = mae
    row["RMSE"] = rmse
    results.append(row)

# One row per model, printed with the highest test R2 first.
results_data = pd.DataFrame(results)
ranked_results = results_data.sort_values(by="Test R2", ascending=False)
print(ranked_results)