In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.metrics.pairwise import cosine_similarity
import joblib
import shap

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load & Inspect Data
df = pd.read_csv("sales_.csv", encoding="ISO-8859-1")
df.info()
df.head()

# Data Cleaning
df['ORDERDATE'] = pd.to_datetime(df['ORDERDATE'], errors='coerce')

# Fill numeric missing values
numeric_cols = ['TEMPERATURE', 'TREND_SCORE', 'DISCOUNT_PERCENT']
for col in numeric_cols:
    df[col] = df[col].fillna(df[col].mean())

# Fill categorical missing values
categorical_cols = ['PRODUCTLINE', 'CATEGORY', 'SUBCATEGORY', 'DEALSIZE',
                    'COUNTRY', 'CITY', 'WEATHER', 'SEASON',
                    'USER_AGE_GROUP', 'USER_GENDER', 'DEVICE_TYPE']

for col in categorical_cols:
    df[col] = df[col].fillna('Unknown')

# Label encoding
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2823 entries, 0 to 2822
Data columns (total 37 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ORDERNUMBER       2823 non-null   int64  
 1   QUANTITYORDERED   2823 non-null   int64  
 2   PRICEEACH         2823 non-null   float64
 3   ORDERLINENUMBER   2823 non-null   int64  
 4   SALES             2823 non-null   float64
 5   ORDERDATE         2823 non-null   object 
 6   STATUS            2823 non-null   object 
 7   QTR_ID            2823 non-null   int64  
 8   MONTH_ID          2823 non-null   int64  
 9   YEAR_ID           2823 non-null   int64  
 10  PRODUCTLINE       2823 non-null   object 
 11  MSRP              2823 non-null   int64  
 12  PRODUCTCODE       2823 non-null   object 
 13  CUSTOMERNAME      2823 non-null   object 
 14  PHONE             2823 non-null   object 
 15  ADDRESSLINE1      2823 non-null   object 
 16  ADDRESSLINE2      302 non-null    object 


In [3]:
# Feature Engineering
df['MONTH'] = df['ORDERDATE'].dt.month
df['DAY_OF_WEEK'] = df['ORDERDATE'].dt.dayofweek
df['IS_WEEKEND'] = df['DAY_OF_WEEK'].isin([5,6]).astype(int)

# Optional: user behavior features (if user-level data available)
# df['USER_PURCHASE_FREQ'] = ...
# df['USER_RECENCY'] = ...

# Train/Test Split
feature_cols = ['PRODUCTLINE', 'CATEGORY', 'SUBCATEGORY', 'DEALSIZE',
                'COUNTRY', 'CITY', 'WEATHER', 'SEASON', 'TEMPERATURE',
                'TREND_SCORE', 'IS_EVENT_DAY', 'DISCOUNT_PERCENT',
                'USER_AGE_GROUP', 'USER_GENDER', 'DEVICE_TYPE',
                'MONTH', 'DAY_OF_WEEK', 'IS_WEEKEND']

X = df[feature_cols]
y = df['SALES']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
#  ML Pipeline
numeric_features = ['TEMPERATURE', 'TREND_SCORE', 'DISCOUNT_PERCENT','MONTH','DAY_OF_WEEK','IS_WEEKEND']
categorical_features = [col for col in feature_cols if col not in numeric_features]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=300, random_state=42))
])

# Train model
model_pipeline.fit(X_train, y_train)


0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,300
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [5]:

# # # Evaluation
# y_pred = model_pipeline.predict(X_test)
# print(f"MSE: {mean_squared_error(y_test, y_pred):.2f}")
# print(f"MAE: {mean_absolute_error(y_test, y_pred):.2f}")
# print(f"R² Score: {r2_score(y_test, y_pred):.2f}")

# # Feature Importance (Numeric Only Approximation)
# numeric_importances = model_pipeline.named_steps['regressor'].feature_importances_[:len(numeric_features)]
# plt.figure(figsize=(6,4))
# sns.barplot(x=numeric_importances, y=numeric_features, palette='viridis')
# plt.title("Approx. Feature Importance (Numeric Features)")
# plt.show()

# # Optional: SHAP for full feature importance
# explainer = shap.TreeExplainer(model_pipeline.named_steps['regressor'])
# # Sample 100 rows
# X_sample = X_test.sample(min(100, len(X_test)), random_state=42)
# # Create SHAP explainer on the whole pipeline
# explainer = shap.Explainer(model_pipeline.predict, X_sample)
# # Compute SHAP values
# shap_values = explainer(X_sample)
# # Plot
# shap.summary_plot(shap_values, X_sample)


In [6]:
# Predict New Scenario
new_data = pd.DataFrame({
    'PRODUCTLINE': [1],
    'CATEGORY': [2],
    'SUBCATEGORY': [2],
    'DEALSIZE': [1],
    'COUNTRY': [5],
    'CITY': [10],
    'WEATHER': [3],
    'SEASON': [2],
    'TEMPERATURE': [28],
    'TREND_SCORE': [0.75],
    'IS_EVENT_DAY': [1],
    'DISCOUNT_PERCENT': [20],
    'USER_AGE_GROUP': [2],
    'USER_GENDER': [1],
    'DEVICE_TYPE': [0],
    'MONTH':[7],
    'DAY_OF_WEEK':[2],
    'IS_WEEKEND':[0]
})
predicted_sales = model_pipeline.predict(new_data)
print(f"Predicted Sales: ${predicted_sales[0]:.2f}")

Predicted Sales: $4019.04


In [7]:
# ML-Based Recommendations
# Compute product feature vectors
product_features = df.groupby('PRODUCTLINE')[['CATEGORY','SUBCATEGORY','SEASON','WEATHER','TREND_SCORE','DISCOUNT_PERCENT']].mean()
similarity_matrix = cosine_similarity(product_features)
similarity_df = pd.DataFrame(similarity_matrix, index=product_features.index, columns=product_features.index)

def hybrid_recommendations(product_line, scenario_features, top_n=5):
    """Hybrid: similarity + predicted sales"""
    if product_line not in similarity_df.index:
        product_line = similarity_df.index[0]
    sim_scores = similarity_df[product_line]
    user_df = pd.DataFrame([scenario_features]*len(sim_scores))
    predicted_sales_all = model_pipeline.predict(user_df)
    combined_score = sim_scores * predicted_sales_all
    top_products = combined_score.sort_values(ascending=False).head(top_n).index.tolist()
    return top_products

female_code = label_encoders['USER_GENDER'].transform(['Female'])[0]
user_scenario = new_data.iloc[0].to_dict()
recommended_products = hybrid_recommendations(product_line=1, scenario_features=user_scenario, top_n=5)
print(f"Recommended Products: {recommended_products}")

Recommended Products: [1, 0, 2, 3, 5]


In [8]:
# Visualizations
# Monthly sales
monthly_sales = df.groupby('MONTH')['SALES'].sum().reset_index()
fig = px.line(monthly_sales, x='MONTH', y='SALES', title='Monthly Sales Trend', markers=True, line_shape='spline')
fig.show()

# Sales by Weather
weather_sales = df.groupby('WEATHER')['SALES'].sum().reset_index()
fig = px.bar(weather_sales, x='WEATHER', y='SALES', color='SALES', color_continuous_scale='Viridis', title='Sales by Weather')
fig.show()

# Top 10 cities
top_cities = df.groupby('CITY')['SALES'].sum().nlargest(10).reset_index()
fig = px.bar(top_cities, x='CITY', y='SALES', color='SALES', color_continuous_scale='Viridis', title='Top 10 Cities by Sales')
fig.show()

# Sales by Gender
gender_sales = df.groupby('USER_GENDER', as_index=False)['SALES'].sum()
gender_map = dict(zip(label_encoders['USER_GENDER'].transform(label_encoders['USER_GENDER'].classes_), 
                      label_encoders['USER_GENDER'].classes_))
gender_sales['USER_GENDER'] = gender_sales['USER_GENDER'].map(gender_map)
fig = px.bar(gender_sales, x='USER_GENDER', y='SALES', color='SALES', color_continuous_scale='Viridis', title='Sales by Gender')
fig.show()