In [238]:
import numpy as np 
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE, r2_score, mean_absolute_error as MAE

from sklearn.preprocessing import LabelEncoder, MinMaxScaler 
from sklearn.ensemble import RandomForestRegressor

import seaborn as sns 
import plotly.express as px 
from plotly.subplots import make_subplots
import plotly.graph_objects as go


In [239]:
df=pd.read_csv(r'./Electronic.csv')
df

Unnamed: 0,Age,Items Purchased,Total Spent,Discount (%),Satisfaction Score,Warranty Extension,Gender,Region,Product Category,Payment Method,Revenue,Store Rating,Loyalty Score,Membership Status,Preferred Visit Time
0,56,1,29.226195,47.077380,1.000000,1,Male,South,Accessories,UPI,149.252145,3.660461,3.597133,1,Evening
1,69,10,420.142612,7.985739,3.760294,1,Female,South,Accessories,Cash,1485.524222,3.551553,25.764903,1,Evening
2,46,4,127.742817,37.225718,1.771240,1,Male,East,Laptop,Credit Card,85.550131,3.922839,7.022399,1,Morning
3,32,9,417.722683,8.227732,1.926831,0,Female,East,Tablet,UPI,824.118724,3.860422,7.635412,1,Afternoon
4,60,13,608.031366,5.000000,3.902927,0,Female,South,Tablet,UPI,2463.590392,3.812820,29.461119,0,Morning
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,24,1,39.583865,46.041614,1.000000,0,Male,West,Television,Credit Card,-64.457793,4.187345,9.136240,0,Morning
4996,66,5,172.486538,32.751346,1.000000,0,Male,East,Accessories,Debit Card,138.772917,3.868131,1.961965,0,Evening
4997,26,11,493.388104,5.000000,2.324783,0,Male,North,Television,Net Banking,1010.219124,4.172183,16.750489,0,Evening
4998,53,10,260.269589,23.973041,5.000000,1,Female,West,Accessories,Debit Card,1114.585926,3.942221,25.562475,1,Afternoon


<h1 style="font-size:40px; color:#71C9CE; text-shadow: 2px 2px 2px black; background-color:#E3FDFD; border-radius:20px; padding:5px; text-align:center; font-weight:bolder;"> Data Handel</h1>

In [240]:
duplicated = df.duplicated().sum()
nulls = df.isnull().sum()
print("duplicated: ", duplicated)
print(f"NULLS:\n{nulls}", )

duplicated:  0
NULLS:
Age                     0
Items Purchased         0
Total Spent             0
Discount (%)            0
Satisfaction Score      0
Warranty Extension      0
Gender                  0
Region                  0
Product Category        0
Payment Method          0
Revenue                 0
Store Rating            0
Loyalty Score           0
Membership Status       0
Preferred Visit Time    0
dtype: int64


In [241]:
desc = df.describe().T  # transpose for better visualization

# Plot heatmap
fig = px.imshow(desc,
                text_auto=True,
                aspect="auto",
                title="Descriptive Statistics (df.describe()) Heatmap",
                labels=dict(color="Value"))
fig.show()

In [242]:
# Create a summary DataFrame
info_df = pd.DataFrame({
    'Column': df.columns,
    'Non-Null Count': df.notnull().sum(),
    'Data Type': df.dtypes.astype(str)
})

# Bar chart of non-null counts
fig = px.bar(info_df, x='Column', y='Non-Null Count', color='Data Type',
             title='Non-Null Counts and Data Types per Column',
             text='Data Type')
fig.update_layout(xaxis_tickangle=-45)
fig.show()

In [243]:
negative_columns = [
    col for col in df.select_dtypes(include='number').columns
    if (df[col] < 0).any()
]

# Display results
if negative_columns:
    print("Columns with negative values:")
    print(negative_columns)
else:
    print("No numeric columns contain negative values.")

Columns with negative values:
['Revenue']


In [244]:
df["Revenue"] = df["Revenue"].apply(lambda x: 0 if x < 0 else x)

In [245]:
df["Gender"] = df["Gender"].str.replace("Male", "ذكر").replace("Female", "أنثى")

<h1 style="font-size:40px; color:#71C9CE; text-shadow: 2px 2px 2px black; background-color:#E3FDFD; border-radius:20px; padding:5px; text-align:center; font-weight:bolder;"> EDA</h1>

In [246]:
# Select categorical-like columns with "low" cardinality
count_columns = [
    col for col in df.columns
    if df[col].nunique() <= (df.shape[0] / 10) and df[col].dtype == 'object'
]

for col in count_columns:
    value_counts = df[col].value_counts()
    if value_counts.empty:
        continue

    pie_data = value_counts.reset_index()
    pie_data.columns = [col, 'Count']
    
    fig = px.pie(pie_data, names=col, values='Count',
                 title=f'Distribution of {col}')
    fig.show()


In [247]:
# Define lighter color palettes
color_palettes = [
    px.colors.qualitative.Set1,
    px.colors.qualitative.Pastel,
    px.colors.qualitative.Vivid,
]

# Identify columns with low cardinality
count_columns = [
    col for col in df.columns
    if df[col].nunique() <= (df.shape[0] / 10) and df[col].dtype == 'object'
]

# Generate bar plots with lighter colors
for i, col in enumerate(count_columns):
    value_counts = df[col].value_counts()
    if value_counts.empty:
        continue

    bar_data = value_counts.reset_index()
    bar_data.columns = [col, 'Count']

    # Pick a lighter color palette (cycling through the list)
    color_sequence = color_palettes[i % len(color_palettes)]

    fig = px.bar(bar_data, x=col, y='Count',
                 title=f'Bar Plot of {col}',
                 text='Count',
                 color_discrete_sequence=color_sequence)

    fig.update_layout(xaxis_tickangle=-45)
    fig.show()

<h1 style="font-size:40px; color:#71C9CE; text-shadow: 2px 2px 2px black; background-color:#E3FDFD; border-radius:20px; padding:5px; text-align:center; font-weight:bolder;"> Categorical Conversion</h1>

In [248]:
categorical_cols = ['Gender', 'Region', 'Product Category', 'Payment Method', 'Preferred Visit Time']
LE = LabelEncoder()

for col in categorical_cols:
    df[col] = LE.fit_transform(df[col])

In [249]:
# Create a summary DataFrame
info_df = pd.DataFrame({
    'Column': df.columns,
    'Non-Null Count': df.notnull().sum(),
    'Data Type': df.dtypes.astype(str)
})

# Bar chart of non-null counts
fig = px.bar(info_df, x='Column', y='Non-Null Count', color='Data Type',
             title='Non-Null Counts and Data Types per Column',
             text='Data Type')
fig.update_layout(xaxis_tickangle=-45)
fig.show()

<h1 style="font-size:40px; color:#71C9CE; text-shadow: 2px 2px 2px black; background-color:#E3FDFD; border-radius:20px; padding:5px; text-align:center; font-weight:bolder;"> Remove Outliers</h1>

In [250]:
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Identify numeric columns
numeric_columns = df.select_dtypes(include='number').columns

# Create subplots with 3 columns per row
num_plots = len(numeric_columns)
rows = (num_plots // 3) + (num_plots % 3 > 0)  # Calculate required rows

# Create a subplot figure with a specific layout
fig = make_subplots(rows=rows, cols=3, subplot_titles=numeric_columns)

# Add box plots to each subplot
for i, col in enumerate(numeric_columns):
    row = i // 3 + 1  # Row position
    col_pos = i % 3 + 1  # Column position
    
    # Create the box plot
    box_fig = px.box(df, y=col)
    
    # Extract the box plot trace (box plot data)
    trace = box_fig.data[0]
    
    # Add the trace to the subplot
    fig.add_trace(trace, row=row, col=col_pos)

# Update layout for spacing and title
fig.update_layout(
    height=rows * 400,  # Adjust the height dynamically based on the number of rows
    title_text="Boxplots for Numeric Columns",
    showlegend=False
)

fig.show()


In [251]:
def remove_outliers_iqr(df, columns):
    df_clean = df.copy()

    for col in columns:
        Q1 = df_clean[col].quantile(0.25)
        Q3 = df_clean[col].quantile(0.75)
        IQR = Q3 - Q1

        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # Keep only the rows within the bounds
        df_clean = df_clean[(df_clean[col] >= lower_bound) & (df_clean[col] <= upper_bound)]

    return df_clean
outlier_cols = []
for col in df.columns:
    if len(df[col].value_counts().to_list()) > (df.shape[0] / 4): 
        outlier_cols.append(col)

grouped = remove_outliers_iqr(df, outlier_cols)

In [252]:
# Identify numeric columns
numeric_columns = df.select_dtypes(include='number').columns

# Create subplots with 3 columns per row
num_plots = len(numeric_columns)
rows = (num_plots // 3) + (num_plots % 3 > 0)  # Calculate required rows

# Create a subplot figure with a specific layout
fig = make_subplots(rows=rows, cols=3, subplot_titles=numeric_columns)

# Add box plots to each subplot
for i, col in enumerate(numeric_columns):
    row = i // 3 + 1  # Row position
    col_pos = i % 3 + 1  # Column position
    
    # Create the box plot
    box_fig = px.box(df, y=col)
    
    # Extract the box plot trace (box plot data)
    trace = box_fig.data[0]
    
    # Add the trace to the subplot
    fig.add_trace(trace, row=row, col=col_pos)

# Update layout for spacing and title
fig.update_layout(
    height=rows * 400,  # Adjust the height dynamically based on the number of rows
    title_text="Boxplots for Numeric Columns",
    showlegend=False
)

fig.show()


In [253]:
corr_matrix = df.corr(numeric_only=True)

# Plot the heatmap
fig = px.imshow(
    corr_matrix,
    text_auto='.2f',  # show values with 2 decimal points
    color_continuous_scale='RdBu_r',  # diverging color scale
    title='Correlation Heatmap'
)

fig.update_layout(width=800, height=800)
fig.show()

<h1 style="font-size:40px; color:#71C9CE; text-shadow: 2px 2px 2px black; background-color:#E3FDFD; border-radius:20px; padding:5px; text-align:center; font-weight:bolder;"> Data Normalization</h1>

In [254]:
scaler = MinMaxScaler()
dfsc = scaler.fit_transform(df)

# تحويل الناتج إلى DataFrame بنفس الأعمدة الأصلية
df = pd.DataFrame(dfsc, columns=df.columns)

<h1 style="font-size:40px; color:#71C9CE; text-shadow: 2px 2px 2px black; background-color:#E3FDFD; border-radius:20px; padding:5px; text-align:center; font-weight:bolder;">Data Splitting</h1>

In [255]:
# تحديد البيانات المستقلة X والمتغير الهدف y
X = df.drop(['Revenue'], axis=1)
y = df['Revenue']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

<h1 style="font-size:40px; color:#71C9CE; text-shadow: 2px 2px 2px black; background-color:#E3FDFD; border-radius:20px; padding:5px; text-align:center; font-weight:bolder;">ML Model</h1>

In [256]:
# إنشاء نموذج Random Forest
rf_model = RandomForestRegressor(n_estimators=200, max_depth=150) 
rf_model.fit(X_train, y_train)

<h1 style="font-size:40px; color:#71C9CE; text-shadow: 2px 2px 2px black; background-color:#E3FDFD; border-radius:20px; padding:5px; text-align:center; font-weight:bolder;">Model Prediction & Evaluation</h1>

In [261]:
# التنبؤ والتقييم
y_pred = rf_model.predict(X_test)
mse = MSE(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mae = MAE(y_test, y_pred)

print(f"Random Forest MSE: {mse:.2f}")
print(f"Random Forest MAE: {mae:.2f}")
print(f"Random Forest  R²: {r2:.2f}")


Random Forest MSE: 0.00
Random Forest MAE: 0.02
Random Forest  R²: 0.98


In [258]:
df_results = pd.DataFrame({
    'Actual': y_test,
    'Predicted': y_pred
})

fig = px.scatter(
    df_results,
    x='Actual',
    y='Predicted',
    title='XGBoost Regression: Actual vs Predicted',
    labels={'Actual': 'Actual Values', 'Predicted': 'Predicted Values'},
    opacity=0.6,
    trendline='ols'
)

fig.update_traces(marker=dict(color='royalblue'))  # Change to any CSS color name or hex

fig.update_layout(
    width=800,
    height=600,
    template='plotly_white'
)

fig.show()

