In [74]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff

---

## 📊 Visualization Plan

---

### 1. Overview & Distribution
*Understand data range, distribution, and outliers.*

- **1.1 Histogram of `SellPrice` (with KDE)**  
  → Understand price distribution and skewness.

- **1.2 Boxplot of `SellPrice` (Raw + Capped)**  
  → Spot outliers and how extreme they are.

- **1.3 Histogram of Customer `Age`**  
  → See dominant age groups buying phones.

---

### 2. Customer Demographics
*Reveal who your customers are.*

- **2.1 Barplot: Customer Count by Location (Local vs Non-Local)**  
  → Where are your buyers from?

- **2.2 Barplot: Customer Count by Gender (Male vs Female)**  
  → Any gender skew?

- **2.3 Age vs Sell Price (Scatter or Strip Plot)**  
  → Any age-price buying pattern?

---

### 3. Sales by Product
*See how mobile models differ in price and popularity.*

- **3.1 Barplot: Count of Each Mobile Model Sold**  
  → What’s selling the most?

- **3.2 Barplot: Average Sell Price per Mobile Model**  
  → Premium vs budget product range?

---

### 4. Marketing Influence
*Understand if marketing sources impact sales.*

- **4.1 Barplot: Sell Price by `FromFacebookPage`**  
  → Did FB-marketed buyers spend more?

- **4.2 Barplot: Sell Price by `Follower` or `PreviousPurchase`**  
  → Do loyal followers or repeat customers spend more?

---

### 5. Correlation & Interactions
*Understand deeper patterns.*

- **5.1 Correlation Heatmap**  
  → Which features move together?

- **5.2 Pairplot (Optional, filtered on key features)**  
  → Visualize interactions between `Age`, `SellPrice`, location (`Non_Local`), and loyalty (`Follower`, `PreviousPurchase`).

---

### 6. Bonus: Time Trends *(if date data is available)*

- **6.1 Lineplot: Sell Price Over Time**

- **6.2 Barplot: Count of Sales Per Day**


---

In [91]:
# Load the encoded dataset
df = pd.read_csv('./data/processed/TechCorner_Sales_Outliers_Removed_Encoded.csv')

# 'Unnamed: 0' is presumably the index column written by an earlier to_csv call
# (TODO confirm). errors='ignore' keeps this cell working if the file is ever
# re-exported without that column instead of failing with a KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')

---

In [None]:
# Local vs Non-Local buyers (pie chart)
# Non_Local is treated as a 0/1 flag: 0 is labelled 'Local', 1 is labelled 'Non-Local'.
labels = ['Local', 'Non-Local']
values = [len(df.loc[df['Non_Local'] == flag]) for flag in (0, 1)]

fig = px.pie(names=labels, values=values, title='Local Vs Non-Local Buyers')
fig.show()

# Findings:
#   66% are local buyers

---

In [None]:
# --- Check for Sell Price outliers using the 1.5 * IQR rule

# Visual check first: a horizontal boxplot of the raw prices.
fig = px.box(df, x='SellPrice', title='Boxplot of Sell Prices')
fig.show()

# Quartiles and the fences derived from them.
sell_price = df['SellPrice']
Q1 = sell_price.quantile(0.25)
Q3 = sell_price.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# A row is an outlier when its price lies strictly outside [lower_bound, upper_bound].
below_fence = sell_price < lower_bound
above_fence = sell_price > upper_bound
outliers_sell_price = df[below_fence | above_fence]

print(f"Number of outliers in 'SellPrice' column: {len(outliers_sell_price)}")
print(f"Outliers in 'SellPrice' column:\n{outliers_sell_price[['Date', 'Age', 'SellPrice']]}\n")

# Share of flagged rows relative to the whole dataset, in percent.
outliers_percentage = len(outliers_sell_price) / len(df) * 100
print(f"Percentage of outliers in the dataset: {outliers_percentage:.2f}%\n")


Number of outliers in 'SellPrice' column: 180
Outliers in 'SellPrice' column:
            Date  Age  SellPrice
2     2024-01-06   42    36106.0
37    2024-01-08   45    36670.0
104   2024-01-10   24    36572.0
188   2024-02-06   30    36180.0
226   2024-02-08   27    36480.0
...          ...  ...        ...
7690  2025-09-03   50    36256.0
7721  2025-10-01   35    36500.0
7735  2025-10-02   39    36496.0
7790  2025-11-01   46    36383.0
7804  2025-11-02   41    36784.0

[180 rows x 3 columns]

Percentage of outliers in the dataset: 2.26%



---

In [75]:
# Sell Price vs Capped Sell Price distribution

# Clip SellPrice to the IQR fences. `lower_bound` / `upper_bound` are defined by
# the outlier-check cell, so that cell has to be run before this one.
df['SellPrice_Capped'] = df['SellPrice'].clip(lower=lower_bound, upper=upper_bound)

# (column, legend label, colour) for the two series that are overlaid.
price_series = [
    ('SellPrice', 'Original', 'skyblue'),
    ('SellPrice_Capped', 'Capped', 'orange'),
]

fig = go.Figure()

# Histograms are added first so the trace order stays: both histograms, then both KDEs.
# 'probability density' puts the bars on the same y-scale as the KDE curves.
for column, label, colour in price_series:
    fig.add_trace(go.Histogram(
        x=df[column],
        name=label,
        nbinsx=30,
        marker_color=colour,
        marker_line_color='black',
        marker_line_width=1,
        opacity=0.75,
        histnorm='probability density',
    ))

# KDE curves: create_distplot is used only to obtain the curve; its x/y values
# are copied into a Scatter trace on this figure.
for column, label, colour in price_series:
    kde = ff.create_distplot([df[column]], [label], show_hist=False, show_rug=False)
    curve = kde['data'][0]
    fig.add_trace(go.Scatter(
        x=curve['x'],
        y=curve['y'],
        mode='lines',
        line=dict(color=colour, width=2),
        name=f'{label} KDE',
        showlegend=True,
    ))

# Layout: overlaid bars on a white background with a dotted horizontal grid.
fig.update_layout(
    title='Distribution of Sell Prices (Original vs Capped)',
    xaxis_title='Sell Price',
    yaxis_title='Density',
    bargap=0.1,  # Gap between bars
    plot_bgcolor='white',
    barmode='overlay',
    yaxis=dict(
        showgrid=True,
        gridcolor='lightgray',
        gridwidth=1,
        griddash='dot',
    ),
)

fig.show()



---

In [76]:
# Sell Price (Capped) Over Time
# Plotly version of the earlier matplotlib line chart.

# Mean capped sell price for each distinct value of 'Date'.
price_trend = df.groupby('Date')['SellPrice_Capped'].mean()

# Single line+marker trace of the per-date averages.
fig = go.Figure(data=[
    go.Scatter(
        x=price_trend.index,
        y=price_trend.values,
        mode='lines+markers',
        line=dict(color='mediumseagreen', width=2),
        marker=dict(size=6),
        name='Average Sell Price',
    )
])

# Titles, background, hover behaviour and figure size in one layout update.
fig.update_layout(
    title='Average Sell Price Over Time',
    xaxis_title='Date',
    yaxis_title='Average Sell Price',
    plot_bgcolor='white',
    hovermode='x unified',
    margin=dict(l=20, r=20, t=40, b=80),
    autosize=True,
    width=900,   # stands in for figsize=(12, 4) of the matplotlib version
    height=400,
)

# Same dotted light-gray grid on both axes; x tick labels are rotated.
dotted_grid = dict(showgrid=True, gridcolor='lightgray', gridwidth=1, griddash='dot')
fig.update_xaxes(tickangle=-45, **dotted_grid)
fig.update_yaxes(**dotted_grid)

fig.show()

---

In [80]:
# Average Sell Price by Location
# Plotly version of the earlier seaborn bar chart.

# Mean capped price per Non_Local flag, with the flags 0/1 relabelled for display.
location_names = {0: 'Local', 1: 'Non-Local'}
avg_price_by_location = (
    df.groupby('Non_Local')['SellPrice_Capped']
    .mean()
    .rename(location_names)
)

# One bar per customer type.
fig = go.Figure(data=[
    go.Bar(
        x=avg_price_by_location.index,
        y=avg_price_by_location.values,
        marker_color=['lightblue', 'lightcoral'],  # pastel-like colours
        marker_line_color='black',
        marker_line_width=1,
        opacity=0.8,
        name='',
    )
])

# Titles, background, bar spacing and figure size in one layout update.
fig.update_layout(
    title='Avg Sell Price: Local vs Non-Local',
    xaxis_title='Customer Type',
    yaxis_title='Avg Sell Price',
    plot_bgcolor='white',
    showlegend=False,
    bargap=0.3,
    width=600,   # stands in for figsize=(6, 4) of the seaborn version
    height=400,
    margin=dict(l=20, r=20, t=40, b=20),
)

# Dotted light-gray horizontal grid lines.
fig.update_yaxes(showgrid=True, gridcolor='lightgray', gridwidth=1, griddash='dot')

fig.show()

---

In [81]:
# New vs Returning Customers
# Plotly version of the earlier seaborn countplot.

# Count each flag value explicitly so the counts always line up with the labels:
# PreviousPurchase == 0 -> 'New', PreviousPurchase == 1 -> 'Returning'.
# (value_counts().sort_index() would silently shift the labels if one of the
# two values were absent from the data.)
purchase_flags = [0, 1]
categories = ['New', 'Returning']
count_data = pd.Series(
    [int((df['PreviousPurchase'] == flag).sum()) for flag in purchase_flags],
    index=purchase_flags,
)

# Create the figure
fig = go.Figure()

# One bar per customer category
fig.add_trace(go.Bar(
    x=categories,
    y=count_data.values,
    marker_color=['#66c2a5', '#fc8d62'],  # first two Set2 palette colours
    marker_line_color='black',
    marker_line_width=1,
    opacity=0.8,
    name=''
))

# Titles, background, bar spacing and figure size
fig.update_layout(
    title='New vs Returning Customers',
    xaxis_title='',
    yaxis_title='Count',
    plot_bgcolor='white',
    showlegend=False,
    bargap=0.4,
    width=500,   # stands in for figsize=(5, 4) of the seaborn version
    height=400,
    margin=dict(l=20, r=20, t=40, b=20)
)

# Dotted light-gray horizontal grid lines
fig.update_yaxes(
    showgrid=True,
    gridcolor='lightgray',
    gridwidth=1,
    griddash='dot'
)

fig.show()


---

In [82]:
# Facebook vs Walk-ins
# Plotly version of the earlier seaborn countplot.

# Count each flag value explicitly so the counts always line up with the labels:
# FromFacebookPage == 0 -> 'Walk-in', FromFacebookPage == 1 -> 'From Facebook'.
# (value_counts().sort_index() would silently shift the labels if one of the
# two values were absent from the data.)
source_flags = [0, 1]
categories = ['Walk-in', 'From Facebook']
count_data = pd.Series(
    [int((df['FromFacebookPage'] == flag).sum()) for flag in source_flags],
    index=source_flags,
)

# Create the figure
fig = go.Figure()

# One bar per acquisition channel
fig.add_trace(go.Bar(
    x=categories,
    y=count_data.values,
    marker_color=['#e41a1c', '#377eb8'],  # first two Set1 palette colours
    marker_line_color='black',
    marker_line_width=1,
    opacity=0.8,
    name=''
))

# Titles, background, bar spacing and figure size
fig.update_layout(
    title='Facebook vs Walk-in Customers',
    xaxis_title='',
    yaxis_title='Customer Count',
    plot_bgcolor='white',
    showlegend=False,
    bargap=0.4,
    width=500,   # stands in for figsize=(5, 4) of the seaborn version
    height=400,
    margin=dict(l=20, r=20, t=40, b=20)
)

# Dotted light-gray horizontal grid lines
fig.update_yaxes(
    showgrid=True,
    gridcolor='lightgray',
    gridwidth=1,
    griddash='dot'
)

fig.show()


---

In [89]:
# Top 10 Best-Selling Mobile Models
# Plotly version of the earlier seaborn bar chart.

# Every column that is NOT listed here is treated as a per-model indicator column
# (assumes the models are one-hot encoded — TODO confirm against the encoding step).
# errors='ignore' lets the cell run even when a helper column such as
# 'SellPrice_Capped' has not been created yet in this kernel session.
non_model_columns = ['Date', 'Age', 'SellPrice', 'SellPrice_Capped',
                     'Non_Local', 'Male', 'FromFacebookPage',
                     'Follower', 'PreviousPurchase', 'HeardOfShopBefore']
mobile_counts = df.drop(columns=non_model_columns, errors='ignore')

# Column sums = units sold per model; keep the ten largest.
top_10 = mobile_counts.sum().sort_values(ascending=False).head(10)

# Take the colours from seaborn's real 'coolwarm' palette, one per bar.
# (The previous hand-written hex list was labelled as coolwarm but was a
# red-to-green ramp.)
seaborn_coolwarm = list(sns.color_palette('coolwarm', n_colors=len(top_10)).as_hex())

# Create the figure
fig = go.Figure()

# One bar per model, coloured along the coolwarm palette
fig.add_trace(go.Bar(
    x=top_10.index,
    y=top_10.values,
    marker_color=seaborn_coolwarm,
    marker_line_color='black',
    marker_line_width=1,
    opacity=0.8,
    name=''
))

# Titles, background, bar spacing and figure size
fig.update_layout(
    title='Top 10 Best-Selling Mobile Models',
    xaxis_title='',
    yaxis_title='Number Sold',
    plot_bgcolor='white',
    showlegend=False,
    bargap=0.2,
    width=800,
    height=400,
    margin=dict(l=20, r=20, t=60, b=120)  # Extra bottom margin for long model names
)

# Force a tick label under every bar and rotate the labels
fig.update_xaxes(
    tickangle=45,
    tickfont=dict(size=10),
    tickmode='array',
    tickvals=list(range(len(top_10.index))),
    ticktext=list(top_10.index)
)

# Dotted light-gray horizontal grid lines
fig.update_yaxes(
    showgrid=True,
    gridcolor='lightgray',
    gridwidth=1,
    griddash='dot'
)

fig.show()


---

In [90]:
# Sell Price by Gender
# Plotly version of the earlier seaborn boxplot.

# (value of the 'Male' flag, display name, colour) for each box.
gender_boxes = [
    (0, 'Female', '#4878cf'),  # muted blue
    (1, 'Male', '#d65f5f'),    # muted red
]

fig = go.Figure()

# One box per gender, built from the capped prices of the matching rows.
for male_flag, gender_name, gender_colour in gender_boxes:
    fig.add_trace(go.Box(
        y=df.loc[df['Male'] == male_flag, 'SellPrice_Capped'],
        name=gender_name,
        marker_color=gender_colour,
        line_color=gender_colour,
        boxpoints='outliers',
        jitter=0.3,
        pointpos=-1.8,
    ))

# Titles, background, grouping mode and figure size in one layout update.
fig.update_layout(
    title='Sell Price Distribution by Gender',
    xaxis_title='',
    yaxis_title='Sell Price',
    plot_bgcolor='white',
    showlegend=False,
    boxmode='group',
    width=600,   # stands in for figsize=(6, 4) of the seaborn version
    height=400,
    margin=dict(l=20, r=20, t=60, b=20),
)

# Explicit, unrotated tick labels for the two boxes.
fig.update_xaxes(tickvals=[0, 1], ticktext=['Female', 'Male'], tickangle=0)

# Dotted light-gray horizontal grid lines.
fig.update_yaxes(showgrid=True, gridcolor='lightgray', gridwidth=1, griddash='dot')

fig.show()