In [None]:
# 🛒 Superstore Sales Analysis
### Exploratory Data Analysis (EDA)

This notebook performs a complete exploratory analysis on the Superstore Sales dataset, including:
- Data cleaning  
- Handling missing values  
- Time series analysis  
- Top-selling products  
- Regional performance  
- Sales vs Profit insights 

In [None]:
### 📌 Step 1: Import Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Apply seaborn's default theme to all Matplotlib figures in the notebook.
# The bare "seaborn" style-sheet name used previously was deprecated in
# Matplotlib 3.6 and later removed, so plt.style.use("seaborn") fails on
# current installs; sns.set_theme() gives the equivalent look via seaborn.
sns.set_theme()

In [None]:
### 📌 Step 2: Load Dataset

In [None]:
# Read the raw Superstore export into a DataFrame; the file is decoded as
# Latin-1 instead of pandas' default UTF-8.
sales_df = pd.read_csv(
    filepath_or_buffer='Superstore.csv',
    encoding='latin1',
)
# Show the first five rows as the cell output.
sales_df.head(n=5)

In [None]:
### 📌 Step 3: Check dataset info & shape

In [None]:
# A notebook cell only echoes its *last* bare expression, so the shape has to
# be printed explicitly; otherwise it is evaluated and silently discarded.
print("Shape (rows, columns):", sales_df.shape)

# Column names, non-null counts and dtypes (info() prints its own report).
sales_df.info()


In [None]:
### 📌 Step 4: Convert Date Columns

In [None]:
# Parse both date columns into datetime values; each column is converted
# independently by pd.to_datetime, exactly as a per-column call would.
date_columns = ['Order Date', 'Ship Date']
sales_df[date_columns] = sales_df[date_columns].apply(pd.to_datetime)


In [None]:
### 📌 Step 5: Missing Values Function

In [None]:
def missing_values(df):
    """Report which columns of ``df`` contain missing values.

    Prints one line per affected column, in column order, in the form
    ``"<column> has <count> missing values"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    list
        Names of the columns that have at least one missing value, in the
        order they appear in ``df.columns``.
    """
    # Count the nulls of every column once, keyed by column name.
    null_counts = {name: df[name].isna().sum() for name in df.columns}
    flagged = [name for name, count in null_counts.items() if count != 0]
    for name in flagged:
        print(f"{name} has {null_counts[name]} missing values")
    return flagged

# Print the per-column missing-value report for the sales data; the returned
# list of affected column names becomes the cell's output.
missing_values(sales_df)


In [None]:
### 📌 Step 6: Filling Missing Row IDs

In [None]:
# Give every row whose 'Row ID' is missing a fresh sequential ID that continues
# after the largest existing ID, so new IDs cannot collide with existing ones.
missing = sales_df['Row ID'].isna()
missing_sum = int(missing.sum())

if missing_sum:
    # max() skips NaN and is itself NaN only when *every* Row ID is missing;
    # np.arange cannot build a range from NaN, so numbering then starts at 1.
    last_id = sales_df['Row ID'].max()
    first_new_id = 1 if pd.isna(last_id) else int(last_id) + 1
    sales_df.loc[missing, 'Row ID'] = np.arange(
        first_new_id,
        first_new_id + missing_sum
    )


In [None]:
### 📌 Step 7: Removing Duplicates

In [None]:
# Rows that repeat an earlier row in every column (the first occurrence is not
# counted as a duplicate).
duplicates = sales_df.loc[sales_df.duplicated()]
print("Duplicate rows:", duplicates.shape[0])

# Only rebuild the frame when there is something to drop.
if duplicates.shape[0] > 0:
    sales_df = sales_df.drop_duplicates(keep='first')


In [None]:
### Exploratory Data Analysis (EDA)

In [None]:
### 📌 Step 8: Set Date Index & Resampling

In [None]:
# Index the frame by order date so sales can be resampled by calendar period.
# The guard keeps the cell idempotent: once 'Order Date' is the index it is no
# longer a column, and an unguarded set_index would raise KeyError on re-run.
if sales_df.index.name != 'Order Date':
    sales_df = sales_df.set_index('Order Date')

# Total sales per period: 'D' = day, 'W' = week, 'ME' = month end,
# 'YE' = year end.
daily_sales = sales_df['Sales'].resample('D').sum()
weekly_sales = sales_df['Sales'].resample('W').sum()
monthly_sales = sales_df['Sales'].resample('ME').sum()
yearly_sales = sales_df['Sales'].resample('YE').sum()


In [None]:
### 📌 Step 9: Plot Yearly Sales

In [None]:
# yearly_sales is indexed by year-end timestamps (from resample('YE')).
# Plotting against those dates puts each total at 31 December, right next to
# the following year's tick, so the points are plotted against the plain year
# numbers instead and one tick is drawn per year.
years = yearly_sales.index.year

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(years, yearly_sales.values, marker='o', linewidth=2)
ax.set_xticks(years)
ax.set_title('Sales Per Year')
ax.set_xlabel('Year')
ax.set_ylabel('Sales')
ax.grid(True)
plt.show()


In [None]:
### 📌 Step 10: Top-Selling Products (with Plot)

In [None]:
# Total sales per product, largest first; keep the ten best sellers.
sales = sales_df.groupby('Product Name')['Sales'].sum().sort_values(ascending=False)
top_selling = sales.head(10)

# A bare expression in the middle of a cell is never displayed, so the table
# is printed explicitly before the chart is drawn.
print(top_selling)

fig, ax = plt.subplots(figsize=(10, 6))
top_selling.plot(kind='barh', ax=ax)
ax.set_xlabel('Total Sales')
ax.set_title('Top 10 Selling Products')
# barh draws the first row at the bottom; flip the axis so the best seller
# appears at the top.
ax.invert_yaxis()
plt.show()


In [None]:
### 📌 Step 11: Regional Sales Comparison

In [None]:
# Total sales per region, ordered from the smallest to the largest total.
regional_sales = sales_df.groupby('Region')['Sales'].sum().sort_values()

# One bar per region on an explicitly created axes.
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(regional_sales.index, regional_sales.values)
ax.set_title("Regional Sales Comparison")
ax.set_ylabel("Total Sales")
ax.grid(axis='y')
plt.show()

In [None]:

### 📌 Step 12: Heatmap (Region × Category)

In [None]:
# Total sales for every Region (rows) x Category (columns) combination.
pivot_table = sales_df.pivot_table(values='Sales', index='Region', columns='Category', aggfunc='sum')

# seaborn's default annotation format ('.2g') renders large totals in
# scientific notation (e.g. 2.5e+05); ',.0f' prints whole numbers with
# thousands separators instead.
sns.heatmap(pivot_table, annot=True, fmt=',.0f', cmap='YlGnBu')
plt.title('Sales by Region and Category')
plt.show()


In [None]:
### 📌 Step 13: Profit vs Sales Scatter Plot

In [None]:
# One small, semi-transparent marker per row, so dense regions of the
# Sales/Profit plane show up darker.
fig, ax = plt.subplots()
ax.scatter(x=sales_df['Sales'], y=sales_df['Profit'], s=10, alpha=0.3)
ax.set_xlabel('Sales')
ax.set_ylabel('Profit')
ax.set_title('Profit vs Sales')
plt.show()


In [None]:
### 📌 Step 14: Market Share Pie Chart

In [None]:
# Each category's total sales, then its percentage of overall sales.
category_sales = sales_df.groupby('Category')['Sales'].sum()
share = category_sales.div(category_sales.sum()).mul(100)

# Wedges are labelled with the category name and its share to two decimals.
fig, ax = plt.subplots()
ax.pie(share.values, labels=share.index, autopct='%.2f%%')
ax.set_title("Market Share by Category")
plt.show()


In [None]:
# 📈 Key Insights

- Technology category generated the highest total sales.
- West region contributed the highest revenue overall.
- Some products have high sales but low profit margins.
- Clear seasonal patterns are visible in sales trends.
- Regional performance differs widely — important for business decisions.
