In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from fpdf import FPDF

# Load datasets
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

In [7]:
print(products.columns)
print(transactions.columns)

Index(['ProductID', 'ProductName', 'Category', 'Price'], dtype='object')
Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price'],
      dtype='object')


In [8]:


# Convert date columns to datetime
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'])

# Merge datasets and differentiate Price columns
merged_data = transactions.merge(customers, on='CustomerID')\
                          .merge(products, on='ProductID', suffixes=('_trans', '_prod'))

# Rename the 'Price' columns to avoid conflict
merged_data.rename(columns={'Price_prod': 'ProductPrice', 'Price_trans': 'TransactionPrice'}, inplace=True)

# Total sales by region (Total Sales Insight)
region_sales = merged_data.groupby('Region')['TotalValue'].sum().sort_values(ascending=False)
plt.figure(figsize=(10, 5))
sns.barplot(x=region_sales.index, y=region_sales.values, palette='viridis')
plt.xlabel("Region")
plt.ylabel("Total Sales (USD)")
plt.title("Total Sales by Region")
region_sales_img = 'region_sales.png'
plt.savefig(region_sales_img)  # Save plot as image
plt.close()

# Top-Selling Products (Insight 1: Product Preference)
top_products = merged_data.groupby('ProductName')['TotalValue'].sum().sort_values(ascending=False).head(10)
plt.figure(figsize=(10, 5))
sns.barplot(x=top_products.index, y=top_products.values, palette='coolwarm')
plt.xlabel("Product")
plt.ylabel("Total Sales (USD)")
plt.title("Top-Selling Products")
plt.xticks(rotation=45)
top_products_img = 'top_products.png'
plt.savefig(top_products_img)  # Save plot as image
plt.close()

# Seasonal Trends (Insight 2: Seasonal Demand)
monthly_sales = merged_data.groupby(merged_data['TransactionDate'].dt.to_period('M'))['TotalValue'].sum()
plt.figure(figsize=(10, 5))
monthly_sales.plot(kind='line', color='purple')
plt.xlabel("Month")
plt.ylabel("Total Sales (USD)")
plt.title("Seasonal Sales Trends")
plt.xticks(rotation=45)
seasonal_trends_img = 'seasonal_trends.png'
plt.savefig(seasonal_trends_img)  # Save plot as image
plt.close()

# Customer Loyalty (Insight 4: Loyalty Program)
customer_sales = merged_data.groupby('CustomerID')['TotalValue'].sum().sort_values(ascending=False)
plt.figure(figsize=(10, 5))
sns.histplot(customer_sales, kde=True, color='blue')
plt.xlabel("Total Sales per Customer (USD)")
plt.ylabel("Number of Customers")
plt.title("Customer Loyalty Distribution")
customer_loyalty_img = 'customer_loyalty.png'
plt.savefig(customer_loyalty_img)  # Save plot as image
plt.close()

# Pricing Strategy (Insight 5: Premium Products)
# Use 'ProductPrice' column after renaming
product_prices = merged_data.groupby('ProductName')['ProductPrice'].mean()  # Use 'ProductPrice' from the merged dataset
product_revenue = merged_data.groupby('ProductName')['TotalValue'].sum()

plt.figure(figsize=(10, 5))
sns.scatterplot(x=product_prices, y=product_revenue, color='green')
plt.xlabel("Product Price (USD)")
plt.ylabel("Total Revenue (USD)")
plt.title("Pricing Strategy: Price vs Revenue")
pricing_strategy_img = 'pricing_strategy.png'
plt.savefig(pricing_strategy_img)  # Save plot as image
plt.close()

# Insights
business_insights = [
    "1. **Top-Selling Products**: A few products dominate sales, indicating high customer preference.",
    "2. **Seasonal Trends**: Sales peak during specific months, suggesting seasonal demand.",
    "3. **Regional Performance**: Regions like 'North' and 'West' contribute the most to revenue, highlighting potential for targeted marketing.",
    "4. **Customer Loyalty**: A small group of customers makes frequent purchases, emphasizing the need for loyalty programs.",
    "5. **Pricing Strategy**: Higher-priced products drive significant revenue, indicating a focus on premium offerings.",
    "6. **Total Sales by Region**: The 'North' region leads with the highest sales, followed by 'West'. This suggests regional preferences or higher purchasing power in these areas."
]

# Create PDF
pdf = FPDF()
pdf.add_page()
pdf.set_font("Arial", size=12)

# Add title
pdf.cell(200, 10, txt="Exploratory Data Analysis Report", ln=True, align="C")

# Add insights
pdf.set_font("Arial", size=10)
for insight in business_insights:
    pdf.multi_cell(0, 10, txt=insight)

# Add diagrams to PDF in order of insights
pdf.add_page()
pdf.image(region_sales_img, x=10, y=10, w=180)
pdf.add_page()
pdf.image(top_products_img, x=10, y=10, w=180)
pdf.add_page()
pdf.image(seasonal_trends_img, x=10, y=10, w=180)
pdf.add_page()
pdf.image(customer_loyalty_img, x=10, y=10, w=180)
pdf.add_page()
pdf.image(pricing_strategy_img, x=10, y=10, w=180)

# Save PDF
pdf.output("Ayushman_Saxena_EDA.pdf")

print("PDF report generated successfully!")


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=region_sales.index, y=region_sales.values, palette='viridis')

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=top_products.index, y=top_products.values, palette='coolwarm')


PDF report generated successfully!
