# Import Required Libraries
Import necessary libraries such as pandas, numpy, matplotlib, and seaborn.

In [None]:
# Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Configure seaborn once, up front, so that every figure drawn later in the
# notebook is rendered with the same "whitegrid" style.
sns.set(style="whitegrid")

# Load Datasets
Load the transactions, products, and customers datasets into pandas DataFrames.

In [None]:
# Load Datasets

# Each source table lives in its own CSV file in the working directory and is
# read into a DataFrame of the same name.
transactions = pd.read_csv('transactions.csv')
products = pd.read_csv('products.csv')
customers = pd.read_csv('customers.csv')

# Sanity check: show a heading followed by the first rows of every table.
print("Transactions Dataset:", transactions.head(), sep="\n")
print("\nProducts Dataset:", products.head(), sep="\n")
print("\nCustomers Dataset:", customers.head(), sep="\n")

# Data Cleaning
Handle missing values, correct data types, and remove duplicates if any.

In [None]:
# Data Cleaning

# Report the number of missing values per column BEFORE anything is dropped,
# so the impact of the clean-up below can be judged from the output.
print("Missing values in transactions dataset:")
print(transactions.isnull().sum())

print("\nMissing values in products dataset:")
print(products.isnull().sum())

print("\nMissing values in customers dataset:")
print(customers.isnull().sum())

# Drop every row that has a missing value in any column. The frames are
# modified in place so the names used by the rest of the notebook keep
# pointing at the cleaned data.
transactions.dropna(inplace=True)
products.dropna(inplace=True)
customers.dropna(inplace=True)

# Correct data types: parse the date columns into datetime64 values so that
# date-based operations (the .dt accessor, averaging) work on them.
transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'])
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])

# Remove rows that are exact duplicates of an earlier row.
transactions.drop_duplicates(inplace=True)
products.drop_duplicates(inplace=True)
customers.drop_duplicates(inplace=True)

# Verify data cleaning.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line
# after every report.
print("\nTransactions Dataset after cleaning:")
transactions.info()

print("\nProducts Dataset after cleaning:")
products.info()

print("\nCustomers Dataset after cleaning:")
customers.info()

# Summary Statistics
Generate summary statistics for the datasets to understand the basic metrics.

In [None]:
# Summary Statistics

# Default describe() report for each of the three tables, each preceded by
# its heading.
for heading, table in (
    ("Summary Statistics for Transactions Dataset:", transactions),
    ("\nSummary Statistics for Products Dataset:", products),
    ("\nSummary Statistics for Customers Dataset:", customers),
):
    print(heading)
    print(table.describe())

# Second pass restricted to object-dtype columns of the products and
# customers tables.
for heading, table in (
    ("\nSummary Statistics for Categorical Columns in Products Dataset:", products),
    ("\nSummary Statistics for Categorical Columns in Customers Dataset:", customers),
):
    print(heading)
    print(table.describe(include=['object']))

# Distribution of Transactions Over Time
Analyze the distribution of transactions over different time periods (e.g., months, quarters).

In [None]:
# Distribution of Transactions Over Time

# Tag every transaction with the calendar month (a monthly pandas Period)
# that its TransactionDate falls in.
transactions['YearMonth'] = transactions['TransactionDate'].dt.to_period('M')

# Number of transactions recorded in each month.
monthly_transactions = transactions.groupby('YearMonth').size()

# Bar chart of the monthly counts, drawn on a fresh 12x6 figure and labelled
# through the Axes object returned by the pandas plotting call.
plt.figure(figsize=(12, 6))
ax = monthly_transactions.plot.bar(color='skyblue')
ax.set_title('Distribution of Transactions Over Time')
ax.set_xlabel('Year-Month')
ax.set_ylabel('Number of Transactions')
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

# Top Selling Products
Identify the top-selling products based on the total sales value and quantity sold.

In [None]:
# Top Selling Products

# Attach the product attributes to every transaction row.
merged_data = transactions.merge(products, on='ProductID')

# Total sales value and total quantity per product.
product_sales = (
    merged_data
    .groupby('ProductID')[['TotalValue', 'Quantity']]
    .sum()
    .reset_index()
)

# Bring in the readable product name for each ProductID.
product_sales = product_sales.merge(
    products[['ProductID', 'ProductName']],
    on='ProductID',
)

# Ten best products under each of the two rankings.
top_selling_by_value = product_sales.sort_values(by='TotalValue', ascending=False).iloc[:10]
top_selling_by_quantity = product_sales.sort_values(by='Quantity', ascending=False).iloc[:10]

# Horizontal bar chart: top products ranked by total sales value.
plt.figure(figsize=(12, 6))
ax = sns.barplot(data=top_selling_by_value, x='TotalValue', y='ProductName', palette='viridis')
ax.set_title('Top 10 Selling Products by Total Sales Value')
ax.set_xlabel('Total Sales Value')
ax.set_ylabel('Product Name')
plt.show()

# Horizontal bar chart: top products ranked by quantity sold.
plt.figure(figsize=(12, 6))
ax = sns.barplot(data=top_selling_by_quantity, x='Quantity', y='ProductName', palette='viridis')
ax.set_title('Top 10 Selling Products by Quantity Sold')
ax.set_xlabel('Quantity Sold')
ax.set_ylabel('Product Name')
plt.show()

# Customer Demographics
Analyze customer demographics such as region and signup date.

In [None]:
# Customer Demographics

# Where the customers are (Region) and when they joined (SignupDate).

# Customers per region, with the regions ordered by their value_counts()
# ranking.
plt.figure(figsize=(10, 6))
region_order = customers['Region'].value_counts().index
ax = sns.countplot(data=customers, y='Region', order=region_order, palette='pastel')
ax.set_title('Distribution of Customers by Region')
ax.set_xlabel('Number of Customers')
ax.set_ylabel('Region')
plt.show()

# Signups per calendar month: tag each customer with the month of their
# SignupDate, then count the customers in every month.
customers['SignupYearMonth'] = customers['SignupDate'].dt.to_period('M')
signup_trend = customers.groupby('SignupYearMonth').size()

plt.figure(figsize=(12, 6))
ax = signup_trend.plot.bar(color='lightcoral')
ax.set_title('Customer Signups Over Time')
ax.set_xlabel('Year-Month')
ax.set_ylabel('Number of Signups')
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

# Table with one row per region and the number of customers who signed up
# there.
signup_by_region = (
    customers
    .groupby('Region')
    .size()
    .rename('Number of Signups')
    .reset_index()
)
print("Summary Statistics for Customer Signups by Region:")
print(signup_by_region)

# Average Signup Date by Region
# Mean of the SignupDate timestamps within each region, returned as a
# two-column frame (Region, SignupDate).
avg_signup_date_by_region = customers.groupby('Region')['SignupDate'].mean().reset_index()
print("\nAverage Signup Date by Region:")
print(avg_signup_date_by_region)

# Plot Average Signup Date by Region
# A bar chart measures every bar from zero, which has no meaning for calendar
# dates: the bars would either all look the same length or fail to draw,
# depending on the seaborn version. Each region's average date is therefore
# drawn as a single marker on a date axis, which makes the differences between
# regions readable.
plt.figure(figsize=(10, 6))
sns.scatterplot(
    x='SignupDate',
    y='Region',
    data=avg_signup_date_by_region,
    hue='Region',        # one colour per region, taken from the palette
    palette='coolwarm',
    s=150,               # large markers: there is only one point per region
    legend=False,        # the y axis already names every region
)
plt.title('Average Signup Date by Region')
plt.xlabel('Average Signup Date')
plt.ylabel('Region')
plt.show()

# Sales by Region
Analyze sales distribution across different regions.

In [None]:
# Sales by Region

# Attach the customer attributes (including Region) to every transaction row.
transactions_customers = transactions.merge(customers, on='CustomerID')

# Total sales value and total quantity per region.
sales_by_region = (
    transactions_customers
    .groupby('Region')[['TotalValue', 'Quantity']]
    .sum()
    .reset_index()
)

# Horizontal bar chart: total sales value per region.
plt.figure(figsize=(12, 6))
ax = sns.barplot(data=sales_by_region, x='TotalValue', y='Region', palette='magma')
ax.set_title('Total Sales Value by Region')
ax.set_xlabel('Total Sales Value')
ax.set_ylabel('Region')
plt.show()

# Horizontal bar chart: quantity sold per region.
plt.figure(figsize=(12, 6))
ax = sns.barplot(data=sales_by_region, x='Quantity', y='Region', palette='magma')
ax.set_title('Quantity Sold by Region')
ax.set_xlabel('Quantity Sold')
ax.set_ylabel('Region')
plt.show()

# Print the underlying numbers as well.
print("Sales by Region:")
print(sales_by_region)

# Derive Business Insights
Derive at least 5 business insights from the EDA and write them in short point-wise sentences.

In [None]:
# Derive Business Insights

# Each insight below is a small table built from results computed earlier in
# the notebook and printed under its own heading.

# Insight 1: the top products ranked by total sales value.
top_selling_by_value_insight = top_selling_by_value.loc[:, ['ProductName', 'TotalValue']]
print("Top Selling Products by Total Sales Value:")
print(top_selling_by_value_insight)

# Insight 2: the top products ranked by quantity sold.
top_selling_by_quantity_insight = top_selling_by_quantity.loc[:, ['ProductName', 'Quantity']]
print("\nTop Selling Products by Quantity Sold:")
print(top_selling_by_quantity_insight)

# Insight 3: number of customers in each region.
customer_distribution_by_region = (
    customers['Region']
    .value_counts()
    .rename_axis('Region')
    .reset_index(name='Number of Customers')
)
print("\nDistribution of Customers by Region:")
print(customer_distribution_by_region)

# Insight 4: number of customer signups in each month.
customer_signups_over_time = (
    signup_trend
    .rename_axis('SignupYearMonth')
    .reset_index(name='Number of Signups')
)
print("\nCustomer Signups Over Time:")
print(customer_signups_over_time)

# Insight 5: total sales value and quantity sold in each region.
sales_by_region_insight = sales_by_region.loc[:, ['Region', 'TotalValue', 'Quantity']]
print("\nSales by Region:")
print(sales_by_region_insight)