In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime
import os

# Set plotting style
plt.style.use('seaborn-whitegrid')
sns.set(font_scale=1.2)

# Define the data directory
data_dir = './nortwind'

# Function to read all CSV files
def read_csv_files(directory):
    dataframes = {}
    for file in os.listdir(directory):
        if file.endswith('.csv'):
            file_path = os.path.join(directory, file)
            file_name = os.path.splitext(file)[0]
            try:
                df = pd.read_csv(file_path)
                dataframes[file_name] = df
                print(f"Loaded {file_name} with {df.shape[0]} rows and {df.shape[1]} columns")
            except Exception as e:
                print(f"Error loading {file}: {e}")
    return dataframes

# Load all CSV files
dfs = read_csv_files(data_dir)

# Basic exploration of each dataframe
def explore_dataframes(dataframes):
    for name, df in dataframes.items():
        print(f"\n=== {name} ===")
        print(f"Shape: {df.shape}")
        print("Columns:")
        for col in df.columns:
            print(f"  - {col}: {df[col].dtype}")
        print("Sample data:")
        print(df.head(3))
        print("="*50)

# Perform exploratory analysis
explore_dataframes(dfs)

# Sales Analysis
def analyze_sales(orders_df, order_details_df, products_df, customers_df):
    # Merge order details with products to get prices
    order_analysis = pd.merge(order_details_df, products_df,
                             on='product_id', how='left')

    # Calculate sales amount per order detail
    order_analysis['sales_amount'] = order_analysis['quantity'] * order_analysis['unit_price_x'] * (1 - order_analysis['discount'])

    # Aggregate sales by order
    order_sales = order_analysis.groupby('order_id')['sales_amount'].sum().reset_index()

    # Merge with orders table to get order dates and customer info
    order_sales = pd.merge(order_sales, orders_df, on='order_id', how='left')

    # Convert order date to datetime if it's not already
    order_sales['order_date'] = pd.to_datetime(order_sales['order_date'])

    # Extract month and year for time analysis
    order_sales['month'] = order_sales['order_date'].dt.month
    order_sales['year'] = order_sales['order_date'].dt.year
    order_sales['month_year'] = order_sales['order_date'].dt.to_period('M')

    # Merge with customers to get customer demographics
    sales_with_customers = pd.merge(order_sales, customers_df,
                                   on='customer_id', how='left')

    return order_analysis, order_sales, sales_with_customers

# Product Analysis
def analyze_products(order_details_df, products_df):
    # Merge order details with products
    product_sales = pd.merge(order_details_df, products_df,
                            on='product_id', how='left')

    # Calculate total quantity and revenue per product
    product_metrics = product_sales.groupby('product_id').agg(
        total_quantity=('quantity', 'sum'),
        total_revenue=lambda x: (x['quantity'] * x['unit_price_x'] * (1 - x['discount'])).sum(),
        avg_discount=('discount', 'mean')
    ).reset_index()

    # Merge with product info
    product_metrics = pd.merge(product_metrics, products_df[['product_id', 'product_name', 'category_id', 'supplier_id']],
                              on='product_id', how='left')

    return product_metrics

# Customer Analysis
def analyze_customers(orders_df, order_sales_df, customers_df):
    # Merge sales data with customers
    customer_sales = pd.merge(order_sales_df, customers_df,
                             on='customer_id', how='left')

    # Calculate metrics per customer
    customer_metrics = customer_sales.groupby('customer_id').agg(
        total_orders=('order_id', 'nunique'),
        total_revenue=('sales_amount', 'sum'),
        avg_order_value=('sales_amount', 'mean'),
        first_order=('order_date', 'min'),
        last_order=('order_date', 'max')
    ).reset_index()

    # Calculate days since last order (potential churn indicator)
    latest_date = orders_df['order_date'].max()
    customer_metrics['days_since_last_order'] = (pd.to_datetime(latest_date) - customer_metrics['last_order']).dt.days

    # Calculate customer lifetime
    customer_metrics['customer_lifetime_days'] = (customer_metrics['last_order'] - customer_metrics['first_order']).dt.days

    # Merge with customer info
    customer_metrics = pd.merge(customer_metrics,
                               customers_df[['customer_id', 'company_name', 'country', 'region', 'city']],
                               on='customer_id', how='left')

    return customer_metrics

# Geographic Analysis
def analyze_geography(customer_metrics):
    # Sales by country
    country_sales = customer_metrics.groupby('country').agg(
        total_revenue=('total_revenue', 'sum'),
        customer_count=('customer_id', 'nunique'),
        avg_revenue_per_customer=('total_revenue', 'mean')
    ).reset_index().sort_values(by='total_revenue', ascending=False)

    return country_sales

# Time Series Analysis
def analyze_time_series(order_sales_df):
    # Monthly sales trends
    monthly_sales = order_sales_df.groupby('month_year')['sales_amount'].sum()

    # Convert Period index to datetime for better plotting
    monthly_sales.index = monthly_sales.index.to_timestamp()

    return monthly_sales

# Calculate key performance indicators
def calculate_kpis(order_sales_df, customer_metrics):
    # Total Revenue
    total_revenue = order_sales_df['sales_amount'].sum()

    # Average Order Value
    avg_order_value = order_sales_df['sales_amount'].mean()

    # Number of Unique Customers
    unique_customers = customer_metrics['customer_id'].nunique()

    # Customer Lifetime Value
    clv = total_revenue / unique_customers

    # Potentially churned customers (no orders in last 90 days)
    potential_churn = customer_metrics[customer_metrics['days_since_last_order'] > 90]['customer_id'].count()
    churn_rate = potential_churn / unique_customers

    # Customer acquisition over time (by first order date)
    customer_acquisition = customer_metrics.groupby(customer_metrics['first_order'].dt.to_period('M')).size()

    return {
        'total_revenue': total_revenue,
        'avg_order_value': avg_order_value,
        'unique_customers': unique_customers,
        'customer_lifetime_value': clv,
        'potential_churn': potential_churn,
        'churn_rate': churn_rate,
        'customer_acquisition': customer_acquisition
    }

# Create visualizations
def create_visualizations(monthly_sales, product_metrics, country_sales, customer_metrics):
    # Set up figure size for the plots
    plt.figure(figsize=(12, 8))

    # Plot 1: Monthly Sales Trend
    plt.subplot(2, 2, 1)
    plt.plot(monthly_sales.index, monthly_sales.values, marker='o', linestyle='-')
    plt.title('Monthly Sales Trend')
    plt.xlabel('Month')
    plt.ylabel('Revenue')
    plt.xticks(rotation=45)
    plt.tight_layout()

    # Plot 2: Top 10 Products by Revenue
    plt.subplot(2, 2, 2)
    top_products = product_metrics.sort_values('total_revenue', ascending=False).head(10)
    sns.barplot(x='total_revenue', y='product_name', data=top_products)
    plt.title('Top 10 Products by Revenue')
    plt.xlabel('Revenue')
    plt.ylabel('Product')
    plt.tight_layout()

    # Plot 3: Sales by Country
    plt.subplot(2, 2, 3)
    top_countries = country_sales.head(10)
    sns.barplot(x='total_revenue', y='country', data=top_countries)
    plt.title('Sales by Country')
    plt.xlabel('Revenue')
    plt.ylabel('Country')
    plt.tight_layout()

    # Plot 4: Customer Recency vs Frequency (Potential Churn Analysis)
    plt.subplot(2, 2, 4)
    plt.scatter(customer_metrics['days_since_last_order'],
                customer_metrics['total_orders'],
                alpha=0.6)
    plt.title('Customer Recency vs Frequency')
    plt.xlabel('Days Since Last Order')
    plt.ylabel('Number of Orders')
    plt.tight_layout()

    plt.savefig('northwind_analysis.png', dpi=300, bbox_inches='tight')
    plt.close()

    # Additional plot: Average Order Value over time
    plt.figure(figsize=(10, 6))
    order_sales_df['month_year_dt'] = order_sales_df['month_year'].dt.to_timestamp()
    monthly_aov = order_sales_df.groupby('month_year_dt')['sales_amount'].mean()
    plt.plot(monthly_aov.index, monthly_aov.values, marker='o', linestyle='-', color='green')
    plt.title('Average Order Value Over Time')
    plt.xlabel('Month')
    plt.ylabel('Average Order Value')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig('northwind_aov_trend.png', dpi=300, bbox_inches='tight')
    plt.close()

# For the purposes of this challenge, let's try to run the analyses with the expected dataframe names
# Note: These might need adjustment based on the actual CSV file structures
try:
    # Analyze sales data
    order_analysis, order_sales, sales_with_customers = analyze_sales(
        dfs.get('orders'), dfs.get('order_details'), dfs.get('products'), dfs.get('customers')
    )

    # Analyze product data
    product_metrics = analyze_products(dfs.get('order_details'), dfs.get('products'))

    # Analyze customer data
    customer_metrics = analyze_customers(dfs.get('orders'), order_sales, dfs.get('customers'))

    # Geographic analysis
    country_sales = analyze_geography(customer_metrics)

    # Time series analysis
    monthly_sales = analyze_time_series(order_sales)

    # Calculate KPIs
    kpis = calculate_kpis(order_sales, customer_metrics)

    # Create visualizations
    create_visualizations(monthly_sales, product_metrics, country_sales, customer_metrics)

    # Print key insights
    print("\n=== KEY INSIGHTS ===")
    print(f"Total Revenue: ${kpis['total_revenue']:,.2f}")
    print(f"Average Order Value: ${kpis['avg_order_value']:,.2f}")
    print(f"Customer Lifetime Value: ${kpis['customer_lifetime_value']:,.2f}")
    print(f"Potential Churn Rate: {kpis['churn_rate']*100:.2f}%")

    # Export key dataframes to CSV for further analysis
    order_sales.to_csv('order_sales_analysis.csv', index=False)
    product_metrics.to_csv('product_metrics.csv', index=False)
    customer_metrics.to_csv('customer_metrics.csv', index=False)
    country_sales.to_csv('country_sales.csv', index=False)

except Exception as e:
    print(f"Error in analysis: {e}")

OSError: 'seaborn-whitegrid' is not a valid package style, path of style file, URL of style file, or library style name (library styles are listed in `style.available`)