In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import os
from fpdf import FPDF
from PIL import Image, ImageDraw

# Configure how pandas renders frames: show every column, cap output at
# 100 rows, and format floats with two decimal places.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100
pd.options.display.float_format = '{:.2f}'.format

class NorthwindAnalyzer:
    """
    Class to handle the analysis of Northwind Traders data
    """
    def __init__(self, data_dir='./nortwind', results_dir='./results'):
        """Initialize the analyzer with paths for data and results"""
        self.data_dir = data_dir
        self.results_dir = results_dir
        self.dfs = {}  # Will store all dataframes
        self.figures = {}  # Will store all figure paths

        # Create results directory if it doesn't exist
        if not os.path.exists(results_dir):
            os.makedirs(results_dir)

    def load_data(self):
        """Load all CSV files from the data directory"""
        encodings = ['utf-8', 'latin1', 'ISO-8859-1']

        for file in os.listdir(self.data_dir):
            if file.endswith('.csv'):
                file_path = os.path.join(self.data_dir, file)
                file_name = os.path.splitext(file)[0]

                # Try different encodings
                for encoding in encodings:
                    try:
                        df = pd.read_csv(file_path, encoding=encoding)
                        self.dfs[file_name] = df
                        print(f"Loaded {file_name} with {df.shape[0]} rows and {df.shape[1]} columns")
                        break
                    except UnicodeDecodeError:
                        continue
                    except Exception as e:
                        print(f"Error loading {file}: {e}")

        return self.dfs

    def standardize_column_names(self):
        """Standardize column names across dataframes"""
        # Common mappings for column standardization
        column_mappings = {
            'orders': {
                'orderid': 'order_id',
                'order id': 'order_id',
                'customerid': 'customer_id',
                'customer id': 'customer_id',
                'orderdate': 'order_date',
                'order date': 'order_date'
            },
            'order_details': {
                'orderid': 'order_id',
                'order id': 'order_id',
                'productid': 'product_id',
                'product id': 'product_id',
                'unitprice': 'unit_price'
            },
            'products': {
                'productid': 'product_id',
                'product id': 'product_id',
                'productname': 'product_name',
                'product name': 'product_name',
                'categoryid': 'category_id',
                'category id': 'category_id',
                'supplierid': 'supplier_id',
                'supplier id': 'supplier_id',
                'unitprice': 'unit_price'
            },
            'customers': {
                'customerid': 'customer_id',
                'customer id': 'customer_id',
                'companyname': 'company_name',
                'company name': 'company_name'
            }
        }

        # Apply mappings to each dataframe
        for df_name, df in self.dfs.items():
            if df_name in column_mappings:
                for old_col, new_col in column_mappings[df_name].items():
                    for col in df.columns:
                        if col.lower().replace(' ', '') == old_col.lower().replace(' ', ''):
                            df.rename(columns={col: new_col}, inplace=True)

        # Convert dates to datetime format
        if 'orders' in self.dfs and 'order_date' in self.dfs['orders'].columns:
            self.dfs['orders']['order_date'] = pd.to_datetime(self.dfs['orders']['order_date'], errors='coerce')

        return self.dfs

    def analyze_sales(self):
        """Analyze sales data"""
        # Ensure required dataframes are available
        required_dfs = ['orders', 'order_details', 'products']
        for df_name in required_dfs:
            if df_name not in self.dfs:
                print(f"Error: {df_name} dataframe is missing")
                return None, None

        # Extract the dataframes
        orders_df = self.dfs['orders']
        order_details_df = self.dfs['order_details']
        products_df = self.dfs['products']

        # Merge order details with products
        order_analysis = pd.merge(order_details_df, products_df,
                                on='product_id', how='left')

        # Handle possible unit price column naming variations
        unit_price_col = 'unit_price'
        if 'unit_price_x' in order_analysis.columns:
            unit_price_col = 'unit_price_x'

        # Calculate sales amount per order detail
        order_analysis['sales_amount'] = order_analysis['quantity'] * order_analysis[unit_price_col] * (1 - order_analysis['discount'])

        # Aggregate sales by order
        order_sales = order_analysis.groupby('order_id')['sales_amount'].sum().reset_index()

        # Merge with orders table to get order dates and customer info
        order_sales = pd.merge(order_sales, orders_df, on='order_id', how='left')

        # Extract month and year for time analysis
        if 'order_date' in order_sales.columns:
            order_sales['month'] = order_sales['order_date'].dt.month
            order_sales['year'] = order_sales['order_date'].dt.year
            order_sales['month_year'] = order_sales['order_date'].dt.to_period('M')
            order_sales['month_year_dt'] = order_sales['month_year'].dt.to_timestamp()

        return order_analysis, order_sales

    def analyze_products(self, order_analysis):
        """Analyze product sales and performance"""
        if 'products' not in self.dfs:
            print("Error: products dataframe is missing")
            return None

        products_df = self.dfs['products']

        # Handle possible unit price column naming variations
        unit_price_col = 'unit_price'
        if 'unit_price_x' in order_analysis.columns:
            unit_price_col = 'unit_price_x'

        # Calculate total quantity and revenue per product
        product_metrics = order_analysis.groupby('product_id').agg(
            total_quantity=('quantity', 'sum'),
            total_revenue=lambda x: (x['quantity'] * x[unit_price_col] * (1 - x['discount'])).sum(),
            avg_discount=('discount', 'mean')
        ).reset_index()

        # Merge with product info
        product_cols = ['product_id', 'product_name']