In [1]:
# Load libraries and data
import pandas as pd
import numpy as np

# Load dataset with debugging
data_path = "../data/raw/transactions.csv"  # Update if file name/location differs


def describe_skew(df, column):
    """Return a short text describing the skewness of ``df[column]``.

    Args:
        df: DataFrame to inspect.
        column: Label of the column whose skewness is wanted.

    Returns:
        ``"<skew:.2f> (right-skewed|left-skewed|symmetric)"`` when the
        skewness can be computed, otherwise an ``"N/A (...)"`` text naming
        the reason (column absent, non-numeric, or skewness is NaN).
    """
    if column not in df.columns:
        # Do not pretend a missing column is perfectly symmetric.
        return "N/A (column not available)"
    if not pd.api.types.is_numeric_dtype(df[column]):
        return "N/A (column is not numeric)"

    skew = df[column].skew()
    if pd.isna(skew):
        # NaN compares False to both > 0 and < 0, which would otherwise be
        # mislabelled as "(symmetric)".
        return "N/A (skewness undefined for this sample)"

    if skew > 0:
        shape = "(right-skewed)"
    elif skew < 0:
        shape = "(left-skewed)"
    else:
        shape = "(symmetric)"
    return f"{skew:.2f} {shape}"


def build_insights(df):
    """Build the five numbered EDA insight sentences for the report.

    Args:
        df: The (sampled) transactions DataFrame. The 'Amount', 'Value' and
            'ProductCategory' columns are used when present; their absence
            is reported in the corresponding insight instead of raising.

    Returns:
        A list of five Markdown-formatted strings, in report order.
    """
    rows, cols = df.shape
    insights = []

    # Insight 1: Dataset Structure
    insights.append(f"1. **Dataset Structure**: The dataset has approximately {rows} rows and {cols} columns, including numerical features (e.g., Amount, Value) and categorical features (e.g., ProductCategory, ChannelId).")

    # Insight 2: Missing Values
    missing_counts = df.isnull().sum()
    missing_cols = [(col, count) for col, count in missing_counts.items() if count > 0]
    if not missing_cols:
        insights.append("2. **Missing Values**: No significant missing values were observed in the sampled dataset.")
    else:
        # Join names and percentages explicitly: Series.to_string() would
        # inject newlines and padding into the sentence. rows > 0 is
        # guaranteed here because at least one cell is missing.
        names = ", ".join(str(col) for col, _ in missing_cols)
        percents = ", ".join(f"{100 * count / rows:.2f}%" for _, count in missing_cols)
        insights.append(f"2. **Missing Values**: Columns {names} have missing values, with approximately {percents} missing data respectively, suggesting potential imputation.")

    # Insight 3: Numerical Distributions
    amount_skew = describe_skew(df, "Amount")
    value_skew = describe_skew(df, "Value")
    insights.append(f"3. **Numerical Distributions**: The 'Amount' column has a skewness of {amount_skew}, possibly due to varied transaction values. 'Value' has a skewness of {value_skew}.")

    # Insight 4: Categorical Variability
    if "ProductCategory" in df.columns:
        category_counts = df["ProductCategory"].value_counts()
        if category_counts.empty:
            dominant_category, dominant_count = "N/A", 0
        else:
            dominant_category = category_counts.index[0]
            dominant_count = category_counts.iloc[0]
        total_categories = len(category_counts)
        # Guard the share against a zero-row sample (0 / 0).
        dominant_share = 100 * dominant_count / rows if rows else 0.0
        insights.append(f"4. **Categorical Variability**: The 'ProductCategory' column has {total_categories} unique categories, with '{dominant_category}' being the most frequent ({dominant_count} occurrences, {dominant_share:.1f}% of the sample).")
    else:
        insights.append("4. **Categorical Variability**: 'ProductCategory' data not available in sample.")

    # Insight 5: Correlations
    if "Amount" not in df.columns or "Value" not in df.columns:
        insights.append("5. **Correlations**: Correlation between 'Amount' and 'Value' could not be calculated due to missing columns.")
    elif not all(pd.api.types.is_numeric_dtype(df[name]) for name in ("Amount", "Value")):
        insights.append("5. **Correlations**: Correlation between 'Amount' and 'Value' could not be calculated because at least one column is non-numeric.")
    else:
        # Only the Amount/Value pair is needed, so restrict the matrix to
        # those two columns instead of correlating every numeric column.
        amount_value_corr = df[["Amount", "Value"]].corr().loc["Amount", "Value"]
        if pd.isna(amount_value_corr):
            insights.append("5. **Correlations**: Correlation between 'Amount' and 'Value' could not be calculated (too few paired values or no variation).")
        else:
            strength = abs(amount_value_corr)
            if strength > 0.7:
                level = "high"
            elif strength > 0.3:
                level = "moderate"
            else:
                level = "low"
            insights.append(f"5. **Correlations**: The 'Amount' and 'Value' columns have a correlation of {amount_value_corr:.2f}, indicating {level} similarity.")

    return insights


try:
    df = pd.read_csv(data_path, nrows=1000)  # Limit to 1000 rows for speed
except FileNotFoundError:
    print("Error: File not found at", data_path, ". Please check the file path and ensure 'transactions.csv' is in 'data/raw/'.")
except Exception as e:
    print("Error loading dataset:", str(e))
else:
    # Generate insights only if THIS load succeeded; checking for 'df' in
    # locals() would also accept a stale df left over from an earlier run.
    print("Dataset loaded successfully. Shape:", df.shape)
    insights = build_insights(df)

    # Print insights for report
    print("\n### EDA Insights for Report")
    for insight in insights:
        print(insight)

Dataset loaded successfully. Shape: (1000, 16)

### EDA Insights for Report
1. **Dataset Structure**: The dataset has approximately 1000 rows and 16 columns, including numerical features (e.g., Amount, Value) and categorical features (e.g., ProductCategory, ChannelId).
2. **Missing Values**: No significant missing values were observed in the sampled dataset.
3. **Numerical Distributions**: The 'Amount' column has a skewness of 16.95 (right-skewed), possibly due to varied transaction values. 'Value' has a skewness of 16.99 (right-skewed).
4. **Categorical Variability**: The 'ProductCategory' column has 8 unique categories, with 'financial_services' being the most frequent (523 occurrences, 52.3% of the sample).
5. **Correlations**: The 'Amount' and 'Value' columns have a correlation of 1.00, indicating high similarity.
