In [None]:
# Save as notebooks/1.0-eda.py (or convert to .ipynb)
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the raw dataset. The path is relative, so it resolves against the
# current working directory (presumably notebooks/ — confirm when running
# from elsewhere).
df = pd.read_csv('../data/raw/data.csv')

# Overview
print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()
print("\nSummary Statistics:")
# Numeric columns first, then the object-dtype (string-like) columns.
print(df.describe())
print(df.describe(include='object'))

# Numerical distributions
# One histogram with a KDE overlay per column. This list is reused by the
# correlation and outlier sections further down, so its name is kept.
numerical_cols = ['Amount', 'Value', 'CountryCode', 'PricingStrategy']
for col in numerical_cols:
    hist_ax = sns.histplot(data=df, x=col, kde=True)
    hist_ax.set_title(f'Distribution of {col}')
    plt.show()

# Categorical distributions
# One horizontal count plot per column (categories on the y axis).
categorical_cols = ['ProductCategory', 'ChannelId', 'ProviderId', 'ProductId']
for col in categorical_cols:
    count_ax = sns.countplot(data=df, y=col)
    count_ax.set_title(f'Distribution of {col}')
    plt.show()

# Correlation
# Pairwise correlation (pandas' default method) over the numeric columns,
# drawn as an annotated heatmap.
corr = df.loc[:, numerical_cols].corr()
corr_ax = sns.heatmap(corr, cmap='coolwarm', annot=True)
corr_ax.set_title('Correlation Matrix')
plt.show()

# Missing values
# Build the boolean null mask once and reuse it for both the per-column
# counts and the row-by-column heatmap.
missing_mask = df.isnull()
print("\nMissing Values:")
print(missing_mask.sum())
missing_ax = sns.heatmap(missing_mask, cbar=False)
missing_ax.set_title('Missing Values')
plt.show()

# Outliers
# One horizontal boxplot per numeric column; points beyond the whiskers are
# the candidate outliers.
for col in numerical_cols:
    box_ax = sns.boxplot(data=df, x=col)
    box_ax.set_title(f'Boxplot of {col}')
    plt.show()

# Insights
# NOTE(review): this summary is a fixed, hand-written string; nothing in it
# is computed from df, so re-check it against the output above whenever the
# dataset changes.
insights_text = """
Top Insights:
1. 'Amount' is skewed with both positive (debits) and negative (credits) values.
2. 'ProductCategory' is dominated by categories like 'airtime'.
3. Minimal missing values in the dataset.
4. Low correlation between numerical features.
5. Outliers present in 'Amount' and 'Value'.
"""
print(insights_text)