In [None]:
# =========================
# Optimized Credit Risk EDA Notebook
# =========================

# Step 1: Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

sns.set(style="whitegrid")
%matplotlib inline

# Step 2: Load a sample of the data for faster EDA
df = pd.read_csv('../data/raw/data.csv', nrows=10000) # Adjust nrows if needed
print("Dataset shape (sample):", df.shape)
df.head()

# Step 3: Quick overview
print("\nColumns in dataset:", df.columns.tolist())
print("\nInfo:")
print(df.info())

# Step 4: Summary statistics
print("\nNumerical Summary:")
print(df.describe())

print("\nCategorical Summary:")
print(df.describe(include='object'))

# Step 5: Numerical feature distributions (optimized)
numerical_cols = ['Amount', 'Value']  # Add more if needed
for col in numerical_cols:
    plt.figure(figsize=(6,4))
    sns.histplot(df[col], kde=True, bins=50)
    plt.title(f'Distribution of {col}')
    plt.show()

# Step 6: Categorical feature distributions (optimized)
categorical_cols = ['CurrencyCode', 'CountryCode', 'ProductCategory', 'ChannelId', 'PricingStrategy']
for col in categorical_cols:
    plt.figure(figsize=(7,4))
    sns.countplot(y=df[col], data=df, order=df[col].value_counts().index)
    plt.title(f'Distribution of {col}')
    plt.show()

# Step 7: Correlation analysis
plt.figure(figsize=(5,4))
corr_matrix = df[numerical_cols].corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title("Correlation Matrix")
plt.show()

# Step 8: Missing values
missing = df.isnull().sum()
print("\nMissing values per column:\n", missing)

# Step 9: Outlier detection using boxplots
for col in numerical_cols:
    plt.figure(figsize=(6,4))
    sns.boxplot(x=df[col])
    plt.title(f'Boxplot of {col}')
    plt.show()

# Step 10: RFM proxy (Recency, Frequency, Monetary)
df['TransactionStartTime'] = pd.to_datetime(df['TransactionStartTime'])
# Recency
last_transaction = df.groupby('CustomerId')['TransactionStartTime'].max().reset_index()
last_transaction['Recency'] = (df['TransactionStartTime'].max() - last_transaction['TransactionStartTime']).dt.days
# Frequency
frequency = df.groupby('CustomerId')['TransactionId'].count().reset_index().rename(columns={'TransactionId': 'Frequency'})
# Monetary
monetary = df.groupby('CustomerId')['Amount'].sum().reset_index().rename(columns={'Amount': 'Monetary'})
# Merge RFM
rfm = last_transaction.merge(frequency, on='CustomerId').merge(monetary, on='CustomerId')
print("\nRFM proxy sample:")
print(rfm.head())


# Step 11: Key Insights (Markdown cell recommended in notebook)
## Key Insights
1. Amount and Value distributions are skewed; few extreme transactions exist.
2. CountryCode and ProductCategory distributions are uneven; some categories dominate.
3. Missing values detected in PricingStrategy and ChannelId; imputation required.
4. Refund transactions appear as negative Amounts; may need special handling.
5. Amount and Value are strongly correlated; consider feature redundancy.
6. RFM features (Recency, Frequency, Monetary) can be used as a proxy for credit risk.




SyntaxError: invalid syntax (1859512379.py, line 81)