# In [None]:  (Jupyter cell prompt left over from the notebook export)
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

# Read only the four listed columns so the rest of the CSV never occupies memory
raw_csv_path = '../data/raw/complaints.csv'
needed_columns = [
    'Date received',
    'Product',
    'Consumer complaint narrative',
    'Complaint ID',
]
df = pd.read_csv(raw_csv_path, usecols=needed_columns)

# Show the (rows, columns) size of the data as loaded
print(f"Initial dataset shape: {df.shape}")

# Discard complaints with no usable narrative: first the null entries,
# then the ones that contain nothing but whitespace
narrative_col = 'Consumer complaint narrative'
df = df.dropna(subset=[narrative_col])
has_text = df[narrative_col].str.strip().ne('')
df = df[has_text]
print(f"After removing missing narratives: {df.shape}")

# Focus only on 5 target financial products.
# NOTE(review): isin() compares Product values exactly (case- and
# punctuation-sensitive); a label that does not appear verbatim in the data
# silently contributes zero rows -- confirm against df['Product'].unique().
target_products = [
    'Credit card',
    'Personal loan',
    'Buy Now, Pay Later (BNPL)',
    'Savings account',
    'Money transfer, virtual currency, or currency exchange'
]
# .copy() makes the filtered result an independent frame, so the column
# assignments further down do not raise pandas' SettingWithCopyWarning.
df = df[df['Product'].isin(target_products)].copy()
print(f"After filtering by target products: {df.shape}")

# Clean narratives
# Patterns are compiled once because clean_text runs on every narrative row.
# The filler between "i am writing" and "complaint" may not contain sentence
# punctuation (. ! ?), so the match cannot run past the sentence it starts in
# and swallow the body of the narrative; the negated class does match line
# breaks, so a phrase wrapped across lines is still removed.
_BOILERPLATE_RE = re.compile(r'i am writing[^.!?]*?complaint')
_NON_ALNUM_RE = re.compile(r'[^a-z0-9\s]')
_WHITESPACE_RE = re.compile(r'\s+')


def clean_text(text):
    """Normalize a complaint narrative for analysis.

    Steps, in order: lowercase; drop the "i am writing ... complaint"
    boilerplate phrase when it sits inside a single sentence; delete every
    character that is not a-z, 0-9 or whitespace; collapse whitespace runs
    to one space; strip leading/trailing whitespace.

    Args:
        text: Raw narrative. Anything that is not a ``str`` (e.g. ``None``
            or a NaN float from pandas) is treated as having no text.

    Returns:
        The cleaned string, or ``''`` for non-string input.
    """
    if not isinstance(text, str):
        return ''
    text = text.lower()
    text = _BOILERPLATE_RE.sub('', text)   # remove boilerplate
    text = _NON_ALNUM_RE.sub('', text)     # remove special chars
    text = _WHITESPACE_RE.sub(' ', text)   # normalize whitespace
    return text.strip()

def _count_words(cleaned):
    """Return the number of whitespace-separated tokens in *cleaned*."""
    return len(cleaned.split())


# Run every raw narrative through clean_text and keep the result as a column
df['cleaned_narrative'] = df['Consumer complaint narrative'].apply(clean_text)

# Word count of each cleaned narrative, used for the length analysis
df['narrative_length'] = df['cleaned_narrative'].apply(_count_words)

# EDA: horizontal bar chart of complaint counts per product, with the bars
# ordered as value_counts() returns them
fig, ax = plt.subplots(figsize=(10, 5))
products_by_frequency = df['Product'].value_counts().index
sns.countplot(data=df, y='Product', order=products_by_frequency, ax=ax)
ax.set_title('Complaint Count by Product')
ax.set_xlabel('Count')
ax.set_ylabel('Product')
fig.tight_layout()
plt.show()

# EDA: 50-bin histogram with a KDE overlay of the per-narrative word counts
fig, ax = plt.subplots(figsize=(10, 5))
word_counts = df['narrative_length']
sns.histplot(word_counts, bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Cleaned Narrative Lengths (Words)')
ax.set_xlabel('Word Count')
ax.set_ylabel('Frequency')
fig.tight_layout()
plt.show()

# Persist the filtered, cleaned frame; the row index is left out of the file
cleaned_output_path = '../data/filtered_complaints.csv'
df.to_csv(cleaned_output_path, index=False)
print("✅ Cleaned dataset saved to '../data/filtered_complaints.csv'")
