# EDA for Complaint Analysis
## Task 1: Exploratory Data Analysis

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import re

# Notebook-wide plot configuration: apply the Matplotlib style sheet, then set
# the seaborn colour palette, then ask IPython to render figures inline.
# The palette is set after the style sheet; presumably so the style sheet does
# not override it — keep this order unless verified otherwise.
# NOTE(review): the 'seaborn-v0_8-*' style names presumably require a recent
# Matplotlib (>= 3.6) — confirm against the project environment.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
%matplotlib inline

In [None]:
# Read the raw complaints CSV into a DataFrame and report its dimensions.
data_path = Path('../data/raw/complaints.csv')
df = pd.read_csv(data_path, low_memory=False)

column_names = df.columns.tolist()
print(f"Dataset shape: {df.shape}")
print(f"Columns: {column_names}")

# Last expression of the cell: the notebook displays the first rows.
df.head()

In [None]:
# Column dtypes / non-null counts, followed by the ten columns with the most
# missing values.
df.info()

missing_per_column = df.isnull().sum().sort_values(ascending=False)
print("\nMissing values per column:")
print(missing_per_column.head(10))

In [None]:
# Complaint volume per product: print the ten largest categories and draw a
# bar chart of the fifteen largest.
product_counts = df['Product'].value_counts()
print("Top 10 Products:")
print(product_counts.head(10))

# Explicit figure/axes pair; the bar chart is drawn onto `ax`.
fig, ax = plt.subplots(figsize=(12, 6))
product_counts.head(15).plot(kind='bar', ax=ax)
ax.set_title('Complaints by Product Category')
ax.set_xlabel('Product')
ax.set_ylabel('Number of Complaints')
# Slanted, right-anchored labels keep long product names readable.
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Filter for required products
required_products = ['Credit card', 'Personal loan', 'Savings account', 'Money transfers']

# `isin` is an exact, case-sensitive match: a label that differs even slightly
# from the spelling used in the dataset silently matches no rows. Report any
# such label so a whole category cannot vanish from the analysis unnoticed.
available_products = set(df['Product'].dropna().unique())
unmatched_products = [p for p in required_products if p not in available_products]
if unmatched_products:
    print(f"Warning: no rows found for required products: {unmatched_products}")

# .copy() so later column assignments do not write into a view of `df`.
df_filtered = df[df['Product'].isin(required_products)].copy()
print(f"After filtering: {df_filtered.shape}")
print("Products in filtered data:")
print(df_filtered['Product'].value_counts())

In [None]:
# How many of the filtered complaints carry a free-text narrative?
# The not-null mask is computed once and reused for every figure below.
has_narrative = df_filtered['Consumer complaint narrative'].notna()

print(f"Total rows: {len(df_filtered)}")
print(f"Rows with narrative: {has_narrative.sum()}")
print(f"Rows without narrative: {(~has_narrative).sum()}")
print(f"Percentage with narrative: {has_narrative.mean():.2%}")

# Keep only the rows that have a narrative; copy so new columns can be added.
df_with_narrative = df_filtered[has_narrative].copy()
print(f"\nDataset with narratives: {df_with_narrative.shape}")

In [None]:
# Narrative length in characters and in whitespace-separated words.
narratives = df_with_narrative['Consumer complaint narrative']
df_with_narrative['narrative_length'] = narratives.str.len()
df_with_narrative['word_count'] = narratives.str.split().str.len()

char_lengths = df_with_narrative['narrative_length']
word_counts = df_with_narrative['word_count']

print(f"Average characters: {char_lengths.mean():.0f}")
print(f"Average words: {word_counts.mean():.0f}")
print(f"Max characters: {char_lengths.max()}")
print(f"Max words: {word_counts.max()}")

# Side-by-side histograms: one panel per length measure.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (values, title, x-axis label, extra hist kwargs) for each panel, left to right.
panels = [
    (char_lengths, 'Distribution of Narrative Length (Characters)', 'Character Count', {}),
    (word_counts, 'Distribution of Narrative Length (Words)', 'Word Count', {'color': 'orange'}),
]
for ax, (values, title, xlabel, extra_kwargs) in zip(axes, panels):
    ax.hist(values, bins=50, edgecolor='black', **extra_kwargs)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

In [None]:
# Clean text function

# One date-like token: MM/DD/YYYY where any component may be redacted with X's
# (e.g. "XX/XX/XXXX", "XX/XX/2019", "03/XX/2020", "01/15/2021").
_DATE_PATTERN = re.compile(r'(?:XX|\d{2})/(?:XX|\d{2})/(?:XXXX|\d{4})')
_WHITESPACE_PATTERN = re.compile(r'\s+')


def clean_narrative(text):
    """Return *text* with date tokens removed and whitespace normalised.

    Args:
        text: A complaint narrative. Missing values (``None``/``NaN``) are
            accepted; any other value is converted with ``str()``.

    Returns:
        The cleaned string: fully or partially redacted dates and plain
        numeric dates in ``NN/NN/NNNN`` form are dropped, every run of
        whitespace is collapsed to a single space, and leading/trailing
        whitespace is stripped. Missing input yields ``""``.
    """
    if pd.isna(text):
        return ""

    # Remove dates first so the gaps they leave are collapsed below.
    without_dates = _DATE_PATTERN.sub('', str(text))
    return _WHITESPACE_PATTERN.sub(' ', without_dates).strip()

# Smoke-test the cleaner on the first narrative: show the first 500
# characters before and after cleaning.
sample_text = df_with_narrative['Consumer complaint narrative'].iloc[0]
sample_cleaned = clean_narrative(sample_text)

for heading, snippet in (
    ("Original (first 500 chars):", sample_text),
    ("\nCleaned (first 500 chars):", sample_cleaned),
):
    print(heading)
    print(snippet[:500])

In [None]:
# Clean every narrative and measure how much the word count shrinks.
raw_narratives = df_with_narrative['Consumer complaint narrative']
df_with_narrative['cleaned_narrative'] = raw_narratives.apply(clean_narrative)
df_with_narrative['cleaned_word_count'] = (
    df_with_narrative['cleaned_narrative'].str.split().str.len()
)

original_avg = df_with_narrative['word_count'].mean()
cleaned_avg = df_with_narrative['cleaned_word_count'].mean()

print(f"Original avg words: {original_avg:.1f}")
print(f"Cleaned avg words: {cleaned_avg:.1f}")
# Relative drop in mean word count caused by cleaning.
print(f"Percentage reduction: {(1 - cleaned_avg / original_avg):.2%}")

In [None]:
# Write the cleaned subset to disk, creating the target directory if needed.
output_path = Path('../data/processed/filtered_complaints.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)

# Only the columns listed here are written to the CSV.
export_columns = ['Complaint ID', 'Product', 'cleaned_narrative', 'Date received']
export_frame = df_with_narrative[export_columns]
export_frame.to_csv(output_path, index=False)

print(f"Saved to: {output_path}")
print(f"Records saved: {len(export_frame)}")

In [None]:
# Summary statistics: one row per product, most frequent product first.
# Counts are computed once and the per-product means in a single groupby pass,
# instead of re-running value_counts() and a boolean-mask scan per product.
narrative_counts = df_with_narrative['Product'].value_counts()
avg_words_by_product = (
    df_with_narrative.groupby('Product')['cleaned_word_count'].mean().round(1)
)

summary = pd.DataFrame({
    'Product': narrative_counts.index,
    'Count': narrative_counts.values,
    # groupby sorts by product name; realign to the value_counts order.
    'Avg Words': avg_words_by_product.reindex(narrative_counts.index).values,
})

print("Summary by Product:")
print(summary)

# Save summary. The directory is created here too, so this cell does not rely
# on an earlier cell having made it, and the message reports the real path.
summary_path = Path('../data/processed/product_summary.csv')
summary_path.parent.mkdir(parents=True, exist_ok=True)
summary.to_csv(summary_path, index=False)
print(f"\nSummary saved to: {summary_path}")