In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os

# Make sure the working folders exist.
# Both paths are resolved relative to the current working directory.
for folder in ('data/processed', 'notebooks'):
    os.makedirs(folder, exist_ok=True)

# Read the raw complaints export into a DataFrame
raw_csv_path = '../data/raw/complaints.csv'
df = pd.read_csv(raw_csv_path)

# 1. Data Structure  /  2. Summary Statistics
# Each entry is (heading, value); they are printed in order, one per line group.
overview = (
    ("Shape:", df.shape),                            # (rows, columns)
    ("\nData Types:\n", df.dtypes),                  # dtype of every column
    ("\nSummary Statistics:\n", df.describe()),      # describe() output
)
for heading, value in overview:
    print(heading, value)

# 3. Numerical Feature Distribution
# Character count of every narrative; rows without a narrative get length 0.
narrative_chars = df['Consumer complaint narrative'].str.len()
df['narrative_length'] = narrative_chars.fillna(0)

# Draw the histogram through the figure/axes objects instead of pyplot state
length_fig = plt.figure(figsize=(10, 6))
length_ax = sns.histplot(df['narrative_length'], bins=50)
length_ax.set_title('Distribution of Complaint Narrative Lengths')
length_ax.set_xlabel('Character Count')
length_fig.savefig('notebooks/narrative_length.png')
plt.show()

# 4. Categorical Feature Distribution
# One bar per distinct value of the 'Product' column.
plt.figure(figsize=(10, 6))
sns.countplot(x='Product', data=df)
plt.title('Distribution of Product Categories')
# Anchor the rotated labels at their right end so each one finishes under its
# own bar instead of being centred (and visually shifted) beneath it.
plt.xticks(rotation=45, ha='right')
# Long, rotated product names extend past the default bottom margin and are
# cut off in the saved PNG; tight_layout resizes the axes to fit them.
plt.tight_layout()
plt.savefig('notebooks/product_distribution.png')
plt.show()

# 5. Missing Values and Narrative Counts
# Per-column count of null cells
print("\nMissing Values:\n", df.isna().sum())

# A single boolean mask gives both counts: True = narrative present
narrative_present = df['Consumer complaint narrative'].notna()
print("\nComplaints with narratives:", int(narrative_present.sum()))
print("Complaints without narratives:", int((~narrative_present).sum()))

# 6. Filter for required products
# Maps the raw 'Product' label to the category name required downstream.
# Its keys double as the allow-list of products to keep.
product_mapping = {
    'Credit card': 'Credit Cards',
    'Consumer Loan': 'Personal Loans',
    'Payday loan': 'Buy Now, Pay Later (BNPL)',
    'Checking or savings account': 'Savings Accounts',
    'Money transfer': 'Money Transfers'
}
# Derived from the mapping so the allow-list and the rename cannot drift apart
products = list(product_mapping)

# Keep rows that belong to a required product AND have a narrative.
# Both conditions are applied in one step and the result is copied once, so
# df_filtered is an independent frame: later column assignments on it do not
# trigger pandas' SettingWithCopyWarning.
is_required_product = df['Product'].isin(products)
has_narrative = df['Consumer complaint narrative'].notnull()
df_filtered = df[is_required_product & has_narrative].copy()

# Rename products to match requirements
df_filtered['Product'] = df_filtered['Product'].map(product_mapping)

# Clean narratives
def clean_text(text):
    """Normalize a complaint narrative.

    Lowercases the text, removes every character that is not an ASCII
    lowercase letter, a digit or whitespace, collapses each run of whitespace
    into a single space and strips leading/trailing whitespace.

    Args:
        text: The raw narrative. Values that are not strings (for example
            ``None`` or the float ``NaN`` pandas uses for a missing cell) are
            treated as having no text.

    Returns:
        The cleaned string, or ``''`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        # Missing values have no .lower(); yield empty text instead of raising
        return ''
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Normalize whitespace
    return text

# Apply the text normalization to every remaining narrative
df_filtered['Consumer complaint narrative'] = df_filtered['Consumer complaint narrative'].apply(clean_text)

# Save filtered dataset
output_path = '../data/processed/filtered_complaints.csv'
# to_csv does not create missing parent folders, so make sure the directory
# that is actually written to exists before saving.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_filtered.to_csv(output_path, index=False)
print("\nFiltered dataset saved to data/processed/filtered_complaints.csv")
print("Filtered dataset shape:", df_filtered.shape)
print("Filtered product distribution:\n", df_filtered['Product'].value_counts())