In [3]:
import requests
import zipfile
import os

# Source archive: the CFPB consumer complaint database export (zipped CSV).
url = "https://files.consumerfinance.gov/ccdb/complaints.csv.zip"

# Paths
base_dir = r"C:\Users\hp\Pictures\financial intelligence\Intelligent-Complaint-Analysis-for-Financial-Services\rag-complaint-chatbot\data\raw"
os.makedirs(base_dir, exist_ok=True)

zip_path = os.path.join(base_dir, "complaints.zip")
# Download into a temporary ".part" file first so that an interrupted or
# failed transfer never leaves a truncated archive sitting at zip_path.
partial_path = zip_path + ".part"

# Stream download: the response body is written chunk by chunk so the whole
# archive never has to be held in memory at once.
with requests.get(url, stream=True, timeout=60) as r:
    r.raise_for_status()  # Fail fast on HTTP errors before writing anything
    downloaded = 0

    with open(partial_path, "wb") as f:
        for chunk in r.iter_content(chunk_size=1024 * 1024):  # 1 MB chunks
            if not chunk:
                continue
            f.write(chunk)
            downloaded += len(chunk)
            # "\r" rewrites the same console line to act as a progress counter
            print(f"\rDownloaded {downloaded / 1e6:.1f} MB", end="")

# Only promote the file to its final name once every chunk has been written.
os.replace(partial_path, zip_path)
print("\nDownload complete.")

# Unzip the archive next to the downloaded file
with zipfile.ZipFile(zip_path, "r") as zip_ref:
    zip_ref.extractall(base_dir)

print("Extraction complete.")


Downloaded 1681.8 MB
Download complete.
Extraction complete.


In [None]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re  # For text cleaning
import os

# Set seaborn style for visualizations
sns.set(style="whitegrid")

# Load the dataset (use low_memory=False to avoid dtype warnings)
data_path = r"C:\Users\hp\Pictures\financial intelligence\Intelligent-Complaint-Analysis-for-Financial-Services\rag-complaint-chatbot\data\raw\complaints.csv"
df = pd.read_csv(data_path, low_memory=False)

# Display basic info
# df.info() prints its own report and returns None, so it must not be wrapped
# in print() (that would emit a stray "None" line). show_counts=True forces the
# non-null counts to be shown, which pandas skips by default on very large frames.
df.info(show_counts=True)  # Shows column names, non-null counts, dtypes
print(df.head())  # First 5 rows
print(f"Total rows: {len(df)}")  # Should be ~4M+ depending on download date

In [None]:
# Step 2.1: Analyze distribution of complaints across Products
product_distribution = df['Product'].value_counts()
print("Product Distribution:\n", product_distribution)

# Visualize
plt.figure(figsize=(12, 6))
sns.barplot(x=product_distribution.index, y=product_distribution.values)
plt.xticks(rotation=90)
plt.title('Complaint Distribution by Product')
plt.xlabel('Product')
plt.ylabel('Count')
plt.show()

# Step 2.2: Calculate and visualize narrative lengths (word count)
# Add a column for word count; rows with no narrative get 0 so the column is
# fully populated for later filtering.
df['narrative_length'] = df['Consumer complaint narrative'].apply(
    lambda x: len(str(x).split()) if pd.notnull(x) else 0
)

# The 0 placeholder for missing narratives is not a real narrative length, so
# the statistics, histogram and "short" count below only look at complaints
# that actually have a narrative. Missing narratives are reported in Step 2.3.
has_narrative = df['Consumer complaint narrative'].notnull()
narrative_lengths = df.loc[has_narrative, 'narrative_length']

# Statistics
print("Narrative Length Stats:\n", narrative_lengths.describe())

# Visualize histogram
plt.figure(figsize=(10, 5))
sns.histplot(narrative_lengths, bins=50, kde=True)
plt.title('Distribution of Narrative Word Counts')
plt.xlabel('Word Count')
plt.ylabel('Frequency')
plt.show()

# Identify very short (<10 words) or very long (>1000 words) narratives
short_narratives = df[has_narrative & (df['narrative_length'] < 10)]
long_narratives = df[df['narrative_length'] > 1000]
print(f"Short narratives: {len(short_narratives)}")
print(f"Long narratives: {len(long_narratives)}")

# Step 2.3: Identify complaints with and without narratives
with_narrative = has_narrative.sum()
without_narrative = (~has_narrative).sum()
print(f"Complaints with narratives: {with_narrative}")
print(f"Complaints without narratives: {without_narrative}")

# Visualize
plt.figure(figsize=(6, 6))
plt.pie([with_narrative, without_narrative], labels=['With Narrative', 'Without Narrative'], autopct='%1.1f%%')
plt.title('Complaints With vs Without Narratives')
plt.show()

In [None]:
# Define the target products (match CFPB naming closely; adjust based on your EDA)
target_products = [
    'Credit card',  # May appear as 'Credit card or prepaid card'
    'Consumer Loan',  # For Personal Loans
    'Bank account or service',  # For Savings Accounts
    'Money transfers'  # For Money Transfers
]
# Note: Check exact names from your product_distribution print. CFPB uses:
# - 'Credit card' or 'Credit card or prepaid card'
# - 'Consumer Loan' for personal loans
# - 'Bank account or service' for savings
# - 'Money transfer' or 'Money transfers'
# NOTE(review): isin() below keeps exact matches only, so any variant name
# that is not listed in target_products is dropped - confirm against the
# product_distribution output before relying on the filtered counts.

# Build one boolean mask per criterion, evaluated against the full frame.
is_target_product = df['Product'].isin(target_products)
# Remove records with empty narratives
has_narrative = df['Consumer complaint narrative'].notnull()
# Additional filter: Remove very short narratives (optional, e.g., <10 words)
is_long_enough = df['narrative_length'] >= 10

# Apply all criteria at once. The explicit .copy() makes filtered_df an
# independent DataFrame, so adding columns to it later does not raise
# SettingWithCopyWarning or depend on whether pandas returned a view of df.
filtered_df = df[is_target_product & has_narrative & is_long_enough].copy()

print(f"Filtered rows: {len(filtered_df)}")

In [None]:
# Define a cleaning function
def clean_narrative(text):
    """Normalize a complaint narrative into lowercase alphanumeric text.

    Steps, in order: lowercase, drop every character that is not a-z, 0-9 or
    whitespace, collapse whitespace, remove known boilerplate phrases, then
    collapse whitespace again so the removal leaves no stray or doubled spaces.

    Args:
        text: The raw narrative. Null values (None / NaN) are accepted, and
            non-string values are converted with str() before cleaning.

    Returns:
        The cleaned narrative as a str; "" for null input.
    """
    if pd.isnull(text):
        return ""
    text = str(text).lower()  # Lowercase (str() guards against non-string cells)
    text = re.sub(r'[^a-z0-9\s]', '', text)  # Remove special characters
    # Collapse whitespace first so the single-space boilerplate patterns below
    # also match text that originally contained newlines or repeated spaces.
    text = re.sub(r'\s+', ' ', text).strip()
    # Remove boilerplate (example patterns; customize based on EDA)
    boilerplate_patterns = [
        r'i am writing to file a complaint',
        r'this is a formal complaint'
    ]
    for pattern in boilerplate_patterns:
        text = re.sub(pattern, '', text, flags=re.IGNORECASE)
    # Removing a phrase can leave a leading/trailing space or a double space
    # where the phrase used to be, so normalize whitespace once more.
    return re.sub(r'\s+', ' ', text).strip()

# Apply cleaning
# assign() returns a new DataFrame with the extra column instead of writing
# into filtered_df in place; this avoids SettingWithCopyWarning when
# filtered_df was produced by slicing another frame, and keeps the cell safe
# to re-run.
filtered_df = filtered_df.assign(
    cleaned_narrative=filtered_df['Consumer complaint narrative'].apply(clean_narrative)
)

# Optional: Other normalizations (e.g., remove stopwords using NLTK)
# import nltk
# from nltk.corpus import stopwords
# nltk.download('stopwords')
# stop_words = set(stopwords.words('english'))
# filtered_df['cleaned_narrative'] = filtered_df['cleaned_narrative'].apply(
#     lambda x: ' '.join([word for word in x.split() if word not in stop_words])
# )

# Verify cleaning by comparing the raw and cleaned text of the first rows
print(filtered_df[['Consumer complaint narrative', 'cleaned_narrative']].head())

In [None]:
# Save to CSV
# Note: The task says data/filtered_complaints.csv, but structure has data/processed/
# The directory is defined once and the file path is derived from it with
# os.path.join, so the two cannot drift apart and path separators stay consistent.
processed_dir = r"c:\Users\hp\Pictures\financial intelligence\Intelligent-Complaint-Analysis-for-Financial-Services\rag-complaint-chatbot\data\processed"
os.makedirs(processed_dir, exist_ok=True)
output_path = os.path.join(processed_dir, "filtered_complaints.csv")
filtered_df.to_csv(output_path, index=False)  # index=False: do not write the row index as a column
print(f"Saved filtered dataset to {output_path}")