In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

try:
    raw_df = pd.read_csv('../data/complaints.csv', low_memory=False)
    print(raw_df.shape)
    print(raw_df.head())
except FileNotFoundError:
    print("Error: 'complaints.csv' not found. Please ensure it's in the 'data/' directory.")
except Exception as e:
    print(f"An error occurred while loading the dataset: {e}")

df = raw_df.copy()

df.info()
print(df.isnull().sum())
print(df.describe(include='all'))

product_counts = df['Product'].value_counts()
print(product_counts)

plt.figure(figsize=(12, 7))
sns.barplot(x=product_counts.index, y=product_counts.values, palette='viridis')
plt.title('Distribution of Complaints by Product')
plt.xlabel('Product')
plt.ylabel('Number of Complaints')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

df['Consumer complaint narrative'] = df['Consumer complaint narrative'].astype(str).fillna('')
df['narrative_length'] = df['Consumer complaint narrative'].apply(lambda x: len(x.split()))

print(df['narrative_length'].describe())

plt.figure(figsize=(10, 6))
sns.histplot(df['narrative_length'], bins=100, kde=True)
plt.title('Distribution of Consumer Complaint Narrative Lengths (Word Count)')
plt.xlabel('Word Count')
plt.ylabel('Number of Complaints')
plt.xlim(0, df['narrative_length'].quantile(0.99))
plt.tight_layout()
plt.show()

num_with_narrative = df[df['narrative_length'] > 0].shape[0]
num_without_narrative = df[df['narrative_length'] == 0].shape[0]
print(f"Number of complaints with narrative: {num_with_narrative}")
print(f"Number of complaints without narrative: {num_without_narrative}")
print(f"Total records: {df.shape[0]}")

allowed_products = [
    'Credit card',
    'Personal loan',
    'Buy Now, Pay Later (BNPL)',
    'Savings account',
    'Money transfer'
]

product_mapping_candidates = {
    'Credit card': ['Credit card', 'Credit card or prepaid card'],
    'Personal loan': ['Personal loan', 'Payday loan, title loan, or personal loan', 'Consumer loan'],
    'Buy Now, Pay Later (BNPL)': ['Consumer loan'],
    'Savings account': ['Checking or savings account', 'Savings account'],
    'Money transfer': ['Money transfer, virtual currency, or money service', 'Money transfer']
}

target_products_in_data = []
for challenge_product in allowed_products:
    if challenge_product in product_mapping_candidates:
        target_products_in_data.extend(product_mapping_candidates[challenge_product])
    else:
        target_products_in_data.append(challenge_product)

df_filtered = df[df['Product'].isin(target_products_in_data)].copy()
print(df_filtered.shape)
print(df_filtered['Product'].unique())

initial_rows = df_filtered.shape[0]
df_filtered = df_filtered[df_filtered['Consumer complaint narrative'].str.strip() != '']
df_filtered = df_filtered[df_filtered['Consumer complaint narrative'].str.lower() != 'nan']
rows_after_narrative_filter = df_filtered.shape[0]
print(f"Removed {initial_rows - rows_after_narrative_filter} records with empty narratives.")
print(df_filtered.shape)

def clean_narrative_text(text):
    text = str(text).lower()
    boilerplate_phrases = [
        "i am writing to file a complaint",
        "i am writing this letter to",
        "to whom it may concern",
        "my complaint is regarding",
        "this is a complaint about"
    ]
    for phrase in boilerplate_phrases:
        text = text.replace(phrase, "")
    text = re.sub(r'[^a-z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df_filtered['cleaned_narrative'] = df_filtered['Consumer complaint narrative'].apply(clean_narrative_text)

print(df_filtered[['Consumer complaint narrative', 'cleaned_narrative']].head())

output_path = 'data/filtered_complaints.csv'
df_filtered.to_csv(output_path, index=False)
print(f"Cleaned and filtered dataset saved to: {output_path}")

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

try:
    raw_df = pd.read_csv('data/complaints.csv', low_memory=False)
    print("Dataset loaded successfully!")
    print(f"Initial shape: {raw_df.shape}")
    print(raw_df.head())
except FileNotFoundError:
    print("Error: 'complaints.csv' not found. Please ensure it's in the 'data/' directory.")
except Exception as e:
    print(f"An error occurred while loading the dataset: {e}")

# This line should ideally be inside the try block or handled with an if raw_df is defined check
# But for now, let's see the output from the try-except first.
# df = raw_df.copy() # DO NOT RUN THIS LINE YET, just run the try-except part

Error: 'complaints.csv' not found. Please ensure it's in the 'data/' directory.


In [4]:
import os

# Print the current working directory of the Python script/notebook
current_dir = os.getcwd()
print(f"Python's current working directory is: {current_dir}")

# Expected: /home/y/intelligent_complaint_analysis/
# If it's something else, like /home/y/intelligent_complaint_analysis/notebooks/
# that's likely the problem.

Python's current working directory is: /home/y/intelligent_complaint_analysis/notebooks
