In [12]:
import pandas as pd

# Preview only 1000 rows first to understand the structure
sample_df = pd.read_csv("../data/complaints.csv", nrows=1000)
sample_df.head()


Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,2025-06-20,Credit reporting or other personal consumer re...,Credit reporting,Incorrect information on your report,Information belongs to someone else,,,Experian Information Solutions Inc.,FL,32092,,,Web,2025-06-20,In progress,Yes,,14195687
1,2025-06-20,Debt collection,Telecommunications debt,Attempts to collect debt not owed,Debt is not yours,,Company can't verify or dispute the facts in t...,"Eastern Account Systems of Connecticut, Inc.",FL,342XX,,,Web,2025-06-20,Closed with explanation,Yes,,14195688
2,2025-06-20,Credit reporting or other personal consumer re...,Credit reporting,Improper use of your report,Reporting company used your report improperly,,,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",AZ,85225,,,Web,2025-06-20,In progress,Yes,,14195689
3,2025-06-20,Credit reporting or other personal consumer re...,Credit reporting,Improper use of your report,Reporting company used your report improperly,,,Experian Information Solutions Inc.,AZ,85225,,,Web,2025-06-20,In progress,Yes,,14195690
4,2025-06-20,Credit reporting or other personal consumer re...,Credit reporting,Incorrect information on your report,Account status incorrect,,,Experian Information Solutions Inc.,IL,60628,,,Web,2025-06-20,In progress,Yes,,14195692


In [15]:
import pandas as pd

# Define which products to keep (for focused analysis)
TARGET_PRODUCTS = [
    "Credit card",
    "Payday loan",
    "Mortgage",
    "Student loan",
    "Bank account or service"
]

chunksize = 100_000  # Process 100K rows at a time
filtered_chunks = []

reader = pd.read_csv("../data/complaints.csv", chunksize=chunksize, low_memory=False)

for chunk in reader:
    chunk = chunk[chunk["Product"].isin(TARGET_PRODUCTS)]
    filtered_chunks.append(chunk)

# Combine all filtered chunks
filtered_df = pd.concat(filtered_chunks, ignore_index=True)

# Save the filtered version
filtered_df.to_csv("../data/filtered_complaints.csv", index=False)

print("âœ… Filtered data saved to data/filtered_complaints.csv")
print("ðŸ§¾ Final shape:", filtered_df.shape)


âœ… Filtered data saved to data/filtered_complaints.csv
ðŸ§¾ Final shape: (850403, 18)


In [16]:
# Basic info
filtered_df.info()

# Null values
filtered_df.isnull().sum()

# Top 5 product types
filtered_df["Product"].value_counts()

# Date range
print("From:", filtered_df["Date received"].min())
print("To:", filtered_df["Date received"].max())

# Complaint length (for later chunking)
filtered_df["complaint_length"] = filtered_df["Consumer complaint narrative"].astype(str).apply(len)
filtered_df["complaint_length"].describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 850403 entries, 0 to 850402
Data columns (total 18 columns):
 #   Column                        Non-Null Count   Dtype 
---  ------                        --------------   ----- 
 0   Date received                 850403 non-null  object
 1   Product                       850403 non-null  object
 2   Sub-product                   755673 non-null  object
 3   Issue                         850403 non-null  object
 4   Sub-issue                     285649 non-null  object
 5   Consumer complaint narrative  280667 non-null  object
 6   Company public response       275856 non-null  object
 7   Company                       850403 non-null  object
 8   State                         840818 non-null  object
 9   ZIP code                      843101 non-null  object
 10  Tags                          135986 non-null  object
 11  Consumer consent provided?    462149 non-null  object
 12  Submitted via                 850403 non-null  object
 13 

count    850403.000000
mean        464.231059
std        1093.298759
min           3.000000
25%           3.000000
50%           3.000000
75%         518.000000
max       32785.000000
Name: complaint_length, dtype: float64

In [9]:
keep_cols = [
    'Date received',
    'Product',
    'Issue',
    'Consumer complaint narrative',
    'Company',
    'Submitted via',
    'Date sent to company',
    'Company response to consumer',
    'Timely response?',
    'Complaint ID'
]


df = df[keep_cols]
df.head()


Unnamed: 0,Date received,Product,Issue,Consumer complaint narrative,Company,Submitted via,Date sent to company,Company response to consumer,Timely response?,Complaint ID
0,2025-06-20,Credit reporting or other personal consumer re...,Incorrect information on your report,,Experian Information Solutions Inc.,Web,2025-06-20,In progress,Yes,14195687
1,2025-06-20,Debt collection,Attempts to collect debt not owed,,"Eastern Account Systems of Connecticut, Inc.",Web,2025-06-20,Closed with explanation,Yes,14195688
2,2025-06-20,Credit reporting or other personal consumer re...,Improper use of your report,,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",Web,2025-06-20,In progress,Yes,14195689
3,2025-06-20,Credit reporting or other personal consumer re...,Improper use of your report,,Experian Information Solutions Inc.,Web,2025-06-20,In progress,Yes,14195690
4,2025-06-20,Credit reporting or other personal consumer re...,Incorrect information on your report,,Experian Information Solutions Inc.,Web,2025-06-20,In progress,Yes,14195692


In [17]:
filtered_df = filtered_df.dropna(subset=["Consumer complaint narrative"])
print("Final size after dropping empty complaints:", filtered_df.shape)


Final size after dropping empty complaints: (280667, 19)
