In [6]:
import sys
from pathlib import Path
import pandas as pd

# 1. Compute project root (one level up from notebooks/)
project_root = Path.cwd().parent

# 2. Make src/ importable
sys.path.insert(0, str(project_root / "src"))

# 3. Define where your data lives
data_dir = project_root / "data"

# 4. Read your CSV
df = pd.read_csv(data_dir / "banking_complaints.csv")
df.head()


Unnamed: 0,Complaint ID,Date Received,Banking Product,Issue ID,Complaint Description,State,ZIP,Bank Response
0,CID76118977,1/1/2023,Checking or savings account,I_3510635,on XX/XX/XX22 I opened a safe balance account ...,California,92311,Closed with monetary relief
1,CID98703933,1/1/2023,"Credit reporting, credit repair services, or o...",I_3798538,There is an item from Bank of ABC on my credit...,California,91344,Closed with explanation
2,CID52036665,1/1/2023,Checking or savings account,I_3648593,On XX/XX/XX22 I found out that my account was ...,New York,10466,Closed with monetary relief
3,CID62581335,1/1/2023,Credit card or prepaid card,I_6999080,I've had a credit card for years with Bank of ...,California,92127,Closed with monetary relief
4,CID65731164,1/1/2023,Checking or savings account,I_3648593,This issue has to do with the way that Bank of...,New Jersey,7946,Closed with explanation


In [8]:
# Cell 2: Download NLTK data (one-time)
import nltk

# Stopwords for filtering, WordNet + omw for lemmatization
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("omw-1.4")


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sheilamcgovern/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sheilamcgovern/nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/sheilamcgovern/nltk_data...


True

In [None]:
from data_utils import load_data, check_types
from preprocess   import preprocessing

df = load_data(project_root/"data"/"banking_complaints.csv")

# check types & missing
check_types(df)

# apply preprocessing to the complaint text
df['cleaned_description'] = df['Complaint Description'].fillna("").apply(preprocessing)
df[['Complaint Description','cleaned_description']].head()


Complaint ID             object
Date Received            object
Banking Product          object
Issue ID                 object
Complaint Description    object
State                    object
ZIP                      object
Bank Response            object
dtype: object

Missing values per column:
Complaint ID              0
Date Received             0
Banking Product           0
Issue ID                  0
Complaint Description     0
State                    27
ZIP                      30
Bank Response             0
dtype: int64


In [None]:
# 4: Convert Date Received to datetime
df['Date Received'] = pd.to_datetime(df['Date Received'], errors='coerce')

# 5: Check the conversion
print(df['Date Received'].dtype)
print("Date range:", df['Date Received'].min(), "→", df['Date Received'].max())
