In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# 1. Load Data (assumes the CSV is in the local directory)
df = pd.read_csv('consumer_complaints.csv')

# 2. Filter for relevant categories and columns
categories = ['Credit reporting, repair, or other', 'Debt collection', 'Consumer Loan', 'Mortgage']

# Keep only the selected products that also have a narrative (the feature).
# Built as one chained expression so re-running the cell always starts from `df`.
df_filtered = (
    df[df['Product'].isin(categories)]
    .dropna(subset=['Consumer complaint narrative'])
    .copy()
)

# isin() silently ignores names that match nothing, which would leave the model
# with fewer classes than intended. Report any such category explicitly.
# NOTE(review): the label strings must equal the values in the 'Product' column
# exactly — confirm each entry of `categories` against df['Product'].unique().
missing_categories = sorted(set(categories) - set(df_filtered['Product'].unique()))
if missing_categories:
    print("WARNING: no usable rows found for categories:", missing_categories)

# 3. Target Encoding
# LabelEncoder assigns integer ids in sorted order of the labels actually present,
# so the ids do not follow the order of the `categories` list.
label_encoder = LabelEncoder()
df_filtered['Category_ID'] = label_encoder.fit_transform(df_filtered['Product'])

# Select Features (X) and Target (y)
X = df_filtered['Consumer complaint narrative']
y = df_filtered['Category_ID']

# Split data into training and testing sets (80/20 split), stratified on the
# target so both splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print("Data split complete. Training samples:", len(X_train))

Data split complete. Training samples: 412976


In [19]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

# Ensure NLTK resources are downloaded (run once)
import nltk
# nltk.data.find() signals a missing resource with LookupError, so that is the
# exception to catch. Each resource is probed on its own: having the stopword
# corpus installed says nothing about whether the tokenizer model is present.
# NOTE(review): recent NLTK releases may load the tokenizer from 'punkt_tab'
# instead of 'punkt' — confirm against the installed NLTK version.
for resource_path, package_name in (
    ('corpora/stopwords', 'stopwords'),
    ('tokenizers/punkt', 'punkt'),
):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_name)

# A set gives constant-time membership checks during token filtering.
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalise one complaint narrative before vectorisation.

    The input is coerced to ``str`` and lower-cased, every character that is
    not an ASCII letter or whitespace is replaced by a space, the result is
    tokenised with ``word_tokenize`` and tokens found in the module-level
    ``stop_words`` set are dropped.

    Args:
        text: The value to clean; anything accepted by ``str()``.

    Returns:
        str: The remaining tokens joined by single spaces.
    """
    # Substitute a space rather than the empty string so that words separated
    # only by punctuation or digits (e.g. "payment/escrow", "late.I") stay
    # separate tokens instead of being fused into one.
    text = re.sub(r'[^a-zA-Z\s]', ' ', str(text).lower())
    # Tokenize and remove stopwords
    tokens = word_tokenize(text)
    return ' '.join(word for word in tokens if word not in stop_words)

# Clean the training and testing narratives with the same function so both
# splits are normalised identically.
X_train_processed, X_test_processed = (
    split.apply(preprocess_text) for split in (X_train, X_test)
)

print("Text preprocessing complete.")

Text preprocessing complete.


In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

from sklearn.multiclass import OneVsRestClassifier

# 1. Define the Pipeline (Feature Engineering + Classifier)
# Logistic Regression is a good baseline for multi-class text classification.
# The one-vs-rest scheme is expressed with OneVsRestClassifier instead of the
# `multi_class='ovr'` argument: that argument, and multiclass use of the
# 'liblinear' solver without a wrapper, are deprecated in recent scikit-learn
# releases. One binary liblinear model per class is still fitted.
text_classifier = Pipeline([
    # Feature Engineering: unigrams + bigrams, ignoring terms that appear in
    # more than 80% of documents or in fewer than 5 documents.
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), max_df=0.8, min_df=5)),
    # Model Selection
    ('classifier', OneVsRestClassifier(
        LogisticRegression(solver='liblinear', random_state=42)
    )),
])

# 2. Train the model
text_classifier.fit(X_train_processed, y_train)

print("Model training complete.")

Model training complete.


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Score the held-out narratives with the fitted pipeline
y_pred = text_classifier.predict(X_test_processed)

# Per-class precision / recall / F1, labelled with the original product names
print(
    "--- Classification Report ---",
    classification_report(y_test, y_pred, target_names=label_encoder.classes_),
    sep="\n",
)

# Raw counts of true class (rows) versus predicted class (columns)
print(
    "--- Confusion Matrix ---",
    confusion_matrix(y_test, y_pred),
    sep="\n",
)

In [23]:
# One hand-written complaint used as a smoke test for the trained pipeline
sample_complaint = [
    "I have been fighting with this bank for months about an escrow issue related to my mortgage payment."
]

# Run the sample through the same cleaning step the training data received,
# then map the predicted id back to its product name.
sample_processed = pd.Series(sample_complaint).apply(preprocess_text)
(prediction_id,) = text_classifier.predict(sample_processed)
(prediction_label,) = label_encoder.inverse_transform([prediction_id])

print("\n--- Model Prediction ---")
print("Input:", sample_complaint[0])
print("Predicted Category ID:", prediction_id)
print("Predicted Category:", prediction_label)


--- Model Prediction ---
Input: I have been fighting with this bank for months about an escrow issue related to my mortgage payment.
Predicted Category ID: 2
Predicted Category: Mortgage
