In [1]:
!pip install pandas numpy scikit-learn spacy nltk xgboost wordcloud
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------- ----------------------------- 3.4/12.8 MB 20.2 MB/s eta 0:00:01
     --------------------------- ------------ 8.9/12.8 MB 23.1 MB/s eta 0:00:01
     ------------------------------- ------- 10.5/12.8 MB 23.4 MB/s eta 0:00:01
     ---------------------------------- ---- 11.3/12.8 MB 13.8 MB/s eta 0:00:01
     ---------------------------------------- 12.8/12.8 MB 14.1 MB/s  0:00:00
[38;5;2mâœ” Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


# Imports and Configuration

In [2]:
import json
import numpy as np
import pandas as pd
import re, string
import spacy

import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier

from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix

nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

import warnings
warnings.filterwarnings('ignore')

# Data Loading and Cleaning

In [3]:
# Load Data
try:
    f = open('complaints-2021-05-14_08_16.json')
    data = json.load(f)
    df = pd.json_normalize(data)
    print(f"Data Loaded. Shape: {df.shape}")
except FileNotFoundError:
    print("Error: 'complaints-2021-05-14_08_16.json' not found. Please check the path.")

# Rename columns for easier access
df.rename(columns={'_source.complaint_what_happened': 'complaint_text', 
                   '_source.product': 'product'}, inplace=True)

# Filter out blank complaints
df = df[df['complaint_text'].astype(bool)] # Removes empty strings
df = df[df['complaint_text'] != ""]

# Text Cleaning Function
def clean_text(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    
    # Lemmatization
    doc = nlp(text)
    lemmatized = " ".join([token.lemma_ for token in doc if token.pos_ in ['NOUN', 'PROPN']])
    return lemmatized.strip()

df['cleaned_complaint'] = df['complaint_text'].apply(clean_text)
print("Text cleaning complete.")

Data Loaded. Shape: (78313, 22)
Text cleaning complete.


# Feature Extraction & Topic Modelling (NMF)

In [4]:
# TF-IDF Vectorization
tfidf = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
dtm = tfidf.fit_transform(df['cleaned_complaint'])

# NMF Decomposition (5 Topics)
num_topics = 5
nmf_model = NMF(n_components=num_topics, random_state=40)
nmf_model.fit(dtm)

# Assign Topics to Documents
topic_results = nmf_model.transform(dtm)
df['topic_id'] = topic_results.argmax(axis=1)

# Manual Label Mapping

In [5]:
# Print Top 15 words for each topic
feature_names = tfidf.get_feature_names_out()

for index, topic in enumerate(nmf_model.components_):
    print(f"\nTOPIC {index} Top Words:")
    print([feature_names[i] for i in topic.argsort()[-15:]])

topic_mapping = {
    0: 'Bank account services',      
    1: 'Credit card / Prepaid card', 
    2: 'Others',                     
    3: 'Theft/Dispute reporting',    
    4: 'Mortgages/loans'             
}

df['Topic_Label'] = df['topic_id'].map(topic_mapping)
print("\nLabels assigned. Sample:")
print(df[['cleaned_complaint', 'Topic_Label']].head())


TOPIC 0 Top Words:
['xxxxxxxxxxxx', 'company', 'address', 'document', 'letter', 'information', 'debt', 'complaint', 'phone', 'email', 'money', 'number', 'bank', 'chase', 'xxxx']

TOPIC 1 Top Words:
['letter', 'debt', 'information', 'year', 'application', 'limit', 'company', 'balance', 'score', 'account', 'chase', 'inquiry', 'report', 'card', 'credit']

TOPIC 2 Top Words:
['balance', 'customer', 'transaction', 'business', 'day', 'branch', 'fee', 'checking', 'deposit', 'fund', 'money', 'chase', 'check', 'bank', 'account']

TOPIC 3 Top Words:
['auto', 'statement', 'fee', 'rate', 'balance', 'time', 'year', 'xxxxxxxx', 'home', 'modification', 'month', 'chase', 'mortgage', 'loan', 'payment']

TOPIC 4 Top Words:
['day', 'service', 'fee', 'letter', 'statement', 'purchase', 'fraud', 'claim', 'merchant', 'card', 'transaction', 'dispute', 'chase', 'charge', 'xxxxxxxx']

Labels assigned. Sample:
                                    cleaned_complaint  \
1   morning name xxxx xxxx stop bank cardmemb

# Model Selection

In [6]:
# Define X and y
X = dtm # The TF-IDF matrix
y = df['topic_id']

# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=40)

# Dictionary of Classifiers
models = {
    "Logistic Regression": LogisticRegression(solver='liblinear'),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'),
    "Naive Bayes": MultinomialNB()
}

def evaluate_models(X_train, y_train, X_test, y_test, models):
    model_report = {}
    print(f"Training {len(models)} models...\n")
    
    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        score = f1_score(y_test, y_pred, average='weighted')
        
        model_report[model_name] = score
        print(f"{model_name} F1 Score: {score:.4f}")
        
    return model_report, models

# Run Evaluation
report, trained_models = evaluate_models(X_train, y_train, X_test, y_test, models)

# Select Best Model
best_score = max(report.values())
best_model_name = max(report, key=report.get)
best_model = trained_models[best_model_name]

print(f"\n------------------------------------------------")
print(f"WINNER: {best_model_name} with F1 Score: {best_score:.4f}")
print(f"------------------------------------------------")

Training 7 models...

Logistic Regression F1 Score: 0.9549
Decision Tree F1 Score: 0.8159
Random Forest F1 Score: 0.8432
AdaBoost F1 Score: 0.7781
Gradient Boosting F1 Score: 0.9216
XGBoost F1 Score: 0.9299
Naive Bayes F1 Score: 0.7021

------------------------------------------------
WINNER: Logistic Regression with F1 Score: 0.9549
------------------------------------------------


# Final Inference (Prediction System)

In [7]:
def predict_complaint_category(text):
    # Clean the text
    clean_text_input = clean_text(text)
    
    # Transform using the fitted TF-IDF
    vectorized_input = tfidf.transform([clean_text_input])
    
    # Predict using the best model
    prediction_id = best_model.predict(vectorized_input)[0]
    
    # Map ID back to Category Name
    category_name = topic_mapping[prediction_id]
    
    return category_name

sample_complaint = "I applied for a loan to buy a house but the interest rate was different from what was promised."
result = predict_complaint_category(sample_complaint)

print(f"Input Complaint: {sample_complaint}")
print(f"Predicted Department: {result}")

Input Complaint: I applied for a loan to buy a house but the interest rate was different from what was promised.
Predicted Department: Theft/Dispute reporting
