In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

In [2]:
df = pd.read_csv("consumer_complaints_copy.csv")
df.head()

Unnamed: 0,date_received,product,sub_product,issue,sub_issue,consumer_complaint_narrative,company_public_response,company,state,zipcode,tags,consumer_consent_provided,submitted_via,date_sent_to_company,company_response_to_consumer,timely_response,consumer_disputed?,complaint_id
0,12-05-2014,Debt collection,Mortgage,Disclosure verification of debt,Not given enough info to verify debt,,,"(Former)Shapiro, Swertfeger & Hasty, LLP",GA,30005,,,Referral,12-12-2014,Untimely response,No,No,1144671
1,11-10-2014,Mortgage,Other mortgage,"Loan servicing, payments, escrow account",,,,"(Former)Shapiro, Swertfeger & Hasty, LLP",DE,19803,,,Referral,11/19/2014,Untimely response,No,No,1109287
2,08/26/2015,Mortgage,Other mortgage,"Loan modification,collection,foreclosure",,,,"(Former)Shapiro, Swertfeger & Hasty, LLP",GA,30014,,,Referral,09-08-2015,Untimely response,No,No,1536776
3,01/16/2014,Debt collection,Mortgage,Disclosure verification of debt,Not given enough info to verify debt,,,"(Former)Shapiro, Swertfeger & Hasty, LLP",GA,30087,,,Referral,02-11-2014,Untimely response,No,No,671539
4,06/25/2015,Mortgage,Conventional fixed mortgage,"Application, originator, mortgage broker",,My mortgage company has misrepresented themsel...,,"1st 2nd Mortgage Company Of NJ, Inc.",NJ,074XX,,Consent provided,Web,07/22/2015,Closed,Yes,No,1437506


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 555957 entries, 0 to 555956
Data columns (total 18 columns):
 #   Column                        Non-Null Count   Dtype 
---  ------                        --------------   ----- 
 0   date_received                 555957 non-null  object
 1   product                       555957 non-null  object
 2   sub_product                   397635 non-null  object
 3   issue                         555957 non-null  object
 4   sub_issue                     212622 non-null  object
 5   consumer_complaint_narrative  66806 non-null   object
 6   company_public_response       85124 non-null   object
 7   company                       555957 non-null  object
 8   state                         551070 non-null  object
 9   zipcode                       551452 non-null  object
 10  tags                          77959 non-null   object
 11  consumer_consent_provided     123458 non-null  object
 12  submitted_via                 555957 non-null  object
 13 

In [4]:
df.isnull().sum()

date_received                        0
product                              0
sub_product                     158322
issue                                0
sub_issue                       343335
consumer_complaint_narrative    489151
company_public_response         470833
company                              0
state                             4887
zipcode                           4505
tags                            477998
consumer_consent_provided       432499
submitted_via                        0
date_sent_to_company                 0
company_response_to_consumer         0
timely_response                      0
consumer_disputed?                   0
complaint_id                         0
dtype: int64

In [5]:
print(df["product"].value_counts())


Mortgage                   186475
Debt collection            101052
Credit reporting            91854
Credit card                 66468
Bank account or service     62563
Consumer Loan               20990
Student loan                15839
Payday loan                  3877
Money transfers              3812
Prepaid card                 2470
Other financial service       557
Name: product, dtype: int64


In [6]:
# Data Preprocessing
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Aniket\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Aniket\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove special characters
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return words  # Return list of words for Word2Vec training

df.fillna("", inplace=True)
df["text"] = (df["issue"] + " " + df["sub_issue"]).apply(clean_text)

In [8]:
# Encode target labels
product_encoder = LabelEncoder()
df["product_encoded"] = product_encoder.fit_transform(df["product"])

In [9]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(df["text"], df["product_encoded"], test_size=0.2, random_state=42)

In [11]:
#Train Word2Vec Model
word2vec_model = Word2Vec(sentences=X_train, vector_size=100, window=5, min_count=1, workers=4)
word2vec_model.train(X_train, total_examples=len(X_train), epochs=10)


(8842901, 19691080)

In [12]:
# Convert text data into Word2Vec feature vectors
def get_w2v_vectors(text_list, model, vector_size=100):
    vectors = []
    for words in text_list:
        word_vectors = [model.wv[word] for word in words if word in model.wv]
        if word_vectors:
            vectors.append(np.mean(word_vectors, axis=0))  # Take average of word vectors
        else:
            vectors.append(np.zeros(vector_size))  # If no words, use zero vector
    return np.array(vectors)

In [13]:
X_train_vec = get_w2v_vectors(X_train, word2vec_model)
X_test_vec = get_w2v_vectors(X_test, word2vec_model)

In [14]:
# Train XGBoost Model
xgb_model = xgb.XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=6, eval_metric="mlogloss")
xgb_model.fit(X_train_vec, y_train)

In [15]:
# Model Evaluation
y_pred = xgb_model.predict(X_test_vec)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.9890


In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [19]:
# Train Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_vec, y_train)


In [22]:
# Predict
y_pred_rf = rf_model.predict(X_test_vec)


In [21]:
# Accuracy
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy: {accuracy_rf:.4f}")

Random Forest Accuracy: 0.9857
