In [181]:
import pandas as pd
import plotly.express as px
import re
import string
import nltk
import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score, KFold
from sklearn import metrics
from sklearn.metrics import make_scorer, f1_score
from sklearn.metrics import ConfusionMatrixDisplay



In [126]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ajeym\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [127]:
df = pd.read_csv(r"C:\Users\ajeym\Documents\complaints.csv",dtype={'Consumer disputed?':str,'Tags':str,'Complaint ID':int}) 
df.query('Product in ["Debt collection", "Credit reporting, credit repair services, or other personal consumer reports", "Consumer Loan", "Mortgage"]',inplace=True)
df.reset_index(drop=True,inplace=True)

# Explanatory Data Analysis

In [128]:
df.head(2).T

Unnamed: 0,0,1
Date received,2023-12-26,2023-12-12
Product,Debt collection,Mortgage
Sub-product,I do not know,Conventional home mortgage
Issue,Written notification about debt,Trouble during payment process
Sub-issue,Didn't receive enough information to verify debt,"Escrow, taxes, or insurance"
Consumer complaint narrative,,I am writing to file a complaint against Selec...
Company public response,,Company believes it acted appropriately as aut...
Company,"EQUIFAX, INC.","SELECT PORTFOLIO SERVICING, INC."
State,CA,CA
ZIP code,90660,90621


In [129]:
df.columns

Index(['Date received', 'Product', 'Sub-product', 'Issue', 'Sub-issue',
       'Consumer complaint narrative', 'Company public response', 'Company',
       'State', 'ZIP code', 'Tags', 'Consumer consent provided?',
       'Submitted via', 'Date sent to company', 'Company response to consumer',
       'Timely response?', 'Consumer disputed?', 'Complaint ID'],
      dtype='object')

In [130]:
df.shape

(3112767, 18)

In [131]:
df.dtypes

Date received                   object
Product                         object
Sub-product                     object
Issue                           object
Sub-issue                       object
Consumer complaint narrative    object
Company public response         object
Company                         object
State                           object
ZIP code                        object
Tags                            object
Consumer consent provided?      object
Submitted via                   object
Date sent to company            object
Company response to consumer    object
Timely response?                object
Consumer disputed?              object
Complaint ID                     int32
dtype: object

In [132]:
fig = px.pie(df['Product'].value_counts(), values='Product', names=df['Product'].value_counts().index)
fig.update_layout(title_text='Distribution of Products')
fig.show()

# Cleaning Data

In [133]:
df1 = df[['Product', 'Consumer complaint narrative']].copy()
df1 = df1[pd.notnull(df1['Consumer complaint narrative'])]
df1.columns = ['Category', 'Complaint'] 
df1.head(10)

Unnamed: 0,Category,Complaint
1,Mortgage,I am writing to file a complaint against Selec...
3,"Credit reporting, credit repair services, or o...","To Whom May Concern Good Morning, I hope this ..."
4,"Credit reporting, credit repair services, or o...",I am a victim of identity-theft. I am writing ...
5,Debt collection,Please heed this notice very carefully. I beli...
6,"Credit reporting, credit repair services, or o...",In accordance with the Fair Credit Reporting a...
8,"Credit reporting, credit repair services, or o...",TransUnion is still reporting inaccurate infor...
9,Mortgage,This is the email I sent to Mr. Cooper which h...
10,Debt collection,I have allegedly have many accounts with Medic...
11,"Credit reporting, credit repair services, or o...",One or more things have been taken off of only...
13,"Credit reporting, credit repair services, or o...",In accordance with the Fair Credit Report Act ...


In [134]:
df1.isna().sum()

Category     0
Complaint    0
dtype: int64

In [135]:
duplicate = df1[df1.duplicated()]
len(duplicate) #o/p:233371
duplicate.head(5)

Unnamed: 0,Category,Complaint
226,"Credit reporting, credit repair services, or o...",My name is XXXX XXXX XXXX this complaint is no...
256,"Credit reporting, credit repair services, or o...",On ( XX/XX/2022 ) I sent a letter regarding in...
287,"Credit reporting, credit repair services, or o...",I'm really not sure what happened. I have mail...
304,"Credit reporting, credit repair services, or o...","When I reviewed my credit report, I discovered..."
373,"Credit reporting, credit repair services, or o...",On ( XX/XX/2022 ) I sent a letter regarding in...


In [136]:
df1=df1[~df1.duplicated()].reset_index(drop=True)
print(len(df1)/len(df), len(df1))  #Data reduced by 30% from original datasize 

0.2972451841079014 925255


In [148]:
df2 = df1.sample(15000, random_state=1).copy() #Sampled 15000 data to reduce time taken
df2.reset_index(drop=True,inplace=True)

In [183]:
df2['Category'].value_counts()

Credit reporting, credit repair services, or other personal consumer reports    9477
Debt collection                                                                 3499
Mortgage                                                                        1862
Consumer Loan                                                                    162
Name: Category, dtype: int64

In [149]:
df2.head(10)

Unnamed: 0,Category,Complaint
0,"Credit reporting, credit repair services, or o...",XXXXXXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XX...
1,"Credit reporting, credit repair services, or o...",""" In accordance with the Fair Credit Reporting..."
2,"Credit reporting, credit repair services, or o...","I told XXXX, XXXX and Equifax I looked at my c..."
3,"Credit reporting, credit repair services, or o...",XXXX XXXX XXXX response to my validation reque...
4,"Credit reporting, credit repair services, or o...",It was revealed in my credit report that inqui...
5,"Credit reporting, credit repair services, or o...",I was advised to contact you all regarding ite...
6,"Credit reporting, credit repair services, or o...",I have contacted the company several times in ...
7,"Credit reporting, credit repair services, or o...","in accordance with the fair credit act, XXXX X..."
8,Mortgage,I have been dealing with issues since this com...
9,"Credit reporting, credit repair services, or o...",To whom it may concern : Experian XXXX XXXX XX...


In [153]:
stop_words = set(stopwords.words('english'))
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^A-Za-z0-9 ]', '', text)
    text = re.sub(r'\bx+\b', '', text)
    words = text.split(" ")
    words = [word.strip() for word in words if not word in stop_words_list]
    text = " ".join(words)
    return text

df2['Complaint']=df2['Complaint'].apply(preprocess_text)
df2.head(10)

Unnamed: 0,Category,Complaint
0,"Credit reporting, credit repair services, or o...",direct violation fair credit report...
1,"Credit reporting, credit repair services, or o...",accordance fair credit reporting act item l...
2,"Credit reporting, credit repair services, or o...",told equifax looked consumer report noticed...
3,"Credit reporting, credit repair services, or o...",response validation request discovered tota...
4,"Credit reporting, credit repair services, or o...",revealed credit report inquiries made authoriz...
5,"Credit reporting, credit repair services, or o...",advised contact regarding items credit mine t...
6,"Credit reporting, credit repair services, or o...",contacted company several times past regarding...
7,"Credit reporting, credit repair services, or o...",accordance fair credit act account acco...
8,Mortgage,dealing issues since company bought loan 100 ...
9,"Credit reporting, credit repair services, or o...",may concern experian tx contesting inf...


# Feature engineering

In [154]:
#Splitting feature and target attributes
X=df2['Complaint']
Y=df2['Category']

In [155]:
#Encoding target attribute as given in the problem description
Y=Y.map({'Credit reporting, credit repair services, or other personal consumer reports':0,
        'Debt collection':1,
        'Consumer Loan':2,
        'Mortgage':3
        })

In [156]:
X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size=0.2,random_state=42)

In [157]:
'''
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(min_df=0.02, max_df=0.80, sublinear_tf=True, ngram_range=(1, 2)))
    ('scale', StandardScaler()),
    ('svd', TruncatedSVD(n_components=50, random_state=42, n_iter=7))
])
'''
#Encoding feature attribute with TF-IDF
tfidf = TfidfVectorizer(min_df=0.02, max_df=0.80, sublinear_tf=True, ngram_range=(1, 2))
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Building Model

In [188]:
svm_model = svm.SVC(kernel='rbf')
svm_model.fit(X_train_tfidf, y_train)
y_pred_ = svm_model.predict(X_test_tfidf)
f1_svm = f1_score(y_test, y_pred_,average='weighted')
print("F1 Score:", f1_svm)

F1 Score: 0.8767796794449323


In [164]:
neural_network = MLPClassifier(hidden_layer_sizes=(64,32),
                    random_state=5,
                    verbose=False,
                    max_iter=200,
                    learning_rate_init=0.01)

neural_network.fit(X_train_tfidf,y_train)
y_pred = neural_network.predict(X_test_tfidf)
f1_neural = f1_score(y_test, y_pred,average='weighted')
print("F1 Score:", f1_neural)

F1 Score: 0.8557921850775948


In [165]:
random_forest=RandomForestClassifier(n_estimators=100, max_depth=5, random_state=40)
random_forest.fit(X_train_tfidf,y_train)
y_pred = random_forest.predict(X_test_tfidf)
f1_forest = f1_score(y_test, y_pred,average='weighted')
print("F1 Score:", f1_forest)

F1 Score: 0.6676090367524765


In [167]:
naive_bayes=MultinomialNB()
naive_bayes.fit(X_train_tfidf,y_train)
y_pred = naive_bayes.predict(X_test_tfidf)
f1_naive = f1_score(y_test, y_pred,average='weighted')
print("F1 Score:", f1_naive)

F1 Score: 0.832431103809329


K-cross-fold Validation

In [168]:
naive_bayes_classifier=MultinomialNB()
svm_classifier=svm.SVC(kernel='rbf')
random_forest_classifier=RandomForestClassifier(n_estimators=100, max_depth=5, random_state=40)
neural_network_classifier = MLPClassifier(hidden_layer_sizes=(64,32),
                    random_state=5,
                    verbose=False,
                    max_iter=200,
                    learning_rate_init=0.01)

In [170]:
kf = KFold(n_splits=5, shuffle=True, random_state=43)
f1_scorer = make_scorer(f1_score, average='weighted')

X_encoded = TfidfVectorizer(min_df=0.02, max_df=0.80, sublinear_tf=True, ngram_range=(1, 2)).fit_transform(X)

model_names = []
mean_f1_scores = []

models = [svm_classifier, naive_bayes_classifier, random_forest_classifier, neural_network_classifier]

for model in models:
    cross_val_results = cross_val_score(model, X_encoded, Y, cv=kf, scoring=f1_scorer)
    model_names.append(type(model).__name__)
    mean_f1_scores.append(cross_val_results.mean())
    print(f'Model: {type(model).__name__}, Mean F1 Score: {cross_val_results.mean()}')

Model: SVC, Mean F1 Score: 0.8805228015386483
Model: MultinomialNB, Mean F1 Score: 0.8375416467402476
Model: RandomForestClassifier, Mean F1 Score: 0.674504262243428
Model: MLPClassifier, Mean F1 Score: 0.8575475804418395


In [171]:
model_score = {'model': model_names, 'f1 score': mean_f1_scores}
fig = px.bar(model_score, x='model', y='f1 score')
fig.show()

# Selection Of Model

Since SVM model has higher F1 score, it is selected for deployment

In [172]:
df.loc[1000, 'Consumer complaint narrative']

"This is a follow up to my previous complaint with regard to the settlement for homeowners who were unfairly denied loan modifications on their mortgage. I reached out to the Wells Fargo team in order to resolve this issue with a detail reason why I'm making my case. The bank supplied me with a copy of my signature accepting the deferment option, which i don't dispute its the fact that i explained to them in letter form that after repeated attempts to get a better resolution, out of frustration i just gave up."

In [173]:
df.loc[1000, 'Product']

'Mortgage'

In [186]:
def test(complaint):
    category_map={0:'Credit reporting, credit repair services, or other personal consumer reports',
        1:'Debt collection',
        2:'Consumer Loan',
        3:'Mortgage'
        }
    complaint=preprocess_text(complaint)
    complaint_encoded=tfidf.transform([complaint])
    prediction=svm_model.predict(complaint_encoded)
    #print(prediction)
    return category_map[prediction[0]]

complaint=input("Enter Complaint:")
print(test(complaint))

Enter Complaint:This is a follow up to my previous complaint with regard to the settlement for homeowners who were unfairly denied loan modifications on their mortgage. I reached out to the Wells Fargo team in order to resolve this issue with a detail reason why I'm making my case. The bank supplied me with a copy of my signature accepting the deferment option, which i don't dispute its the fact that i explained to them in letter form that after repeated attempts to get a better resolution, out of frustration i just gave up.
[3]
Mortgage
