# Multiclass Classification Text

In [1]:
import string
import numpy as np
import pandas as pd

# Data Cleaning
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize 

# Model
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Loading data

data = pd.read_csv('Consumer_Complaints.csv')

In [3]:
data.dropna(subset=['Consumer Complaint'], how='all', inplace=True)

In [4]:
data

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer Complaint,Company Public Response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date Sent to Company,Company Response to Consumer,Timely response?,Consumer disputed?,Complaint ID,Unnamed: 18
1,10-01-2016,Credit reporting,,Incorrect information on credit report,Account status,I have outdated information on my credit repor...,Company has responded to the consumer and the ...,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",AL,352XX,,Consent provided,Web,10-05-2016,Closed with explanation,Yes,No,2141773,
2,10/17/2016,Consumer Loan,Vehicle loan,Managing the loan or lease,,I purchased a new car on XXXX XXXX. The car de...,,"CITIZENS FINANCIAL GROUP, INC.",PA,177XX,Older American,Consent provided,Web,10/20/2016,Closed with explanation,Yes,No,2163100,
7,06/15/2015,Credit reporting,,Credit reporting company's investigation,Inadequate help over the phone,An account on my credit report has a mistaken ...,Company chooses not to provide a public response,Experian Information Solutions Inc.,VA,224XX,,Consent provided,Web,06/15/2015,Closed with explanation,Yes,No,1420702,
12,02-03-2016,Debt collection,"Other (i.e. phone, health club, etc.)",Disclosure verification of debt,Not given enough info to verify debt,This company refuses to provide me verificatio...,,"The CBE Group, Inc.",TX,752XX,,Consent provided,Web,02-03-2016,Closed with explanation,Yes,Yes,1772196,
16,02/17/2016,Debt collection,Credit card,Improper contact or sharing of info,Talked to a third party about my debt,This complaint is in regards to Square Two Fin...,Company has responded to the consumer and the ...,SQUARETWO FINANCIAL CORPORATION,NE,693XX,,Consent provided,Web,03-04-2016,Closed with explanation,Yes,Yes,1790634,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1025002,11-09-2016,Debt collection,Medical,False statements or representation,Attempted to collect wrong amount,Our son was taken to XXXX XXXX XXXX XXXX XXXX ...,Company believes complaint caused principally ...,R & B Corporation of Virginia,NJ,077XX,,Consent provided,Web,11-09-2016,Closed with explanation,Yes,No,2201681,
1025003,01/22/2016,Bank account or service,Checking account,Deposits and withdrawals,,"On XXXX/XXXX/13, without my authorization, Ban...",Company chooses not to provide a public response,"BANK OF AMERICA, NATIONAL ASSOCIATION",FL,347XX,,Consent provided,Web,01/22/2016,Closed with monetary relief,Yes,No,1753439,
1025006,02-07-2017,Debt collection,"Other (i.e. phone, health club, etc.)",Cont'd attempts collect debt not owed,Debt is not mine,I had an account with XXXX in XX/XX/XXXX this ...,Company believes it acted appropriately as aut...,ERC,NY,115XX,Servicemember,Consent provided,Web,02-07-2017,Closed,Yes,No,2331270,
1025007,01-04-2017,Mortgage,Conventional fixed mortgage,"Application, originator, mortgage broker",,I was contacted on XX/XX/XXXX email by XXXX fr...,,"Caliber Home Loans, Inc.",FL,336XX,,Consent provided,Web,01-04-2017,Closed with explanation,Yes,No,2274241,


In [5]:
def clean (text):
    
    # Remove punctuation
    for punctuation in string.punctuation:
        text = text.replace(punctuation, ' ')

    # Lower case
    lowercased = text.lower()
    
    # Tokenize
    tokenized = word_tokenize(lowercased)
    
    # Remove numbers
    words_only = [word for word in tokenized if word.isalpha()]
    
    # Stop words
    stop_words = set(stopwords.words('english'))
    without_stopwords = [word for word in words_only if not word in stop_words]
    
    # Lemmatize
    lemma=WordNetLemmatizer()
    lemmatized = [lemma.lemmatize(word) for word in without_stopwords]
    
    return lemmatized

In [6]:
X = data['Consumer Complaint']
y = data['Product']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [9]:
X_train = pd.DataFrame(X_train)
X_train['clean_text'] = X_train['Consumer Complaint'].apply(clean)
X_train['clean_text'] = X_train['clean_text'].astype('str')

X_train

Unnamed: 0,Consumer Complaint,clean_text
582439,Capital Accounts has called me twice now after...,"['capital', 'account', 'called', 'twice', 'tol..."
787004,My ex wife was awarded the property in our div...,"['ex', 'wife', 'awarded', 'property', 'divorce..."
64918,Wells Fargo Financial National Bank charged me...,"['well', 'fargo', 'financial', 'national', 'ba..."
833429,"As of today XXXX XXXX XXXX , XXXX sho...","['today', 'xxxx', 'xxxx', 'xxxx', 'xxxx', 'sho..."
914846,I previously disputed a reporting error with E...,"['previously', 'disputed', 'reporting', 'error..."
...,...,...
538818,I am XXXX XXXX XXXX that works for a company t...,"['xxxx', 'xxxx', 'xxxx', 'work', 'company', 'h..."
983846,Credit card account does not belong to me from...,"['credit', 'card', 'account', 'belong', 'well'..."
596440,"Experian continues to list "" XXXX XXXX '' on m...","['experian', 'continues', 'list', 'xxxx', 'xxx..."
668372,This company sent me an email stating they hav...,"['company', 'sent', 'email', 'stating', 'tried..."


In [10]:
clf = MultinomialNB()
vec = CountVectorizer(min_df=1)

X_train_transformed = vec.fit_transform(X_train['clean_text'])

clf.fit(X_train_transformed, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [11]:
cv_nb = cross_validate(clf, X_train_transformed, y_train, scoring = "accuracy")

cv_nb['test_score'].mean()

0.6836006124586993

In [13]:
X_test = pd.DataFrame(X_test)
X_test['clean_text'] = X_test['Consumer Complaint'].apply(clean)
X_test['clean_text'] = X_test['clean_text'].astype('str')

X_test

Unnamed: 0,Consumer Complaint,clean_text
787985,On XX/XX/XXXX at a XXXX ATM a withdrawal trans...,"['xx', 'xx', 'xxxx', 'xxxx', 'atm', 'withdrawa..."
474133,I have a 7 day grace period ; however I receiv...,"['day', 'grace', 'period', 'however', 'receive..."
687653,"On XX/XX/XXXX a "" Small Balance Adjustment '' ...","['xx', 'xx', 'xxxx', 'small', 'balance', 'adju..."
404975,Closed American Express account a few months a...,"['closed', 'american', 'express', 'account', '..."
223405,"To whom it may concern : Years ago, I was a cu...","['may', 'concern', 'year', 'ago', 'customer', ..."
...,...,...
676633,"Hi, My name if XXXX and I reside at XXXX MD. X...","['hi', 'name', 'xxxx', 'reside', 'xxxx', 'md',..."
958677,"Due to Hurricane Irma, I sustained economic lo...","['due', 'hurricane', 'irma', 'sustained', 'eco..."
931492,On XXXX XXXX I received notification from XXXX...,"['xxxx', 'xxxx', 'received', 'notification', '..."
418392,My husband and I have received several voicema...,"['husband', 'received', 'several', 'voicemail'..."


In [14]:
X_test_transformed = vec.transform(X_test['clean_text'])
y_pred = clf.predict(X_test_transformed)

In [15]:
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.6843006577296873


In [16]:
clf.classes_

array(['Bank account or service', 'Checking or savings account',
       'Consumer Loan', 'Credit card', 'Credit card or prepaid card',
       'Credit reporting',
       'Credit reporting, credit repair services, or other personal consumer reports',
       'Debt collection',
       'Money transfer, virtual currency, or money service',
       'Money transfers', 'Mortgage', 'Other financial service',
       'Payday loan', 'Payday loan, title loan, or personal loan',
       'Prepaid card', 'Student loan', 'Vehicle loan or lease',
       'Virtual currency'], dtype='<U76')

In [17]:
clf.predict_proba(X_test_transformed)

array([[8.58063034e-001, 1.41936941e-001, 9.98128288e-020, ...,
        3.12436641e-029, 3.41265054e-024, 3.01085107e-118],
       [7.89335794e-010, 1.53644698e-012, 1.37130865e-005, ...,
        1.57534543e-009, 1.06551293e-012, 2.25223723e-075],
       [3.21120960e-005, 1.86666366e-005, 1.59666312e-018, ...,
        1.02375438e-020, 4.58868363e-025, 1.68726430e-266],
       ...,
       [3.14232293e-023, 3.21660529e-028, 2.23204623e-015, ...,
        1.21354252e-020, 8.01899996e-021, 4.42254552e-108],
       [4.30228580e-001, 1.15538607e-002, 1.23173886e-002, ...,
        3.52363282e-005, 4.23299223e-007, 1.53586793e-064],
       [3.41776787e-013, 1.16744698e-015, 9.41051496e-010, ...,
        1.45914371e-011, 1.99332704e-013, 2.70216843e-051]])

In [31]:
temp = {'Comments': X_test['Consumer Complaint'], 'True class': y_test, 'Prediction': y_pred}

In [32]:
data_test = pd.DataFrame(temp).reset_index()
data_test.head()

Unnamed: 0,index,Comments,True class,Prediction
0,787985,On XX/XX/XXXX at a XXXX ATM a withdrawal trans...,Bank account or service,Bank account or service
1,474133,I have a 7 day grace period ; however I receiv...,Consumer Loan,Mortgage
2,687653,"On XX/XX/XXXX a "" Small Balance Adjustment '' ...",Bank account or service,Credit card or prepaid card
3,404975,Closed American Express account a few months a...,Credit card,Credit card
4,223405,"To whom it may concern : Years ago, I was a cu...",Bank account or service,Bank account or service


**Criando o novo dataframe incluindo a classe "Other"**

In [41]:
dec_percentage = []

dec_precentage = dec_percentage.append(list(map(max, clf.predict_proba(X_test_transformed))))
list_percentage = [f'{i*100:.1f}%' for i in dec_percentage[0]]

In [46]:
dict_to_df = {'Comments': X_test['Consumer Complaint'], 'Percentage of certainty': list_percentage, 'Decimal': (list(map(max, clf.predict_proba(X_test_transformed)))), 'True class': y_test}

data_final = pd.DataFrame(dict_to_df)
data_final['Prediction'] = np.where(data_final['Decimal'] > 0.70, y_pred, "Other")
data_final

Unnamed: 0,Comments,Percentage of certainty,Decimal,True class,Prediction
787985,On XX/XX/XXXX at a XXXX ATM a withdrawal trans...,85.8%,0.858063,Bank account or service,Bank account or service
474133,I have a 7 day grace period ; however I receiv...,100.0%,0.999857,Consumer Loan,Mortgage
687653,"On XX/XX/XXXX a "" Small Balance Adjustment '' ...",99.7%,0.997028,Bank account or service,Credit card or prepaid card
404975,Closed American Express account a few months a...,90.3%,0.903407,Credit card,Credit card
223405,"To whom it may concern : Years ago, I was a cu...",100.0%,0.999715,Bank account or service,Bank account or service
...,...,...,...,...,...
676633,"Hi, My name if XXXX and I reside at XXXX MD. X...",100.0%,1.000000,Mortgage,Mortgage
958677,"Due to Hurricane Irma, I sustained economic lo...",100.0%,1.000000,Mortgage,Mortgage
931492,On XXXX XXXX I received notification from XXXX...,99.7%,0.996896,"Credit reporting, credit repair services, or o...","Credit reporting, credit repair services, or o..."
418392,My husband and I have received several voicema...,53.5%,0.534964,Consumer Loan,Other
