In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the
# last one; later cells rely on this to show e.g. both type(tf_idf) and
# tf_idf.shape from a single cell.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Load the complaint dataset and extract the model inputs:
#   texts  - the free-text complaint narratives
#   target - the label assigned to each narrative
import pandas as pd
df = pd.read_csv("version2.csv")
df.head()

texts = df["narratives"].tolist()
target = df["label"].tolist()

Unnamed: 0.1,Unnamed: 0,company,company_public_response,company_response,complaint_id,narratives,consumer_consent_provided,consumer_disputed,date_received,date_sent_to_company,...,state,sub_issue,sub_product,submitted_via,tags,timely,zip_code,Qtr,product_new,label
0,0,Experian Information Solutions Inc.,Company has responded to the consumer and the ...,Closed with explanation,2788944,There are many mistakes appear in my report wi...,Consent provided,,2018-01-20,2018-01-20,...,MD,Account information incorrect,Credit reporting,Web,,Yes,212XX,2018Q1,"Credit reporting, credit repair services, or o...","('Credit reporting, credit repair services, or..."
1,1,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",Company has responded to the consumer and the ...,Closed with explanation,2773990,I was pulling my credit to looking into buying...,Consent provided,,2018-01-06,2018-01-06,...,FL,Information belongs to someone else,Credit reporting,Web,,Yes,347XX,2018Q1,"Credit reporting, credit repair services, or o...","('Credit reporting, credit repair services, or..."
2,2,"Caliber Home Loans, Inc.",,Closed with explanation,2737995,"Since XXXX XXXX, I was working with a loan con...",Consent provided,,2017-11-26,2017-11-26,...,MD,,Conventional home mortgage,Web,,Yes,208XX,2017Q4,Mortgage,"('Mortgage', 'apply for mortgage')"
3,3,"Select Management Resources, LLC",Company believes it acted appropriately as aut...,Closed with non-monetary relief,2674806,On XX/XX/XXXX I went to NTL in Delaware for a ...,Consent provided,,2017-09-15,2017-09-15,...,PA,,Title loan,Web,,Yes,190XX,2017Q3,Loan,"('loan', 'payment')"
4,4,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",Company has responded to the consumer and the ...,Closed with non-monetary relief,2683460,"Hello CFPB, I am sending this complaint to you...",Consent provided,,2017-09-25,2017-09-25,...,MD,Their investigation did not fix an error on yo...,Credit reporting,Web,,Yes,212XX,2017Q3,"Credit reporting, credit repair services, or o...","('Credit reporting, credit repair services, or..."


### 1. Text for tf-idf: tokenized only

In [3]:
import nltk, re, string
import numpy as np

# Lower-case every narrative, split it into tokens with NLTK, and glue the
# tokens back together with single spaces (no filtering at this stage).
newtexts = [" ".join(nltk.word_tokenize(text.lower())) for text in texts]

newtexts[0:5]

['there are many mistakes appear in my report without my understanding .',
 'i was pulling my credit to looking into buying a home this year and i seen that there was quite a few unauthorized accounts on my trans union , xxxx , xxxx reports . i have not access my credit file since xxxx or xxxx and it seem as if someone use my personal information to access utilities and cable bills . i do not recognize these accounts that are listed on my credit file xxxx xxxx xxxx # xxxx , xxxx xxxx xxxx , xxxx xxxx xxxx # xxxx , xxxx xxxx xxxx ... i no longer live at the follow addresses i have been the victim of id theft so please remove the following address : xxxx xxxx xxxx xxxx , xxxx , oh xxxx xxxx xxxx xxxx xxxx , xxxx , oh xxxx xxxx xxxx xxxx xxxx xxxx xxxx , xxxx , fl xxxx xxxx xxxx xxxx xxxx xxxx xxxx , xxxx xxxx , fl xxxx xxxx xxxx xxxx xxxx , xxxx , oh xxxx xxxx/xxxx/xxxx xxxx xxxx xxxx , xxxx , oh xxxx xxxx xxxx xxxx , xxxx , oh xxxx xxxx xxxx xxxx xxxx , xxxx xxxx oh xxxx xxxx xxxx xxxx 

- vocabulary size (distinct tf-idf features): 53779

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the tf-idf matrix from the pre-tokenized narratives using the
# vectorizer's default settings (one row per narrative in newtexts).
# NOTE(review): the vectorizer is fitted on all documents, before the
# train/test split done in a later cell, so its vocabulary and idf weights
# also see the test documents - confirm this is acceptable.
tfidf_vect = TfidfVectorizer() 
tf_idf = tfidf_vect.fit_transform(newtexts)

# Show the matrix type and its (rows, columns) shape.
type(tf_idf)
tf_idf.shape

scipy.sparse.csr.csr_matrix

(87592, 53779)

In [6]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# split dataset into train (70%) and test sets (30%); stratifying on the
# target keeps the label distribution the same in both splits
X_train, X_test, y_train, y_test = train_test_split(
    tf_idf, target, test_size=0.3, random_state=0, stratify=target)

# train a multinomial naive Bayes model (default smoothing) on the training data
clf = MultinomialNB().fit(X_train, y_train)

# predict the complaint label for the test dataset
predicted = clf.predict(X_test)

# get the sorted list of unique labels
labels = sorted(set(target))

# calculate per-label performance metrics (arrays follow the order of `labels`)
precision, recall, fscore, support = precision_recall_fscore_support(
    y_test, predicted, labels=labels)

# pass `labels` explicitly: without it the report infers the label set from
# y_test/predicted, and `target_names` would be misaligned (or rejected) if
# any label happened to be absent from both
print(classification_report(
    y_test, predicted, labels=labels, target_names=labels))

print("Accuracy:")
print(accuracy_score(y_test, predicted))

  'precision', 'predicted', average, warn_for)


                                                                                                                                                                      precision    recall  f1-score   support

                                                                                                               ('Checking or savings account', 'Closing an account')       0.00      0.00      0.00       198
                                                                                                              ('Checking or savings account', 'Managing an account')       0.45      0.02      0.03       848
                                                                                                               ('Checking or savings account', 'Opening an account')       0.00      0.00      0.00       181
                                                                                           ('Checking or savings account', 'Problem caused by your funds being low')       0.00

0.231296141259


In [None]:
# tune parameters using grid search

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split


# hold out the same 30% test split that the evaluation cells use (identical
# test_size / random_state / stratify arguments), so the documents used to
# report accuracy never influence the choice of hyper-parameters
texts_train, texts_test, target_train, target_test = train_test_split(
    texts, target, test_size=0.3, random_state=0, stratify=target)

# build a pipeline to generate the tf-idf matrix and train the classifier all
# together; the vectorizer is re-fitted inside every cross-validation fold
text_clf = Pipeline([('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])


# set the range of parameters to be tuned
parameters = {'tfidf__min_df': [1, 2, 5, 10], 'clf__alpha': [0.1, 0.2, 0.5, 1.0]}

# set the metric used to select the best parameters
metric = "f1_macro"


# conduct the grid search (6-fold cross-validation on the training split only)

gs_clf = GridSearchCV(text_clf, param_grid=parameters, scoring=metric, cv=6)
gs_clf = gs_clf.fit(texts_train, target_train)

for param_name in gs_clf.best_params_:
    print(param_name,": ",gs_clf.best_params_[param_name])

print("best f1 score:", gs_clf.best_score_)


### Tuned parameters: min_df=10, alpha=0.1 (held fixed for all remaining experiments)

- vocabulary size (distinct tf-idf features): 12697

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Rebuild the tf-idf matrix with min_df=10, i.e. terms that appear in fewer
# than 10 narratives are dropped from the vocabulary.
# NOTE(review): the vectorizer is fitted on all documents, before the
# train/test split done in a later cell, so min_df pruning and idf weights
# also see the test documents - confirm this is acceptable.
tfidf_vect = TfidfVectorizer(min_df=10) 
tf_idf = tfidf_vect.fit_transform(newtexts)

# Show the matrix type and its (rows, columns) shape.
type(tf_idf)
tf_idf.shape

scipy.sparse.csr.csr_matrix

(87592, 12697)

In [8]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# split dataset into train (70%) and test sets (30%); stratifying on the
# target keeps the label distribution the same in both splits
X_train, X_test, y_train, y_test = train_test_split(
    tf_idf, target, test_size=0.3, random_state=0, stratify=target)

# train a multinomial naive Bayes model (alpha=0.1) on the training data
clf = MultinomialNB(alpha=0.1).fit(X_train, y_train)

# predict the complaint label for the test dataset
predicted = clf.predict(X_test)

# get the sorted list of unique labels
labels = sorted(set(target))

# calculate per-label performance metrics (arrays follow the order of `labels`)
precision, recall, fscore, support = precision_recall_fscore_support(
    y_test, predicted, labels=labels)

# pass `labels` explicitly: without it the report infers the label set from
# y_test/predicted, and `target_names` would be misaligned (or rejected) if
# any label happened to be absent from both
print(classification_report(
    y_test, predicted, labels=labels, target_names=labels))

print("Accuracy:")
print(accuracy_score(y_test, predicted))

  'precision', 'predicted', average, warn_for)


                                                                                                                                                                      precision    recall  f1-score   support

                                                                                                               ('Checking or savings account', 'Closing an account')       1.00      0.03      0.05       198
                                                                                                              ('Checking or savings account', 'Managing an account')       0.38      0.80      0.51       848
                                                                                                               ('Checking or savings account', 'Opening an account')       0.72      0.29      0.42       181
                                                                                           ('Checking or savings account', 'Problem caused by your funds being low')       0.65

0.519826470812


### 2. Text for tf-idf: tokenize + stopwords & punctuation removed

In [9]:
import nltk, re, string
import numpy as np
from nltk.corpus import stopwords

# English stop words plus the "xx"/"xxxx" redaction placeholders found in
# the narratives
stop_words = stopwords.words('english')
stop_words += ["xx", "xxxx"]

# set copies give constant-time membership tests inside the token loop
# (stop_words itself stays a list for any other cell that uses it)
stop_word_set = set(stop_words)
punct_chars = set(string.punctuation)

newtexts = []
for text in texts:
    text_tokens = []
    for token in nltk.word_tokenize(text.lower()):
        token = token.strip()
        if token in stop_word_set:
            continue
        # drop tokens made up only of punctuation characters; the previous
        # `token in string.punctuation` was a substring test, so multi-character
        # tokens such as "..." slipped through
        if all(ch in punct_chars for ch in token):
            continue
        text_tokens.append(token)
    newtexts.append(" ".join(text_tokens))

newtexts[0:5]

['many mistakes appear report without understanding',
 'pulling credit looking buying home year seen quite unauthorized accounts trans union reports access credit file since seem someone use personal information access utilities cable bills recognize accounts listed credit file ... longer live follow addresses victim id theft please remove following address oh oh fl fl oh xxxx/xxxx/xxxx oh oh oh oh oh fl sq oh oh xxxx/xxxx/xxxx oh oh',
 "since working loan consultant caliber home loans looking purchase new home 've worked past trusted would help obtain new home loan previously presenting issue nine months caliber home loans used deceptive predatory lending practices attempt obtain home loan behalf knew would qualify get explain learned reputable lender qualified purchase home loan 300000.00 9 months caliber homes provided pre-approvals 4 different home range 400000.00 640000.00 knew way capable obtaining loan range knowingly continued deceive knowing selling current home rent new home 

- vocabulary size (distinct tf-idf features): 12673

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the tf-idf matrix (min_df=10) from the narratives with stop words
# and punctuation removed.
# NOTE(review): the vectorizer is fitted on all documents, before the
# train/test split done in a later cell, so min_df pruning and idf weights
# also see the test documents - confirm this is acceptable.
tfidf_vect = TfidfVectorizer(min_df=10) 
tf_idf = tfidf_vect.fit_transform(newtexts)

# Show the matrix type and its (rows, columns) shape.
type(tf_idf)
tf_idf.shape

scipy.sparse.csr.csr_matrix

(87592, 12673)

In [11]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# split dataset into train (70%) and test sets (30%); stratifying on the
# target keeps the label distribution the same in both splits
X_train, X_test, y_train, y_test = train_test_split(
    tf_idf, target, test_size=0.3, random_state=0, stratify=target)

# train a multinomial naive Bayes model (alpha=0.1) on the training data
clf = MultinomialNB(alpha=0.1).fit(X_train, y_train)

# predict the complaint label for the test dataset
predicted = clf.predict(X_test)

# get the sorted list of unique labels
labels = sorted(set(target))

# calculate per-label performance metrics (arrays follow the order of `labels`)
precision, recall, fscore, support = precision_recall_fscore_support(
    y_test, predicted, labels=labels)

# pass `labels` explicitly: without it the report infers the label set from
# y_test/predicted, and `target_names` would be misaligned (or rejected) if
# any label happened to be absent from both
print(classification_report(
    y_test, predicted, labels=labels, target_names=labels))

print("Accuracy:")
print(accuracy_score(y_test, predicted))

  'precision', 'predicted', average, warn_for)


                                                                                                                                                                      precision    recall  f1-score   support

                                                                                                               ('Checking or savings account', 'Closing an account')       0.70      0.04      0.07       198
                                                                                                              ('Checking or savings account', 'Managing an account')       0.38      0.79      0.52       848
                                                                                                               ('Checking or savings account', 'Opening an account')       0.69      0.33      0.44       181
                                                                                           ('Checking or savings account', 'Problem caused by your funds being low')       0.57

0.53192784839


### 3. Text for tf-idf: tokenize + stopwords & punctuation removed + non-English words removed

In [12]:
import nltk, re, string
import numpy as np
from nltk.corpus import stopwords

# English stop words plus the "xx"/"xxxx" redaction placeholders found in
# the narratives
stop_words = stopwords.words('english')
stop_words += ["xx", "xxxx"]
# vocabulary of the NLTK "words" corpus, used as the English-word whitelist
words = set(nltk.corpus.words.words())

# set copies give constant-time membership tests inside the token loop
# (stop_words itself stays a list for any other cell that uses it)
stop_word_set = set(stop_words)
punct_chars = set(string.punctuation)

newtexts = []
for text in texts:
    text_tokens = []
    for token in nltk.word_tokenize(text.lower()):
        token = token.strip()
        if token in stop_word_set:
            continue
        # drop tokens made up only of punctuation characters (a character-wise
        # check; `token in string.punctuation` would be a substring test)
        if all(ch in punct_chars for ch in token):
            continue
        # keep only tokens that appear in the English word list
        if token not in words:
            continue
        text_tokens.append(token)
    newtexts.append(" ".join(text_tokens))

newtexts[0:5]

['many appear report without understanding',
 'credit looking home year seen quite unauthorized union access credit file since seem someone use personal information access cable recognize listed credit file longer live follow victim id theft please remove following address oh oh oh oh oh oh oh oh oh oh oh oh',
 'since working loan consultant caliber home looking purchase new home worked past would help obtain new home loan previously issue nine caliber home used deceptive predatory attempt obtain home loan behalf knew would qualify get explain learned reputable lender qualified purchase home loan caliber provided different home range knew way capable loan range knowingly continued deceive knowing selling current home rent new home provided preapproval purchase knowing made admitted omission ran credit twice go forward previous home sale sale last house knew would lose deposit home afford purchase never would able buy access since took loss purchase old house lost deposit new home movin

- vocabulary size (distinct tf-idf features): 6923

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the tf-idf matrix (min_df=10) from the narratives restricted to
# tokens found in the English word list.
# NOTE(review): the vectorizer is fitted on all documents, before the
# train/test split done in a later cell, so min_df pruning and idf weights
# also see the test documents - confirm this is acceptable.
tfidf_vect = TfidfVectorizer(min_df=10) 
tf_idf = tfidf_vect.fit_transform(newtexts)

# Show the matrix type and its (rows, columns) shape.
type(tf_idf)
tf_idf.shape

scipy.sparse.csr.csr_matrix

(87592, 6923)

In [14]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# split dataset into train (70%) and test sets (30%); stratifying on the
# target keeps the label distribution the same in both splits
X_train, X_test, y_train, y_test = train_test_split(
    tf_idf, target, test_size=0.3, random_state=0, stratify=target)

# train a multinomial naive Bayes model (alpha=0.1) on the training data
clf = MultinomialNB(alpha=0.1).fit(X_train, y_train)

# predict the complaint label for the test dataset
predicted = clf.predict(X_test)

# get the sorted list of unique labels
labels = sorted(set(target))

# calculate per-label performance metrics (arrays follow the order of `labels`)
precision, recall, fscore, support = precision_recall_fscore_support(
    y_test, predicted, labels=labels)

# pass `labels` explicitly: without it the report infers the label set from
# y_test/predicted, and `target_names` would be misaligned (or rejected) if
# any label happened to be absent from both
print(classification_report(
    y_test, predicted, labels=labels, target_names=labels))

print("Accuracy:")
print(accuracy_score(y_test, predicted))

  'precision', 'predicted', average, warn_for)


                                                                                                                                                                      precision    recall  f1-score   support

                                                                                                               ('Checking or savings account', 'Closing an account')       0.64      0.04      0.07       198
                                                                                                              ('Checking or savings account', 'Managing an account')       0.36      0.72      0.48       848
                                                                                                               ('Checking or savings account', 'Opening an account')       0.69      0.35      0.46       181
                                                                                           ('Checking or savings account', 'Problem caused by your funds being low')       0.68

0.496460917878


### 4. Text for tf-idf: tokenize + stopwords & punctuation removed + non-English words removed + tokens stemmed

In [15]:
import nltk, re, string
import numpy as np
from sklearn.preprocessing import normalize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# English stop words plus the "xx"/"xxxx" redaction placeholders found in
# the narratives
stop_words = stopwords.words('english')
stop_words += ["xx", "xxxx"]
# vocabulary of the NLTK "words" corpus, used as the English-word whitelist
words = set(nltk.corpus.words.words())

# set copies give constant-time membership tests inside the token loop
# (stop_words itself stays a list for any other cell that uses it)
stop_word_set = set(stop_words)
punct_chars = set(string.punctuation)

# one stemmer is enough; there is no need to build a new one per narrative
porter_stemmer = PorterStemmer()

newtexts = []
for text in texts:
    stem_text_tokens = []
    for token in nltk.word_tokenize(text.lower()):
        token = token.strip()
        if token in stop_word_set:
            continue
        # drop tokens made up only of punctuation characters (a character-wise
        # check; `token in string.punctuation` would be a substring test)
        if all(ch in punct_chars for ch in token):
            continue
        # keep only tokens that appear in the English word list
        if token not in words:
            continue
        # reduce the surviving token to its Porter stem
        stem_text_tokens.append(porter_stemmer.stem(token))
    newtexts.append(" ".join(stem_text_tokens))

newtexts[0:5]

['mani appear report without understand',
 'credit look home year seen quit unauthor union access credit file sinc seem someon use person inform access cabl recogn list credit file longer live follow victim id theft pleas remov follow address oh oh oh oh oh oh oh oh oh oh oh oh',
 'sinc work loan consult calib home look purchas new home work past would help obtain new home loan previous issu nine calib home use decept predatori attempt obtain home loan behalf knew would qualifi get explain learn reput lender qualifi purchas home loan calib provid differ home rang knew way capabl loan rang knowingli continu deceiv know sell current home rent new home provid preapprov purchas know made admit omiss ran credit twice go forward previou home sale sale last hous knew would lose deposit home afford purchas never would abl buy access sinc took loss purchas old hous lost deposit new home move unfortun owner hous forc vacat also impact hous market decept omiss addit stop correspond final told tru

- vocabulary size (distinct tf-idf features): 5210

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the tf-idf matrix (min_df=10) from the stemmed narratives.
# NOTE(review): the vectorizer is fitted on all documents, before the
# train/test split done in a later cell, so min_df pruning and idf weights
# also see the test documents - confirm this is acceptable.
tfidf_vect = TfidfVectorizer(min_df=10) 
tf_idf = tfidf_vect.fit_transform(newtexts)

# Show the matrix type and its (rows, columns) shape.
type(tf_idf)
tf_idf.shape

scipy.sparse.csr.csr_matrix

(87592, 5210)

In [17]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# split dataset into train (70%) and test sets (30%); stratifying on the
# target keeps the label distribution the same in both splits
X_train, X_test, y_train, y_test = train_test_split(
    tf_idf, target, test_size=0.3, random_state=0, stratify=target)

# train a multinomial naive Bayes model (alpha=0.1) on the training data
clf = MultinomialNB(alpha=0.1).fit(X_train, y_train)

# predict the complaint label for the test dataset
predicted = clf.predict(X_test)

# get the sorted list of unique labels
labels = sorted(set(target))

# calculate per-label performance metrics (arrays follow the order of `labels`)
precision, recall, fscore, support = precision_recall_fscore_support(
    y_test, predicted, labels=labels)

# pass `labels` explicitly: without it the report infers the label set from
# y_test/predicted, and `target_names` would be misaligned (or rejected) if
# any label happened to be absent from both
print(classification_report(
    y_test, predicted, labels=labels, target_names=labels))

print("Accuracy:")
print(accuracy_score(y_test, predicted))

  'precision', 'predicted', average, warn_for)


                                                                                                                                                                      precision    recall  f1-score   support

                                                                                                               ('Checking or savings account', 'Closing an account')       0.57      0.04      0.08       198
                                                                                                              ('Checking or savings account', 'Managing an account')       0.37      0.71      0.48       848
                                                                                                               ('Checking or savings account', 'Opening an account')       0.74      0.35      0.47       181
                                                                                           ('Checking or savings account', 'Problem caused by your funds being low')       0.62

0.4908288302


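### Conclusion:
### Removing stopwords (plus "xx" and "xxxx") and punctuation gives the best performance of the Naive Bayes model among the four preprocessing variants (accuracy 0.532, versus 0.520 for tokenization only, 0.496 after also removing non-English words, and 0.491 after also stemming).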