In [11]:
import pandas as pd

df_selected = pd.read_csv("../lemmatizers/corpus_lemmatized_cleaned_once.csv", encoding="utf-8")

In [12]:
print(df_selected.shape)
df_selected.head()

(287475, 3)


Unnamed: 0,Product,Consumer complaint narrative,Issue
0,"Payday loan, title loan, or personal loan",they would not let me pay my loan off day befo...,Problem with the payoff process at the end of ...
1,"Payday loan, title loan, or personal loan",service finance are liar and are charging me i...,Charged fees or interest you didn't expect
2,Credit card or prepaid card,re amex card ending dispute were done in time ...,Fees or interest
3,Checking or savings account,over draft fee due to fraudulent charge submit...,Problem caused by your funds being low
4,Student loan,i took out a loan to go to school total of in ...,Dealing with your lender or servicer


In case of nulls (should be solved now):

In [13]:
df_selected = df_selected.dropna()

---

__Functions to clean the Consumer complaint narrative__

In [14]:
import re
import string

def clean_url(complaint):
    # to do: more regex url garbage matching
    complaint = re.sub('https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', '', complaint)
    complaint = re.sub('https? ?: ?// ?(?:[-\w.]|(?:%[\da-fA-F]{2}))+', '', complaint)
    return complaint

# Remove punctuation from complaint
def clean_punctuation(complaint):
    complaint = re.sub('[%s]' % re.escape(string.punctuation), '', complaint)
    return complaint

# Remove non-sensical characters from complaint
def clean_nonsense(complaint):
    complaint = re.sub('[''""...]', '', complaint)
    complaint = re.sub('\n', '', complaint)
    return complaint

# Remove censored words from complaint
def clean_censored(complaint):
    complaint = re.sub('[XXXX]', '', complaint)
    return complaint

# Turn everything into lowercase
def clean_lowercase(complaint):
    complaint = complaint.lower()
    return complaint

# Remove numbers from complaint
def clean_numbers(complaint):
    complaint = re.sub('\w*\d\w', '', complaint)
    return complaint

__Application of narrative cleaning__

In [15]:
df_selected["Consumer complaint narrative"] = df_selected["Consumer complaint narrative"].apply(clean_url)
df_selected["Consumer complaint narrative"] = df_selected["Consumer complaint narrative"].apply(clean_punctuation)
df_selected["Consumer complaint narrative"] = df_selected["Consumer complaint narrative"].apply(clean_nonsense)
df_selected["Consumer complaint narrative"] = df_selected["Consumer complaint narrative"].apply(clean_censored)
df_selected["Consumer complaint narrative"] = df_selected["Consumer complaint narrative"].apply(clean_lowercase)
df_selected["Consumer complaint narrative"] = df_selected["Consumer complaint narrative"].apply(clean_numbers)

**Apply CountVectorizer**

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

# vect (bag of words)
count_vect = CountVectorizer(
    stop_words="english",
    #ngram_range=(1,2), # bigrammen
    min_df=2, # only keep words that appear twice
    max_df=0.5 # appears max in 50% of documents
)

X_train_counts = count_vect.fit_transform(df_selected["Consumer complaint narrative"])

**Apply TF-IDF**

In [17]:
from sklearn.feature_extraction.text import TfidfTransformer

# Normalise with tf-idf
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print('Shape', X_train_tfidf.shape)
print(X_train_tfidf)

Shape (287473, 52953)
  (0, 51756)	0.1674878820082747
  (0, 51399)	0.08693620815421989
  (0, 49181)	0.1034536186470629
  (0, 48320)	0.24770225383825142
  (0, 47678)	0.057776090444155874
  (0, 46041)	0.19139846735564733
  (0, 45275)	0.16852306930923108
  (0, 43077)	0.15257147076923896
  (0, 42529)	0.06966303529690976
  (0, 36213)	0.12103340180406812
  (0, 35816)	0.2149129906662757
  (0, 35382)	0.14059267172188622
  (0, 35188)	0.11727101016148452
  (0, 35004)	0.09955709292100885
  (0, 34885)	0.20123127127154886
  (0, 34416)	0.19089065628393373
  (0, 34023)	0.2126757116879758
  (0, 33947)	0.19140492487067767
  (0, 30568)	0.0935519791572946
  (0, 30565)	0.14772993663635492
  (0, 30407)	0.24315797873130784
  (0, 29505)	0.07026079910295367
  (0, 29123)	0.2735185105058723
  (0, 28914)	0.12118190880579335
  (0, 27880)	0.0706123048158684
  :	:
  (287472, 20610)	0.06085635445959746
  (287472, 20407)	0.05916585515483789
  (287472, 20348)	0.10741364390630888
  (287472, 20287)	0.060575135700238016


**Dimensionality Reduction**

In [18]:
from sklearn.decomposition import TruncatedSVD

tSVD = TruncatedSVD(n_components=20)

principal_components = tSVD.fit_transform(X_train_tfidf)
print(principal_components.shape)

(287473, 20)


__One hot encoding of extra columns__

In [19]:
principal_components_df = pd.DataFrame(principal_components)

new_df = pd.Categorical(df_selected['Issue'])
df_dummies = pd.get_dummies(new_df)

df_concat = pd.concat([principal_components_df, df_dummies], axis = 1)
print(df_concat)

               0         1         2         3         4         5         6  \
0       0.241412 -0.186882  0.109470 -0.010186 -0.011288  0.008090  0.029554   
1       0.222577 -0.080115  0.009368 -0.030870 -0.055723 -0.057052 -0.025391   
2       0.109440 -0.021436 -0.060602  0.002063  0.007531  0.030237 -0.000774   
3       0.122730 -0.021649 -0.104533  0.014870 -0.037871 -0.088083 -0.016706   
4       0.269787 -0.187078  0.235362  0.104778 -0.210958  0.107831 -0.046290   
5       0.443205 -0.253777  0.121944 -0.145414  0.187469 -0.143329  0.036675   
6       0.243051 -0.001269 -0.095841  0.144299  0.016477 -0.103414  0.075885   
7       0.136003 -0.061686 -0.157833 -0.044922  0.063358  0.235091 -0.057778   
8       0.108116 -0.056557  0.011018  0.034978 -0.028569  0.021519 -0.004532   
9       0.118113  0.012160  0.051335 -0.045335  0.036157 -0.016990 -0.027833   
10      0.150571 -0.042486  0.078873  0.029270 -0.015608 -0.002198 -0.014248   
11      0.100102 -0.001973 -0.028954  0.

**Split into train & test**

In [20]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(df_concat, df_selected['Product'])
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(215604, 169)
(71869, 169)
(215604,)
(71869,)


**Run algorithm**

In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report # do more stats

clf = RandomForestClassifier()
clf.fit(X_train, Y_train)

test_predictions = clf.predict(X_test)

**Report**

In [22]:
print(classification_report(test_predictions, Y_test))

                                                                              precision    recall  f1-score   support

                                                     Bank account or service       0.99      1.00      0.99      3691
                                                 Checking or savings account       0.99      1.00      0.99      4671
                                                               Consumer Loan       0.88      0.80      0.84      2597
                                                                 Credit card       0.97      0.94      0.95      4772
                                                 Credit card or prepaid card       0.95      0.98      0.96      7744
                                                            Credit reporting       1.00      1.00      1.00      7883
Credit reporting, credit repair services, or other personal consumer reports       0.98      0.92      0.95     11543
                                                       