In [1]:
import pandas as pd

df_selected = pd.read_csv("corpus_balanced3_cleaned_lemmatized.csv", encoding="utf-8")

In [2]:
print(df_selected.shape)
df_selected.head()

(203874, 2)


Unnamed: 0,Product,Consumer complaint narrative
0,"Payday loan, title loan, or personal loan",they would not let me pay my loan off day befo...
1,"Payday loan, title loan, or personal loan",service finance are liar and are charging me i...
2,Checking or savings account,over draft fee due to fraudulent charge submit...
3,Vehicle loan or lease,on i signed a car loan agreement to finance my...
4,"Money transfer, virtual currency, or money ser...",we hired and debt collection to handle collect...


In case of nulls (should be solved now):

In [3]:
df_selected = df_selected.dropna()

---

**Apply CountVectorizer**

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# vect (bag of words)
count_vect = CountVectorizer(
    stop_words="english",
    #ngram_range=(1,2), # bigrammen
    min_df=2, # only keep words that appear twice
    max_df=0.5 # appears max in 50% of documents
)

X_train_counts = count_vect.fit_transform(df_selected["Consumer complaint narrative"])

**Apply TF-IDF**

In [5]:
from sklearn.feature_extraction.text import TfidfTransformer

# Normalise with tf-idf
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print('Shape', X_train_tfidf.shape)
print(X_train_tfidf)

Shape (203870, 43559)
  (0, 42594)	0.16965368354576021
  (0, 42322)	0.08648039714491654
  (0, 40522)	0.10321436568599368
  (0, 39796)	0.24394180967991022
  (0, 39245)	0.05694221274874988
  (0, 37911)	0.19061267595948492
  (0, 37256)	0.1688706414707319
  (0, 35403)	0.15231323416834944
  (0, 34937)	0.0695020149883917
  (0, 29789)	0.12138118686475403
  (0, 29450)	0.2136685604120553
  (0, 29097)	0.14060235203800467
  (0, 28935)	0.11718425759670494
  (0, 28774)	0.10069077732453803
  (0, 28670)	0.20177582857295118
  (0, 28282)	0.19099065925142414
  (0, 27955)	0.21064105622066892
  (0, 27885)	0.18971656646407192
  (0, 25148)	0.09336914288606159
  (0, 25146)	0.14887694055511252
  (0, 25011)	0.24514381296248944
  (0, 24265)	0.06791001159505453
  (0, 23952)	0.27577507579311655
  (0, 23770)	0.12119771808697179
  (0, 22904)	0.06993415415033985
  :	:
  (203869, 19230)	0.20741981395337317
  (203869, 18093)	0.06485070293399212
  (203869, 17036)	0.0996138413253509
  (203869, 16671)	0.11912135120050359

**Split into train & test**

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X_train_counts, df_selected['Product'])
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(152902, 43559)
(50968, 43559)
(152902,)
(50968,)


**Dimensionality Reduction**

In [7]:
from sklearn.decomposition import TruncatedSVD

tSVD = TruncatedSVD(n_components=30)

# apply TruncatedSVD
principal_components_train = tSVD.fit_transform(X_train)
principal_components_test = tSVD.transform(X_test)

# data points & their principal components
print(principal_components_train.shape)
print(principal_components_test.shape)

(152902, 30)
(50968, 30)


**Run algorithm**

In [8]:
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report # do more stats

classifier = svm.LinearSVC()
classifier.fit(principal_components_train, Y_train)

test_predictions = classifier.predict(principal_components_test)



**Report**

In [9]:
print(classification_report(test_predictions, Y_test))

                                                                              precision    recall  f1-score   support

                                                     Bank account or service       0.11      0.45      0.17       876
                                                 Checking or savings account       0.75      0.45      0.56      7794
                                                               Consumer Loan       0.04      0.46      0.08       225
                                                                 Credit card       0.36      0.44      0.40      3846
                                                 Credit card or prepaid card       0.40      0.47      0.44      4023
                                                            Credit reporting       0.50      0.42      0.46      5553
Credit reporting, credit repair services, or other personal consumer reports       0.35      0.35      0.35      5388
                                                       