In [1]:
import pandas as pd

df_selected = pd.read_csv("corpus_balanced3_cleaned_lemmatized.csv", encoding="utf-8")
df_selected.head()

Unnamed: 0,Product,Consumer complaint narrative
0,"Credit reporting, credit repair services, or o...",i have complained many time that the credit re...
1,Debt collection,please review the current fraud account and al...
2,Debt collection,called multiple time over the year for a debt ...
3,Debt collection,i sent in a letter to the company to have them...
4,"Credit reporting, credit repair services, or o...",on i applied for a debt relief product from th...


In case of nulls (should be solved now):

In [2]:
df_selected = df_selected.dropna()

---

**Apply CountVectorizer**

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# vect (bag of words)
count_vect = CountVectorizer(
    stop_words="english",
    #ngram_range=(1,2), # bigrammen
    min_df=2, # only keep words that appear twice
    max_df=0.5 # appears max in 50% of documents
)

X_train_counts = count_vect.fit_transform(df_selected["Consumer complaint narrative"])

**Apply TF-IDF**

In [4]:
from sklearn.feature_extraction.text import TfidfTransformer

# Normalise with tf-idf
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print('Shape', X_train_tfidf.shape)
print(X_train_tfidf)

Shape (485688, 71567)
  (0, 71192)	0.05230595616010456
  (0, 70723)	0.16833478998169477
  (0, 69468)	0.0719075584637063
  (0, 69258)	0.06503017384693298
  (0, 69160)	0.09531042729330397
  (0, 68017)	0.12124308114444977
  (0, 66290)	0.18586637900118472
  (0, 64339)	0.12320350827353503
  (0, 64131)	0.042060110277500326
  (0, 63853)	0.11539433841016501
  (0, 63654)	0.07951149797926328
  (0, 61778)	0.09707771835299159
  (0, 61535)	0.14217081952709348
  (0, 59978)	0.1052462673028251
  (0, 58490)	0.12505200999905375
  (0, 57754)	0.057482791458895045
  (0, 57740)	0.1264306013900411
  (0, 57309)	0.11466435910050997
  (0, 56779)	0.07124095663790687
  (0, 55054)	0.09331491070099641
  (0, 55044)	0.09882442354430733
  (0, 53961)	0.05435146164625784
  (0, 53859)	0.039928334054282255
  (0, 53163)	0.14878862661034029
  (0, 52419)	0.14608657350753182
  :	:
  (485687, 69160)	0.1830055859006121
  (485687, 66962)	0.2097008105134163
  (485687, 65593)	0.2667716011246174
  (485687, 64570)	0.0932600982486483

**Split into train & test**

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X_train_counts, df_selected['Product'])
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(364266, 71567)
(121422, 71567)
(364266,)
(121422,)


**Run algorithm**

In [6]:
from sklearn.decomposition import TruncatedSVD

tSVD = TruncatedSVD(n_components=30)

# apply TruncatedSVD
principal_components_train = tSVD.fit_transform(X_train)
principal_components_test = tSVD.transform(X_test)

# data points & their principal components
print(principal_components_train.shape)
print(principal_components_test.shape)

(364266, 30)
(121422, 30)


**Report**

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report # do more stats

clf = RandomForestClassifier()
clf.fit(principal_components_train, Y_train)

test_predictions = clf.predict(principal_components_test)
print(classification_report(test_predictions, Y_test))

                                                                              precision    recall  f1-score   support

                                                     Bank account or service       0.24      0.38      0.30      2393
                                                 Checking or savings account       0.56      0.44      0.50      6024
                                                               Consumer Loan       0.06      0.38      0.10       358
                                                                 Credit card       0.11      0.41      0.17      1246
                                                 Credit card or prepaid card       0.62      0.47      0.54     10286
                                                            Credit reporting       0.23      0.85      0.36      2122
Credit reporting, credit repair services, or other personal consumer reports       0.90      0.71      0.79     44453
                                                       