In all versions 4.x the narrative gets cleaned as before and products are cleaned twice times. The first time half of the three most occuring entries in __Product__ are randomly removed. The second time , the components that almost never occur are removed. This run uses 5 different principle components. 

__Initialing the data__

In [1]:
import pandas as pd

df_selected1 = pd.read_csv("../balanced_data/corpus_balanced_cleaned_lemmatized_first_cut_scarce_elimination.csv", encoding="utf-8")

print("nulls in df_selected:", df_selected1["Consumer complaint narrative"].isnull().sum())
df_selected = df_selected1.dropna()

nulls in df_selected: 0


__Initializing the bag of words__

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

# vect (bag of words)
count_vect = CountVectorizer(
    stop_words="english",
    #ngram_range=(1,2), # bigrammen
    min_df=2, # only keep words that appear twice
    max_df=0.5 # appears max in 50% of documents
)

X_train_counts = count_vect.fit_transform(df_selected["Consumer complaint narrative"])

__Principal components are chosen__

In [3]:
from sklearn.decomposition import TruncatedSVD

tSVD = TruncatedSVD(n_components=5)

principal_components = tSVD.fit_transform(X_train_counts)
print(principal_components.shape)

(242494, 5)


In [4]:
print(principal_components)
principal_components_df = pd.DataFrame(principal_components)
print(principal_components_df)

[[ 4.26616418 -3.3707761   0.01891819  1.00721228  0.62388963]
 [ 3.96354958 -1.09950572  0.72599931 -0.56177672  0.33559394]
 [14.47191765  7.74448078 -1.5026979   1.56886974  0.64794075]
 ...
 [ 0.72788146  0.60669373 -0.70435362 -0.32203655 -0.47626891]
 [ 5.77501736  4.62786666 -5.6368094  -3.37447198 -4.59749694]
 [ 6.52529348  1.60234284  0.53302023 -1.23424193 -0.4389109 ]]
                0         1         2         3         4
0        4.266164 -3.370776  0.018918  1.007212  0.623890
1        3.963550 -1.099506  0.725999 -0.561777  0.335594
2       14.471918  7.744481 -1.502698  1.568870  0.647941
3        2.018344  0.900113  2.367260 -0.566600 -0.508107
4        4.969348 -4.724731 -1.145126 -3.919982  3.563143
...           ...       ...       ...       ...       ...
242489   4.176435 -0.166879  0.600370 -0.478135 -1.715241
242490   2.560838  1.935495 -2.688079 -1.540338 -2.619397
242491   0.727881  0.606694 -0.704354 -0.322037 -0.476269
242492   5.775017  4.627867 -5.63680

__One hot encoding of extra columns__

In [5]:
#One hot encoding
issue_df = pd.Categorical(df_selected['Issue'])

df_dummies = pd.get_dummies(issue_df, prefix = 'issue')

#X_train_counts_df = pd.DataFrame(X_train_counts)
#print("DF conversion done")

print(principal_components_df.shape)
print(df_dummies.shape)
df_concat = pd.concat([principal_components_df, df_dummies], axis = 1)
print(df_concat.shape)


(242494, 5)
(242494, 146)
(242494, 151)


__Split into train and test__

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(df_concat, df_selected['Product'])
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(181870, 151)
(60624, 151)
(181870,)
(60624,)


__Application of linear SVM__

In [7]:
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report # do more stats

classifier = svm.LinearSVC()
classifier.fit(X_train, Y_train)

test_predictions = classifier.predict(X_test)
print(classification_report(test_predictions, Y_test))



                                                    precision    recall  f1-score   support

                           Bank account or service       1.00      1.00      1.00      3656
                       Checking or savings account       0.99      1.00      1.00      4628
                                     Consumer Loan       0.96      0.78      0.86      3031
                                       Credit card       1.00      1.00      1.00      4706
                       Credit card or prepaid card       1.00      0.93      0.97      8417
                                  Credit reporting       1.00      1.00      1.00      7882
                                   Debt collection       1.00      1.00      1.00      9999
Money transfer, virtual currency, or money service       0.98      1.00      0.99      1847
                                          Mortgage       0.99      1.00      0.99      7642
         Payday loan, title loan, or personal loan       0.71      0.97      0.