In [None]:
import pandas as pd

df_selected = pd.read_pickle('corpus.pkl')

**Data Cleaning**

In [None]:
import re
import string

def clean_url(complaint):
    # to do: more regex url garbage matching
    complaint = re.sub('https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', '', complaint)
    complaint = re.sub('https? ?: ?// ?(?:[-\w.]|(?:%[\da-fA-F]{2}))+', '', complaint)
    return complaint

# Remove punctuation from complaint
def clean_punctuation(complaint):
    complaint = re.sub('[%s]' % re.escape(string.punctuation), '', complaint)
    return complaint

# Remove non-sensical characters from complaint
def clean_nonsense(complaint):
    complaint = re.sub('[''""...]', '', complaint)
    complaint = re.sub('\n', '', complaint)
    return complaint

# Remove censored words from complaint
def clean_censored(complaint):
    complaint = re.sub('[XXXX]', '', complaint)
    return complaint

# Turn everything into lowercase
def clean_lowercase(complaint):
    complaint = complaint.lower()
    return complaint

# Remove numbers from complaint
def clean_numbers(complaint):
    complaint = re.sub('\w*\d\w', '', complaint)
    return complaint

In [None]:
df_selected["Consumer complaint narrative"] = df_selected["Consumer complaint narrative"].apply(clean_url)
df_selected["Consumer complaint narrative"] = df_selected["Consumer complaint narrative"].apply(clean_punctuation)
df_selected["Consumer complaint narrative"] = df_selected["Consumer complaint narrative"].apply(clean_nonsense)
df_selected["Consumer complaint narrative"] = df_selected["Consumer complaint narrative"].apply(clean_censored)
df_selected["Consumer complaint narrative"] = df_selected["Consumer complaint narrative"].apply(clean_lowercase)
df_selected["Consumer complaint narrative"] = df_selected["Consumer complaint narrative"].apply(clean_numbers)

**Apply CountVectorizer**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# vect (bag of words)
count_vect = CountVectorizer(
    stop_words="english",
    #ngram_range=(1,2), # bigrammen
    min_df=2, # only keep words that appear twice
    max_df=0.5 # appears max in 50% of documents
)

X_train_counts = count_vect.fit_transform(df_selected["Consumer complaint narrative"])

**Split into train & test**

In [23]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X_train_counts, df_selected['Product'])
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(364275, 77541)
(121426, 77541)
(364275,)
(121426,)


**Dimensionality Reduction**

In [24]:
from sklearn.decomposition import TruncatedSVD

tSVD = TruncatedSVD(n_components=5)

# apply PCA
principal_components_train = tSVD.fit_transform(X_train)
principal_components_test = tSVD.transform(X_test)

# data points & their principal components
print(principal_components_train.shape)
print(principal_components_test.shape)

(364275, 5)
(121426, 5)


**Run algorithm and report**

In [27]:
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report # do more stats

# classifier = svm.LinearSVC()
# classifier.fit(principal_components_train, Y_train)

# test_predictions = classifier.predict(principal_components_test)
# print(classification_report(test_predictions, Y_test))

In [28]:
classifier = svm.SVC(kernel='rbf')
classifier.fit(principal_components_train, Y_train)

test_predictions = classifier.predict(principal_components_test)
print(classification_report(test_predictions, Y_test))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                                                                              precision    recall  f1-score   support

                                                     Bank account or service       0.00      0.35      0.01        34
                                                 Checking or savings account       0.46      0.34      0.39      6489
                                                               Consumer Loan       0.00      0.00      0.00         0
                                                                 Credit card       0.00      0.00      0.00         4
                                                 Credit card or prepaid card       0.43      0.30      0.36     11289
                                                            Credit reporting       0.00      0.29      0.00        17
Credit reporting, credit repair services, or other personal consumer reports       0.81      0.53      0.64     54159
                                                       

  _warn_prf(average, modifier, msg_start, len(result))
