In all versions 1.x no data gets cleaned except for the narrative. This run uses 20 different principle components.

__Initialing the data__

In [6]:
import pandas as pd

df = pd.read_csv("../../../complaints-2020-01-22_08_24.csv", encoding="utf-8")

df_selected = df.loc[:, ('Product', 'Consumer complaint narrative', 'Issue')]

__Functions to clean the Consumer complaint narrative__

In [7]:
import re
import string

def clean_url(complaint):
    # to do: more regex url garbage matching
    complaint = re.sub('https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', '', complaint)
    complaint = re.sub('https? ?: ?// ?(?:[-\w.]|(?:%[\da-fA-F]{2}))+', '', complaint)
    return complaint

# Remove punctuation from complaint
def clean_punctuation(complaint):
    complaint = re.sub('[%s]' % re.escape(string.punctuation), '', complaint)
    return complaint

# Remove non-sensical characters from complaint
def clean_nonsense(complaint):
    complaint = re.sub('[''""...]', '', complaint)
    complaint = re.sub('\n', '', complaint)
    return complaint

# Remove censored words from complaint
def clean_censored(complaint):
    complaint = re.sub('[XXXX]', '', complaint)
    return complaint

# Turn everything into lowercase
def clean_lowercase(complaint):
    complaint = complaint.lower()
    return complaint

# Remove numbers from complaint
def clean_numbers(complaint):
    complaint = re.sub('\w*\d\w', '', complaint)
    return complaint

__Application of narrative cleaning__

In [8]:
df_selected["Consumer complaint narrative"] = df_selected["Consumer complaint narrative"].apply(clean_url)
df_selected["Consumer complaint narrative"] = df_selected["Consumer complaint narrative"].apply(clean_punctuation)
df_selected["Consumer complaint narrative"] = df_selected["Consumer complaint narrative"].apply(clean_nonsense)
df_selected["Consumer complaint narrative"] = df_selected["Consumer complaint narrative"].apply(clean_censored)
df_selected["Consumer complaint narrative"] = df_selected["Consumer complaint narrative"].apply(clean_lowercase)
df_selected["Consumer complaint narrative"] = df_selected["Consumer complaint narrative"].apply(clean_numbers)

__Initializing the bag of words__

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

# vect (bag of words)
count_vect = CountVectorizer(
    stop_words="english",
    #ngram_range=(1,2), # bigrammen
    min_df=2, # only keep words that appear twice
    max_df=0.5 # appears max in 50% of documents
)

X_train_counts = count_vect.fit_transform(df_selected["Consumer complaint narrative"])

__Principal components are chosen__

In [10]:
from sklearn.decomposition import TruncatedSVD

tSVD = TruncatedSVD(n_components=20)

principal_components = tSVD.fit_transform(X_train_counts)
print(principal_components.shape)

(485701, 20)


In [11]:
principal_components_df = pd.DataFrame(principal_components)

__One hot encoding of extra columns__

In [12]:
#One hot encoding
df_selected['Issue'] = pd.Categorical(df_selected['Issue'])
df_dummies = pd.get_dummies(df_selected['Issue'], prefix = 'issue')

df_concat = pd.concat([principal_components_df, df_dummies], axis = 1)

__Split into train and test__

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(df_concat, df_selected['Product'])
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(364275, 181)
(121426, 181)
(364275,)
(121426,)


__Application of linear SVM__

In [14]:
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report # do more stats

classifier = svm.LinearSVC()
classifier.fit(X_train, Y_train)

test_predictions = classifier.predict(X_test)
print(classification_report(test_predictions, Y_test))

  _warn_prf(average, modifier, msg_start, len(result))


                                                                              precision    recall  f1-score   support

                                                     Bank account or service       1.00      1.00      1.00      3761
                                                 Checking or savings account       0.99      1.00      1.00      4718
                                                               Consumer Loan       0.96      0.78      0.86      2932
                                                                 Credit card       1.00      1.00      1.00      4630
                                                 Credit card or prepaid card       0.95      0.97      0.96      7785
                                                            Credit reporting       1.00      1.00      1.00      7811
Credit reporting, credit repair services, or other personal consumer reports       1.00      0.97      0.98     36530
                                                       