In [1]:
import os
import sys
import numpy as np
import pandas as pd
import pickle as pkl
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout, LSTM
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras import utils
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import classification_report,confusion_matrix, confusion_matrix
from sklearn.model_selection import train_test_split

# Goal
For this project we want to use an LSTM model to predict the type of product a consumer is complaining about.  We could do preliminary analysis on the complaints, but from skimming a few of them it seems that tokenizing the consumer complaint text should be sufficient to classify most complaints into a given product.  Let's try this.  (Note: the model built in the cells below is a dense feed-forward network over a bag-of-words matrix; the imported LSTM layer is not used there.)

# Import Data

In [3]:
# Read the complaints CSV from the data/ folder under the current working
# directory and preview the first rows.
cwd = os.getcwd()
data_path = cwd + '/data/Consumer_Complaints.csv'
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer Complaint,Company Public Response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date Sent to Company,Company Response to Consumer,Timely response?,Consumer disputed?,Complaint ID,Unnamed: 18
0,03-12-2014,Mortgage,Other mortgage,"Loan modification,collection,foreclosure",,,,M&T BANK CORPORATION,MI,48382,,,Referral,03/17/2014,Closed with explanation,Yes,No,759217,
1,10-01-2016,Credit reporting,,Incorrect information on credit report,Account status,I have outdated information on my credit repor...,Company has responded to the consumer and the ...,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",AL,352XX,,Consent provided,Web,10-05-2016,Closed with explanation,Yes,No,2141773,
2,10/17/2016,Consumer Loan,Vehicle loan,Managing the loan or lease,,I purchased a new car on XXXX XXXX. The car de...,,"CITIZENS FINANCIAL GROUP, INC.",PA,177XX,Older American,Consent provided,Web,10/20/2016,Closed with explanation,Yes,No,2163100,
3,06-08-2014,Credit card,,Bankruptcy,,,,AMERICAN EXPRESS COMPANY,ID,83854,Older American,,Web,06-10-2014,Closed with explanation,Yes,Yes,885638,
4,09/13/2014,Debt collection,Credit card,Communication tactics,Frequent or repeated calls,,,"CITIBANK, N.A.",VA,23233,,,Web,09/13/2014,Closed with explanation,Yes,Yes,1027760,


After looking at the data, we can see that there are some NaN values in the consumer complaint column.  We must remove these rows if we want to proceed with a purely NLP algorithm.

In [4]:
# Keep only the complaint text and its product label, then drop every row
# whose complaint text is missing.
col = ['Consumer Complaint', 'Product']
df = df[col].dropna(subset=['Consumer Complaint'])
df.head()

Unnamed: 0,Consumer Complaint,Product
1,I have outdated information on my credit repor...,Credit reporting
2,I purchased a new car on XXXX XXXX. The car de...,Consumer Loan
7,An account on my credit report has a mistaken ...,Credit reporting
12,This company refuses to provide me verificatio...,Debt collection
16,This complaint is in regards to Square Two Fin...,Debt collection


In [5]:
# Count the remaining missing values per column.
df.isna().sum()

Consumer Complaint    0
Product               0
dtype: int64

Some of the product types are seen to be sparse within the dataset.  We will proceed with them included for now.  Additionally, some products, such as 'Credit reporting, credit repair services, or other personal consumer reports', are broad, and their individual components also appear as separate product labels (e.g. 'Credit reporting').  One could engineer these classifications to see how they would affect the model, but we do not have the information to know why these products are classified as such, so we will leave them for now.

In [6]:
# Number of complaints per product label, most frequent first.
df['Product'].value_counts()

Debt collection                                                                 63268
Credit reporting, credit repair services, or other personal consumer reports    49006
Mortgage                                                                        43837
Credit reporting                                                                31593
Credit card                                                                     18842
Student loan                                                                    16689
Bank account or service                                                         14887
Credit card or prepaid card                                                     10659
Consumer Loan                                                                    9474
Checking or savings account                                                      6489
Money transfer, virtual currency, or money service                               3089
Vehicle loan or lease                                 

In [7]:
# Number of complaints left after filtering.
df.shape[0]

277814

# Feature Engineering

In [8]:
# Features are the raw complaint texts; the target is the product label.
X = df['Consumer Complaint']
y = df['Product']

In [9]:
# Fraction of the data held out for testing, and the seed that makes the
# random split repeatable.
test_size = 0.3
seed = 2143

Xwords_train, Xwords_test, ywords_train, ywords_test = train_test_split(
    X, y, test_size=test_size, random_state=seed
)

In [10]:
# Vocabulary size limit handed to the tokenizer; it also fixes the width of
# the feature matrix and of the model's input layer.
num_words = 1000
tokenize = text.Tokenizer(
    num_words=num_words,
    char_level=False,  # word-level tokens, not characters
)

In [11]:
# Learn the vocabulary from the training texts only, then turn both splits
# into fixed-width matrices using that same vocabulary.
tokenize.fit_on_texts(Xwords_train)
X_train, X_test = (
    tokenize.texts_to_matrix(docs) for docs in (Xwords_train, Xwords_test)
)

In [12]:
# Map product names to integer class ids.
# The encoder is fitted on every label in the dataset (`y`), not only on the
# training split: some products are very rare, and a product that lands only
# in the test split would otherwise make `transform` fail on an unseen label.
encoder = LabelEncoder()
encoder.fit(y)
y_train = encoder.transform(ywords_train)
y_test = encoder.transform(ywords_test)

In [13]:
# One-hot encode the targets.
# The class count comes from the encoder rather than from max(y_train) + 1,
# so it stays correct even if the highest-index class is absent from the
# training split. The integer ids are recomputed from the raw labels, so
# re-running this cell does not one-hot encode an already one-hot array.
num_cat = len(encoder.classes_)
y_train = utils.to_categorical(encoder.transform(ywords_train), num_cat)
y_test = utils.to_categorical(encoder.transform(ywords_test), num_cat)

In [14]:
# Sanity check: report the shape of each feature/target array.
for label, arr in (('x_train', X_train), ('x_test', X_test),
                   ('y_train', y_train), ('y_test', y_test)):
    print('{} shape:'.format(label), arr.shape)

x_train shape: (138907, 1000)
x_test shape: (138907, 1000)
y_train shape: (138907, 18)
y_test shape: (138907, 18)


# Model: Architecture

In [15]:
# Training hyper-parameters: mini-batch size and number of passes over the data.
batch_size, epochs = 32, 10

In [16]:
# Feed-forward classifier over the tokenizer matrix:
# two 512-unit ReLU hidden layers, each followed by 50% dropout, then a
# softmax layer with one unit per product class.
model = Sequential([
    Dense(512, input_shape=(num_words,)),
    Activation('relu'),
    Dropout(0.5),
    Dense(512),
    Activation('relu'),
    Dropout(0.5),
    Dense(num_cat),
    Activation('softmax'),
])

# Targets are one-hot vectors, hence categorical cross-entropy.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


# Model: Training

In [17]:
# Train the network; 10% of the training data is set aside for validation.
fit_options = dict(
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_split=0.1,
)
history = model.fit(X_train, y_train, **fit_options)

Train on 125016 samples, validate on 13891 samples
Instructions for updating:
Use tf.cast instead.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Model: Results

In [18]:
# Evaluate on the held-out split; `score` holds the loss followed by the
# accuracy metric configured at compile time.
score = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=1)
for label, value in zip(('Test score', 'Test accuracy'), score[:2]):
    print('{}: {:.4f}'.format(label, value))

Test score: 0.8529
Test accuracy: 0.7266


In [20]:
# Recover integer class ids from the one-hot test targets.
actual_decoded = np.argmax(y_test, axis=1)
# `Sequential.predict_classes` was removed from recent TensorFlow releases;
# taking the argmax of the softmax output gives the same class ids and works
# on both old and new versions.
predictions = np.argmax(model.predict(X_test), axis=1)

In [21]:
# Per-product complaint counts in the test split, for comparison with the
# support column of the classification report.
ywords_test.value_counts()

Debt collection                                                                 31736
Credit reporting, credit repair services, or other personal consumer reports    24371
Mortgage                                                                        21838
Credit reporting                                                                15730
Credit card                                                                      9517
Student loan                                                                     8285
Bank account or service                                                          7391
Credit card or prepaid card                                                      5393
Consumer Loan                                                                    4790
Checking or savings account                                                      3287
Money transfer, virtual currency, or money service                               1618
Vehicle loan or lease                                 

We can note a few things from looking at the classification report:
1. The more frequent product labels have higher precision and recall
2. The 'Credit card', 'Credit card or prepaid card', 'Credit reporting', and 'Credit reporting, credit repair services, or other personal consumer reports' product types may have lower precision and recall because they overlap with one another (they are not mutually exclusive categories)
3. As expected, the sparse product labels should be thrown out; there is not much to gain from them

In [22]:
# Class names in the same order as the integer ids produced by the encoder.
# (`text_labels` was previously used here without being defined anywhere in
# the notebook, which raises NameError on a fresh kernel.)
text_labels = encoder.classes_
# Passing `labels` explicitly keeps the report aligned with `target_names`
# even if some rare class appears in neither the targets nor the predictions.
print(classification_report(actual_decoded, predictions,
                            labels=np.arange(len(text_labels)),
                            target_names=text_labels.tolist()))

                                                                              precision    recall  f1-score   support

                                                     Bank account or service       0.60      0.66      0.63      7391
                                                 Checking or savings account       0.52      0.39      0.45      3287
                                                               Consumer Loan       0.52      0.45      0.48      4790
                                                                 Credit card       0.55      0.72      0.62      9517
                                                 Credit card or prepaid card       0.54      0.28      0.37      5393
                                                            Credit reporting       0.65      0.67      0.66     15730
Credit reporting, credit repair services, or other personal consumer reports       0.74      0.68      0.71     24371
                                                       