# Full Categorized Classifer

If you followed the data exploration notebook, you would know that many columns were truncated, removing information from many entries. In this short notebook, all but one (company name) column were un-truncated. This was only trained on a DNN, and a Logistic Regression, since these models preformed the best

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re

#Defining what dtype to convert each column to
#numberic columns are transformed after reading in
dtype_dict = {'Product':"category",
             'Consumer consent provided?': "category",
             'Submitted via': "category",
             'Company response to consumer': "category",
             'Consumer disputed?': "category", 
             'Issue': "category"}

#read in .csv file, dates are parsed into datetime objects. 
#The Complaint ID is Unique in every entry, so it can be used as index
df = pd.read_csv('Consumer_Complaints.csv',
                 index_col=['Complaint ID'],
                 parse_dates=["Date received","Date sent to company"],
                 dtype=dtype_dict)

#This will replace ending '-' to 5 (average linespace of 10)
regexReplaceDash = r"(\d+)(-)$"
df['ZIP code'] = df['ZIP code'].str.replace(regexReplaceDash, r'\g<1>5')

#This will change ending XX to 50 (average linespace of 100)
regex_XX = r'(\d{3})(XX)'
df['ZIP code'] = df['ZIP code'].str.replace(regex_XX, r'\g<1>50')

#This will remove all other entries that are still not 5 digits
regexRemove = r'\D+'
df['ZIP code'] = df['ZIP code'].replace(regexRemove, np.nan, regex=True)

#imputes the mean for nan 
imputeMean = df['ZIP code'].astype(np.float).mean()
df['ZIP code'] = df['ZIP code'].astype(np.float).fillna(imputeMean)

#Transforming 2 unique valued col to float boolean
booleanize = {'Yes': 1, 'No': 0}
df['Timely response?'] = pd.Series(df['Timely response?'].map(booleanize), dtype = np.float)


def entryOrNull(strVal):
    return 1.0 if strVal is not np.nan else 0.0

df['Consumer complaint narrative submitted?'] = df['Consumer complaint narrative'].apply(entryOrNull)

def dtToCols(df, dtcolumn):
    df["{} day".format(dtcolumn)] = df[dtcolumn].dt.day
    df["{} month".format(dtcolumn)] = df[dtcolumn].dt.month
    df["{} year".format(dtcolumn)] = df[dtcolumn].dt.year
    
dtToCols(df, "Date received")
dtToCols(df, "Date sent to company")

df["Consumer consent provided?"] = df["Consumer consent provided?"].cat.add_categories("Not recorded").fillna("Not recorded")

df = df.drop(df[df["Company response to consumer"].isna()].index)

dfInProgress = df[df["Company response to consumer"] == "In progress"]
df = df[df["Company response to consumer"] != "In progress"]

dfUntimelyResponse = df[df["Company response to consumer"] == "Untimely response"]
df = df[df["Company response to consumer"] != "Untimely response"]


#function to apply to column to convert less common results to 'Other', as well as NaN
def convertToOther(value, keepList):
    if (value == ''):
        return "Other"
    else:
        return value if value in keepList else "Other"
    
#Lists top 100 value counts (allowed to exclude values), turns NaN to '' to others, converts to category dtype
def cleanReduceConvert(df, column, blackList=[]):
    keepList = []
    for category in df[column].value_counts().head(100).index.tolist():
        if (category.lower().split()[0] != "other"):
            keepList.append(category)
    for category in blackList:
        try:
            keepList.remove(category)
        except ValueError:
            pass

    df[column].fillna('', inplace=True)
    return pd.Series(df[column].apply(convertToOther, args=(keepList,)), dtype = 'category')

df['Company'] = cleanReduceConvert(df, 'Company') #reduce non top 100 companys to 'others'

#replace NA with "Not recorded", then cast dtype to category
df['Sub-issue'] = df["Sub-issue"].fillna("Not recorded").astype('category')
df['Sub-product'] = df["Sub-product"].fillna("Not recorded").astype('category')

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils

#data columns not be used for the model
dropList = ["Consumer complaint narrative",
            "Company public response",
            "State",
            "Tags",
            "Consumer disputed?",
            "Date received", 
            "Date sent to company",
            "Company response to consumer"]
X = df.drop(dropList, axis=1)
Y = df["Company response to consumer"]
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)

#Columns to be standard scaled/imputed
numeric_features = ['ZIP code',
                    'Date received day',
                    'Date received month',
                    'Date received year',
                    'Date sent to company day',
                    'Date sent to company month',
                    'Date sent to company year']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

#Columns to one hot encoded
categorical_features = ['Product',
           'Sub-product',
           'Issue',
           'Sub-issue',
           'Company',
           'Consumer consent provided?',
           'Submitted via',
           'Timely response?']
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

#building the column transformer with both transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

#fit the preprocessor, then transform trainging and test set, assign sparse matrix to variables
preprocessor.fit(X)
encX_train = preprocessor.transform(X_train)
encX_test = preprocessor.transform(X_test)

#Creating Dummy variable for target. Will be neccessary for DNN. 
encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)

dummy_y = np_utils.to_categorical(encoded_Y)

dummy_y_train = np_utils.to_categorical(encoder.transform(y_train))
dummy_y_test = np_utils.to_categorical(encoder.transform(y_test)) 

  interactivity=interactivity, compiler=compiler, result=result)
  mask |= (ar1 == a)
Using TensorFlow backend.


In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

lr = LogisticRegression(n_jobs=-1, solver='saga', penalty='l1', multi_class='multinomial')
lr_para = {'C':[1.0,0.1,0.01], 
           'class_weight':[None,'balanced'],
           'max_iter':[50,100,125]}

#Apply grid search with above parameters specified
fitmodel = GridSearchCV(lr, lr_para,cv=5, scoring='accuracy', n_jobs=-1)
fitmodel.fit(encX_train,y_train)

#store the best fitting LogisiticRegression(), create prediciton from X_test data
bestfitLR = fitmodel.best_estimator_

bestfitLR.fit(encX_train,y_train)
y_pred = bestfitLR.predict(encX_test)
print(bestfitLR.score(encX_test,y_test))

#display the result
print(classification_report(y_test, y_pred))



0.8048359240069085
                                 precision    recall  f1-score   support

                         Closed       0.49      0.05      0.09      5216
        Closed with explanation       0.81      0.98      0.89    327695
    Closed with monetary relief       0.55      0.10      0.18     23785
Closed with non-monetary relief       0.59      0.11      0.19     53398
             Closed with relief       0.57      0.23      0.33      1537
          Closed without relief       0.78      0.91      0.84      5249

                       accuracy                           0.80    416880
                      macro avg       0.63      0.40      0.42    416880
                   weighted avg       0.77      0.80      0.75    416880



# Head-to-Head Comparison

By truncating the prediction of the mutliclass classifier, a clear comparison can be made on well the predicitons are, if disregarding that value in the multiclass prediction being more specific in its output.

In [12]:
twoOutputsDict = {"Closed with explanation":"Closed without relief", 
              "Closed with non-monetary relief":"Closed with relief",
              "Closed with monetary relief":"Closed with relief",
              "Closed without relief":"Closed without relief", 
              "Closed":"Closed without relief",
              "Closed with relief":"Closed with relief"}
y_pred_trunc = [twoOutputsDict[x] for x in y_pred]
y_test_trunc = y_test.map(twoOutputsDict)

print(classification_report(y_test_trunc, y_pred_trunc))

                       precision    recall  f1-score   support

   Closed with relief       0.60      0.12      0.20     78720
Closed without relief       0.83      0.98      0.90    338160

             accuracy                           0.82    416880
            macro avg       0.71      0.55      0.55    416880
         weighted avg       0.78      0.82      0.76    416880



Compared to how training with an already truncated target (Recall: 0.66 and F1: 0.43 for 'Closed with relief), the recall and f1 scores are significantly worse. It is perhaps better not to train using this many labels.

# DNN Classifier

There is a good chance that keeping all the categories will allow a DNN classifier to Utilize the non-linear interaction between the features better.

In [3]:
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report
from keras_tqdm import TQDMNotebookCallback

#Build model with 2 FC layers each 50 nodes (relu activation), and 1 node output output layer (sigmoid activation)
model = Sequential()
model.add(Dense(25, init='uniform', input_dim=601, activation='relu'))
model.add(Dense(25, init='uniform', activation='relu'))
model.add(Dense(6, init= 'uniform', activation = 'softmax'))

#setup earlystop callback
earlystop_callback = EarlyStopping(monitor='accuracy', min_delta=0.0001, patience=2)

#Compiled with adam optimizer, binary crossentropy loss function, accuracy metric for evaluation
model.compile(optimizer='adam',loss='categorical_crossentropy', metrics=['accuracy'])

#train the model over 100 epochs
model.fit(encX_train, dummy_y_train, epochs=10, batch_size=5, callbacks=[TQDMNotebookCallback(), earlystop_callback], verbose=0)

#print the results
_, accuracy = model.evaluate(encX_test, dummy_y_test)
print('Accuracy: %.2f' % (accuracy*100))

#Save the trained model into an .h5 file
model.save("unfiltered_data-model-50-50.h5")


  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()





HBox(children=(IntProgress(value=0, description='Training', max=10, style=ProgressStyle(description_width='ini…

HBox(children=(IntProgress(value=0, description='Epoch 0', max=972719, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='Epoch 1', max=972719, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='Epoch 2', max=972719, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='Epoch 3', max=972719, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='Epoch 4', max=972719, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='Epoch 5', max=972719, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='Epoch 6', max=972719, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='Epoch 7', max=972719, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='Epoch 8', max=972719, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=0, description='Epoch 9', max=972719, style=ProgressStyle(description_width='…


Accuracy: 81.21


In [36]:
import tensorflow as tf

pred = model.predict(encX_test)

pred_h1 = tf.one_hot(tf.argmax(pred, axis = 1), depth = 6)

print(classification_report(dummy_y_test, pred_h1.eval(session=tf.compat.v1.Session()), target_names= encoder.classes_))

                                 precision    recall  f1-score   support

                         Closed       0.57      0.12      0.19      5251
        Closed with explanation       0.82      0.98      0.89    327475
    Closed with monetary relief       0.59      0.09      0.16     23625
Closed with non-monetary relief       0.65      0.20      0.31     53438
             Closed with relief       0.45      0.32      0.37      1658
          Closed without relief       0.74      0.86      0.80      5433

                      micro avg       0.81      0.81      0.81    416880
                      macro avg       0.64      0.43      0.45    416880
                   weighted avg       0.78      0.81      0.77    416880
                    samples avg       0.81      0.81      0.81    416880



Doing a weighted average of f1 scores, it appears that this model also does not out preform the binary logistic regression, and thus the LR model continues to be the best choice for the final production model.