In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re

#Defining what dtype to convert each column to
#numberic columns are transformed after reading in
dtype_dict = {'Product':"category",
             'Consumer consent provided?': "category",
             'Submitted via': "category",
             'Copany response to consumer': "category",
             'Consumer disputed?': "category"}

#read in .csv file, dates are parsed into datetime objects. 
#The Complaint ID is Unique in every entry, so it can be used as index
df = pd.read_csv('Consumer_Complaints.csv',
                 index_col=['Complaint ID'],
                 parse_dates=["Date received","Date sent to company"],
                 dtype=dtype_dict)

#This will replace ending '-' to 5 (average linespace of 10)
regexReplaceDash = r"(\d+)(-)$"
df['ZIP code'] = df['ZIP code'].str.replace(regexReplaceDash, r'\g<1>5')

#This will change ending XX to 50 (average linespace of 100)
regex_XX = r'(\d{3})(XX)'
df['ZIP code'] = df['ZIP code'].str.replace(regex_XX, r'\g<1>50')

#This will remove all other entries that are still not 5 digits
regexRemove = r'\D+'
df['ZIP code'] = df['ZIP code'].replace(regexRemove, np.nan, regex=True)

#imputes the mean for nan 
imputeMean = df['ZIP code'].astype(np.float).mean()
df['ZIP code'] = df['ZIP code'].astype(np.float).fillna(imputeMean)

#Transforming 2 unique valued col to float boolean
booleanize = {'Yes': 1, 'No': 0}
df['Timely response?'] = pd.Series(df['Timely response?'].map(booleanize), dtype = np.float)

#function to apply to column to convert less common results to 'Other', as well as NaN
def convertToOther(value, keepList):
    if (value == ''):
        return "Other"
    else:
        return value if value in keepList else "Other"
    
#Lists top 23 value counts (allowed to exclude values), turns NaN to '' to others, converts to category dtype
def cleanReduceConvert(df, column, blackList=[]):
    keepList = []
    for category in df[column].value_counts().head(23).index.tolist():
        if (category.lower().split()[0] != "other"):
            keepList.append(category)
    for category in blackList:
        try:
            keepList.remove(category)
        except ValueError:
            pass

    df[column].fillna('', inplace=True)
    return pd.Series(df[column].apply(convertToOther, args=(keepList,)), dtype = 'category')

df['Sub-product'] = cleanReduceConvert(df, 'Sub-product', blackList= ['I do not know'])
df['Issue'] = cleanReduceConvert(df, 'Issue')
df['Sub-issue'] = cleanReduceConvert(df, 'Sub-issue')
df['Company'] = cleanReduceConvert(df, 'Company')

def entryOrNull(strVal):
    return 1.0 if strVal is not np.nan else 0.0

df['Consumer complaint narrative submitted?'] = df['Consumer complaint narrative'].apply(entryOrNull)

def dtToCols(df, dtcolumn):
    df["{} day".format(dtcolumn)] = df[dtcolumn].dt.day
    df["{} month".format(dtcolumn)] = df[dtcolumn].dt.month
    df["{} year".format(dtcolumn)] = df[dtcolumn].dt.year
    
dtToCols(df, "Date received")
dtToCols(df, "Date sent to company")

df["Consumer consent provided?"] = df["Consumer consent provided?"].cat.add_categories("Not recorded").fillna("Not recorded")

df = df.drop(df[df["Company response to consumer"].isna()].index)

dfInProgress = df[df["Company response to consumer"] == "In progress"]
df = df[df["Company response to consumer"] != "In progress"]

dfUntimelyResponse = df[df["Company response to consumer"] == "Untimely response"]
df = df[df["Company response to consumer"] != "Untimely response"]

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils

#data columns not be used for the model
dropList = ["Consumer complaint narrative",
            "Company public response",
            "State",
            "Tags",
            "Consumer disputed?",
            "Date received", 
            "Date sent to company",
            "Company response to consumer"]
X = df.drop(dropList, axis=1)
Y = df["Company response to consumer"]
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)

#Columns to be standard scaled/imputed
numeric_features = ['ZIP code',
                    'Date received day',
                    'Date received month',
                    'Date received year',
                    'Date sent to company day',
                    'Date sent to company month',
                    'Date sent to company year']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

#Columns to one hot encoded
categorical_features = ['Product',
           'Sub-product',
           'Issue',
           'Sub-issue',
           'Company',
           'Consumer consent provided?',
           'Submitted via',
           'Timely response?']
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

#building the column transformer with both transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

#fit the preprocessor, then transform trainging and test set, assign sparse matrix to variables
preprocessor.fit(X)
encX_train = preprocessor.transform(X_train)
encX_test = preprocessor.transform(X_test)

#Creating Dummy variable for target. Will be neccessary for DNN. 
encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)

dummy_y = np_utils.to_categorical(encoded_Y)

dummy_y_train = np_utils.to_categorical(encoder.transform(y_train))
dummy_y_test = np_utils.to_categorical(encoder.transform(y_test)) 

  interactivity=interactivity, compiler=compiler, result=result)
  mask |= (ar1 == a)
Using TensorFlow backend.


In [2]:
def get_column_names_from_ColumnTransformer(column_transformer):    
    col_name = []
    for transformer_in_columns in column_transformer.transformers_[:-1]:#the last transformer is ColumnTransformer's 'remainder'
        raw_col_name = transformer_in_columns[2]
        if isinstance(transformer_in_columns[1],Pipeline): 
            transformer = transformer_in_columns[1].steps[-1][1]
        else:
            transformer = transformer_in_columns[1]
        try:
            names = transformer.get_feature_names()
        except AttributeError: # if no 'get_feature_names' function, use raw column name
            names = raw_col_name
        if isinstance(names,np.ndarray): # eg.
            col_name += names.tolist()
        elif isinstance(names,list):
            col_name += names    
        elif isinstance(names,str):
            col_name.append(names)
    return col_name

processedColumns = get_column_names_from_ColumnTransformer(preprocessor)

In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

lr = LogisticRegression(n_jobs=-1, solver='saga', penalty='l1', multi_class='multinomial')
lr_para = {'C':[1.0,0.1,0.01], 
           'class_weight':[None,'balanced'],
           'max_iter':[50,100,125]}

#Apply grid search with above parameters specified
fitmodel = GridSearchCV(lr, lr_para,cv=5, scoring='accuracy', n_jobs=-1)
fitmodel.fit(encX_train,y_train)

#store the best fitting LogisiticRegression(), create prediciton from X_test data
bestfitLR = fitmodel.best_estimator_

bestfitLR.fit(encX_train,y_train)
y_pred = bestfitLR.predict(encX_test)
print(bestfitLR.score(encX_test,y_test))

#display the result
print(classification_report(y_test, y_pred))



0.79930675494147
                                 precision    recall  f1-score   support

                         Closed       0.00      0.00      0.00      5325
        Closed with explanation       0.81      0.99      0.89    327800
    Closed with monetary relief       0.51      0.02      0.03     23743
Closed with non-monetary relief       0.52      0.07      0.13     53102
             Closed with relief       0.48      0.11      0.18      1620
          Closed without relief       0.76      0.93      0.83      5290

                       accuracy                           0.80    416880
                      macro avg       0.51      0.35      0.34    416880
                   weighted avg       0.74      0.80      0.73    416880



In [48]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

bnb = BernoulliNB()
bnb_para = {'alpha':[1,2,10,0],
            'fit_prior': [True,False],
            'binarize': [0,0.1, 0.5]}

#Apply grid search with above parameters specified
fitmodel = GridSearchCV(bnb, bnb_para,cv=3, scoring='accuracy', n_jobs=-1, verbose=10)
fitmodel.fit(encX_train,y_train)

#store the best fitting GradientBoostingClassifier(), create prediciton from X_test data
bestfitBNB = fitmodel.best_estimator_

bestfitBNB.fit(encX_train,y_train)
y_pred = bestfitBNB.predict(encX_test)
print(bestfitBNB.score(encX_test,y_test))

#display the result
print(classification_report(y_test, y_pred))
print(bestfitBNB)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   13.0s
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   19.9s
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:   21.1s
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:   28.3s
[Parallel(n_jobs=-1)]: Done  49 out of  72 | elapsed:   34.6s remaining:   16.2s
[Parallel(n_jobs=-1)]: Done  57 out of  72 | elapsed:   36.2s remaining:    9.5s
[Parallel(n_jobs=-1)]: Done  65 out of  72 | elapsed:   40.8s remaining:    4.3s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:   41.7s finished


0.4593959892535022
                                 precision    recall  f1-score   support

                         Closed       0.06      0.36      0.10      5325
        Closed with explanation       0.87      0.45      0.59    327800
    Closed with monetary relief       0.21      0.62      0.31     23743
Closed with non-monetary relief       0.27      0.45      0.34     53102
             Closed with relief       0.06      0.48      0.11      1620
          Closed without relief       0.08      0.63      0.14      5290

                       accuracy                           0.46    416880
                      macro avg       0.26      0.50      0.26    416880
                   weighted avg       0.74      0.46      0.53    416880

BernoulliNB(alpha=10, binarize=0.1, class_prior=None, fit_prior=True)


In [2]:
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report

#Build model with 2 FC layers each 50 nodes (relu activation), and 1 node output output layer (sigmoid activation)
model = Sequential()
model.add(Dense(50, init='uniform', input_dim=128, activation='relu'))
model.add(Dense(50, init='uniform', activation='relu'))
model.add(Dense(6, init= 'uniform', activation = 'softmax'))

#setup earlystop callback
earlystop_callback = EarlyStopping(monitor='accuracy', min_delta=0.0001, patience=3)

#Compiled with adam optimizer, binary crossentropy loss function, accuracy metric for evaluation
model.compile(optimizer='adam',loss='categorical_crossentropy', metrics=['accuracy'])

#train the model over 100 epochs
model.fit(encX_train, dummy_y_train, epochs=10, batch_size=10, callbacks=[earlystop_callback])

#print the results
_, accuracy = model.evaluate(encX_test, dummy_y_test)
print('Accuracy: %.2f' % (accuracy*100))

#Save the trained model into an .h5 file
model.save("multiclass-model-50-50.h5")

#see the classification report
#print(classification_report(dummy_y_test, model.predict_classes(encX_test)))



  
  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Accuracy: 80.38


ValueError: Classification metrics can't handle a mix of multilabel-indicator and multiclass targets

In [3]:
#Save the trained model into an .h5 file
model.save("multiclass-model-50-50.h5")

In [72]:
import tensorflow as tf

pred = model.predict(encX_test)

pred_h1 = tf.one_hot(tf.argmax(pred, axis = 1), depth = 6)

print(classification_report(dummy_y_test, pred_h1))

              precision    recall  f1-score   support

           0       0.69      0.01      0.02      5243
           1       0.81      0.99      0.89    327389
           2       0.48      0.02      0.03     23767
           3       0.63      0.11      0.19     53472
           4       0.44      0.08      0.13      1574
           5       0.77      0.90      0.83      5435

   micro avg       0.80      0.80      0.80    416880
   macro avg       0.64      0.35      0.35    416880
weighted avg       0.76      0.80      0.74    416880
 samples avg       0.80      0.80      0.80    416880



In [57]:
pred_h1 = []

for vect in pred:
    pred_h1 = np.append(pred_h1,(vect == vect.max()).astype(int))
    
print(pred_h1)
    

[0. 1. 0. ... 0. 0. 0.]


In [60]:
import tensorflow as tf
pred_h1 = tf.one_hot(tf.argmax(pred, axis = 1), depth = 6)

print(pred_h1)

tf.Tensor(
[[0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 ...
 [0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]], shape=(416880, 6), dtype=float32)


In [68]:
for idx, vect in enumerate(pred):
    assert (tf.argmax(vect) == tf.argmax(pred_h1[idx]))

In [67]:
for i in pred:
    pass