In [47]:
from matplotlib import pyplot as plt
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn import svm
import seaborn as sns
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

In [48]:
df = pd.read_csv('consumer_complaints_small.csv')

In [49]:
df.head()

Unnamed: 0,product,consumer_complaint_narrative
0,Debt collection,XXXX has claimed I owe them {$27.00} for XXXX ...
1,Consumer Loan,Due to inconsistencies in the amount owed that...
2,Mortgage,In XX/XX/XXXX my wages that I earned at my job...
3,Mortgage,I have an open and current mortgage with Chase...
4,Mortgage,XXXX was submitted XX/XX/XXXX. At the time I s...


In [50]:
len(df)

66806

In [51]:
df.rename(columns = {'product':'Product','consumer_complaint_narrative':'Consumer complaint narrative'},inplace = True)

# Reduce Number of Observations

In [52]:
#df = df[:1000]

In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66806 entries, 0 to 66805
Data columns (total 2 columns):
Product                         66806 non-null object
Consumer complaint narrative    66806 non-null object
dtypes: object(2)
memory usage: 1.0+ MB


In [54]:
df.Product.value_counts()

Debt collection            17552
Mortgage                   14919
Credit reporting           12526
Credit card                 7929
Bank account or service     5711
Consumer Loan               3678
Student loan                2128
Prepaid card                 861
Payday loan                  726
Money transfers              666
Other financial service      110
Name: Product, dtype: int64

In [55]:
import warnings
warnings.filterwarnings('ignore')

In [56]:
def NB_topic_classification(review_data, feature):
    

    X_train, X_test, y_train, y_test = train_test_split(review_data, feature , \
                                                        test_size=0.20, random_state=0)

    text_clf = Pipeline([('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])

#     parameters = {'tfidf__min_df':[2, 3],
#                   'tfidf__max_df': [0.995 , 0.999],
#                   'tfidf__stop_words':[None,"english"],
#                   'clf__alpha': [0.5, 1.0]}
    
#     parameters = {'tfidf__min_df':[2, 3],
#                   'tfidf__max_df': [0.999],
#                   'tfidf__stop_words':["english"],
#                   'clf__alpha': [1.0]}

    parameters = {'tfidf__min_df':[1, 2, 3],
                  'tfidf__max_df': [0.995 , 0.999, 1.0],
                  'tfidf__stop_words':[None,"english"],
                  'clf__alpha': [0.5, 1.0, 2.0 , 5.0]}


    metric =  "f1_macro"

    gs_clf = GridSearchCV(text_clf, param_grid=parameters, scoring=metric, cv=5)


    gs_clf = gs_clf.fit(X_train, y_train)

    for param_name in gs_clf.best_params_:
        print("{} : {}".format(param_name , gs_clf.best_params_[param_name]))
    print("best f1 score:", gs_clf.best_score_)

    clf_alpha = gs_clf.best_params_["clf__alpha"]
    tfidf_min_df = gs_clf.best_params_["tfidf__min_df"]
    tfidf_max_df = gs_clf.best_params_["tfidf__max_df"]
    tfidf_stop_words = gs_clf.best_params_["tfidf__stop_words"]

    classifier = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words=tfidf_stop_words,\
                                  min_df=tfidf_min_df)),
        ('clf',  MultinomialNB(alpha = clf_alpha ))])

    clf = classifier.fit(X_train,  y_train)

    labels=sorted(np.unique(feature))
    labels = list(map(str, labels))

    predicted = classifier.predict(X_test)
    
    
    ##########################
    
    print(classification_report(y_test, predicted))
##########################################
    
    
          
    
    ##########################
    
    return predicted


#categ_data = pd.read_csv("consumer_complaints_small.csv" , header = 0)   
categ_data = df
categ_data.head()

#classes = categ_data.label.values
classes = categ_data.Product.values

classes = [i.split(",") for i in classes]
print(classes, )


mlb = MultiLabelBinarizer()
Y=mlb.fit_transform(classes)

classes_label = mlb.classes_

[['Debt collection'], ['Consumer Loan'], ['Mortgage'], ['Mortgage'], ['Mortgage'], ['Mortgage'], ['Mortgage'], ['Mortgage'], ['Credit card'], ['Consumer Loan'], ['Mortgage'], ['Credit card'], ['Consumer Loan'], ['Debt collection'], ['Debt collection'], ['Debt collection'], ['Debt collection'], ['Mortgage'], ['Credit reporting'], ['Student loan'], ['Credit reporting'], ['Credit reporting'], ['Credit reporting'], ['Debt collection'], ['Credit reporting'], ['Credit reporting'], ['Credit reporting'], ['Credit reporting'], ['Debt collection'], ['Credit reporting'], ['Credit reporting'], ['Debt collection'], ['Credit reporting'], ['Debt collection'], ['Mortgage'], ['Consumer Loan'], ['Credit card'], ['Debt collection'], ['Debt collection'], ['Bank account or service'], ['Debt collection'], ['Debt collection'], ['Credit reporting'], ['Debt collection'], ['Debt collection'], ['Mortgage'], ['Credit card'], ['Student loan'], ['Mortgage'], ['Credit card'], ['Consumer Loan'], ['Debt collection'], 

In [57]:
categ_data.rename(columns={'Consumer complaint narrative':'Consumer_complaint_narrative'},inplace =True)

In [58]:
categ_data.head()

Unnamed: 0,Product,Consumer_complaint_narrative
0,Debt collection,XXXX has claimed I owe them {$27.00} for XXXX ...
1,Consumer Loan,Due to inconsistencies in the amount owed that...
2,Mortgage,In XX/XX/XXXX my wages that I earned at my job...
3,Mortgage,I have an open and current mortgage with Chase...
4,Mortgage,XXXX was submitted XX/XX/XXXX. At the time I s...


In [59]:
classes[:5]

[['Debt collection'],
 ['Consumer Loan'],
 ['Mortgage'],
 ['Mortgage'],
 ['Mortgage']]

In [60]:
classes_label

array(['Bank account or service', 'Consumer Loan', 'Credit card',
       'Credit reporting', 'Debt collection', 'Money transfers',
       'Mortgage', 'Other financial service', 'Payday loan',
       'Prepaid card', 'Student loan'], dtype=object)

In [61]:
categ_data['Product'].unique()

array(['Debt collection', 'Consumer Loan', 'Mortgage', 'Credit card',
       'Credit reporting', 'Student loan', 'Bank account or service',
       'Payday loan', 'Money transfers', 'Other financial service',
       'Prepaid card'], dtype=object)

In [62]:
len(Y)

66806

In [63]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# # ambience = Y[:,0]
# # food = Y[:,2]
# # miscellaneous = Y[:,1]
# # service = Y[:,4]
# # price = Y[:,3]


# Bank_account_or_service = Y[:,0]


Debt_collection = Y[:,0]
Mortgage = Y[:,1]
Credit_reporting = Y[:,2]
Credit_card = Y[:,3]
Bank_account_or_service = Y[:,4]
Consumer_Loan = Y[:,5]
Student_loan = Y[:,6]
Prepaid_card = Y[:,7]
Payday_loan = Y[:,8]
Money_transfers = Y[:,9]
Other_financial_service = Y[:,10]


# Debt collection =             17552
# Mortgage                   14919
# Credit reporting           12526
# Credit card                 7929
# Bank account or service     5711
# Consumer Loan               3678
# Student loan                2128
# Prepaid card                 861
# Payday loan                  726
# Money transfers              666
# Other financial service      110


predicted_Debt_collection =  NB_topic_classification(categ_data.Consumer_complaint_narrative, Debt_collection)
predicted_Mortgage= NB_topic_classification(categ_data.Consumer_complaint_narrative, Mortgage)
predicted_Credit_reporting =  NB_topic_classification(categ_data.Consumer_complaint_narrative, Credit_reporting)
predicted_Credit_card =  NB_topic_classification(categ_data.Consumer_complaint_narrative, Credit_card)
predicted_Bank_account_or_service =  NB_topic_classification(categ_data.Consumer_complaint_narrative, Bank_account_or_service)


predicted_Consumer_Loan =  NB_topic_classification(categ_data.Consumer_complaint_narrative, Consumer_Loan)
predicted_Student_loan= NB_topic_classification(categ_data.Consumer_complaint_narrative, Student_loan)
predicted_Prepaid_card =  NB_topic_classification(categ_data.Consumer_complaint_narrative, Prepaid_card)
predicted_Payday_loan =  NB_topic_classification(categ_data.Consumer_complaint_narrative, Payday_loan)
predicted_Money_transfers =  NB_topic_classification(categ_data.Consumer_complaint_narrative, Money_transfers)
predicted_Other_financial_service =  NB_topic_classification(categ_data.Consumer_complaint_narrative, Other_financial_service)





X_train, X_test, y_train, y_test = train_test_split(categ_data.Consumer_complaint_narrative, Y , \
                                                    test_size=0.20, random_state=0)


clf__alpha : 0.5
tfidf__max_df : 0.995
tfidf__min_df : 3
tfidf__stop_words : english
best f1 score: 0.7485454819744489
              precision    recall  f1-score   support

           0       0.94      1.00      0.97     12155
           1       0.95      0.39      0.55      1207

    accuracy                           0.94     13362
   macro avg       0.95      0.69      0.76     13362
weighted avg       0.94      0.94      0.93     13362

clf__alpha : 0.5
tfidf__max_df : 0.995
tfidf__min_df : 3
tfidf__stop_words : english
best f1 score: 0.6159641699034331
              precision    recall  f1-score   support

           0       0.95      1.00      0.97     12600
           1       0.91      0.14      0.25       762

    accuracy                           0.95     13362
   macro avg       0.93      0.57      0.61     13362
weighted avg       0.95      0.95      0.93     13362

clf__alpha : 0.5
tfidf__max_df : 0.995
tfidf__min_df : 3
tfidf__stop_words : english
best f1 score: 0.729157

In [None]:
zip_all = list(zip(predicted_Debt_collection, predicted_Mortgage, predicted_Credit_reporting, predicted_Credit_card, predicted_Bank_account_or_service,predicted_Consumer_Loan,predicted_Student_loan,predicted_Prepaid_card,predicted_Payday_loan,predicted_Money_transfers,predicted_Other_financial_service))

print(classification_report\
      (y_test, np.array(list(zip_all)), target_names=mlb.classes_)) 

# if the y_pred falls into 2 classes, we need to convert it to be just 1 class ,following the One-vs-Rest procedure for single-label multiclass classification
# Then recalculate and print the accuracy metrics summary score again

# Adjust the result to be One-vs-rest strategy converting multi-label result into single label based on the priority

In [None]:
target_names=mlb.classes_

In [None]:
for j in range(len(y_test[0])):
    tot = 0
    for i in range(len(y_test)):
        tot += y_test[i][j]
    print("Class ", str(j)," ",target_names[j], "total obs =", tot)

In [None]:
raw_predicted = np.array(list(zip_all))

In [None]:
agg = df.groupby('Product',as_index = False).count()
agg_sorted = agg.rename(columns={'Consumer_complaint_narrative':'Total_Numbers'}).sort_values(by = ['Total_Numbers'])
agg_sorted

In [None]:
def prioritize(multilabel_list):
    if multilabel_list[7] == 1:
        return np.array([0,0,0,0,0,0,0,1,0,0,0])
    elif multilabel_list[5] == 1:
        return np.array([0,0,0,0,0,1,0,0,0,0,0])
    elif multilabel_list[8] == 1:
        return np.array([0,0,0,0,0,0,0,0,1,0,0])
    elif multilabel_list[9] == 1:
        return np.array([0,0,0,0,0,0,0,0,0,1,0])
    elif multilabel_list[10] == 1:
        return np.array([0,0,0,0,0,0,0,0,0,0,1])
    elif multilabel_list[1] == 1:
        return np.array([0,1,0,0,0,1,0,0,0,0,0])
    elif multilabel_list[0] == 1:
        return np.array([1,0,0,0,0,0,0,0,0,0,0])
    elif multilabel_list[2] == 1:
        return np.array([0,0,1,0,0,0,0,0,0,0,0])
    elif multilabel_list[3] == 1:
        return np.array([0,0,0,1,0,0,0,0,0,0,0])
    elif multilabel_list[6] == 1:
        return np.array([0,0,0,0,0,0,1,0,0,0,0])
    elif multilabel_list[4] == 1:
        return np.array([0,0,0,0,1,0,0,0,0,0,0])
    else :
        return np.array([0,0,0,0,0,0,0,0,0,0,0])

In [None]:
OVR_predicted = np.array([prioritize(i) for i in raw_predicted])

In [None]:
OVR_predicted

In [84]:
zip_all = list(zip(predicted_Debt_collection, predicted_Mortgage, predicted_Credit_reporting, predicted_Credit_card, predicted_Bank_account_or_service,predicted_Consumer_Loan,predicted_Student_loan,predicted_Prepaid_card,predicted_Payday_loan,predicted_Money_transfers,predicted_Other_financial_service))

print(classification_report\
      (y_test, np.array(list(zip_all)), target_names=mlb.classes_, digits = 4)) 

                         precision    recall  f1-score   support

Bank account or service     0.9514    0.3894    0.5526      1207
          Consumer Loan     0.9083    0.1430    0.2472       762
            Credit card     0.9006    0.3782    0.5327      1605
       Credit reporting     0.8559    0.6736    0.7539      2390
        Debt collection     0.8918    0.7027    0.7860      3495
        Money transfers     0.0000    0.0000    0.0000       132
               Mortgage     0.9503    0.8432    0.8935      3036
Other financial service     0.0000    0.0000    0.0000        16
            Payday loan     0.0000    0.0000    0.0000       125
           Prepaid card     1.0000    0.0187    0.0368       160
           Student loan     0.9752    0.2719    0.4252       434

              micro avg     0.9076    0.5937    0.7178     13362
              macro avg     0.6758    0.3110    0.3844     13362
           weighted avg     0.8918    0.5937    0.6857     13362
            samples avg

In [83]:
zip_all = list(zip(predicted_Debt_collection, predicted_Mortgage, predicted_Credit_reporting, predicted_Credit_card, predicted_Bank_account_or_service,predicted_Consumer_Loan,predicted_Student_loan,predicted_Prepaid_card,predicted_Payday_loan,predicted_Money_transfers,predicted_Other_financial_service))

print(classification_report\
      (y_test, OVR_predicted, target_names=mlb.classes_, digits = 4)) 

                         precision    recall  f1-score   support

Bank account or service     0.9514    0.3894    0.5526      1207
          Consumer Loan     0.9083    0.1430    0.2472       762
            Credit card     0.9111    0.3769    0.5333      1605
       Credit reporting     0.8559    0.6736    0.7539      2390
        Debt collection     0.8977    0.6801    0.7739      3495
        Money transfers     0.0000    0.0000    0.0000       132
               Mortgage     0.9502    0.8426    0.8932      3036
Other financial service     0.0000    0.0000    0.0000        16
            Payday loan     0.0000    0.0000    0.0000       125
           Prepaid card     1.0000    0.0187    0.0368       160
           Student loan     0.9752    0.2719    0.4252       434

              micro avg     0.8979    0.5875    0.7102     13362
              macro avg     0.6773    0.3088    0.3833     13362
           weighted avg     0.8946    0.5875    0.6825     13362
            samples avg

# ======================================================

In [None]:
np.unique(np.array(list(zip_all)), axis=0)

In [None]:
len(np.unique(np.array(list(zip_all)), axis=0))

In [None]:
np.unique(Y, axis=0)

In [None]:
len(np.unique(Y, axis=0))

In [None]:
np.unique(OVR_predicted, axis=0)

In [82]:
len(np.unique(OVR_predicted, axis=0))

9