In [1]:
from matplotlib import pyplot as plt
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn import svm
import seaborn as sns
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
df = pd.read_csv('consumer_complaints_small.csv')

In [3]:
df.head()

Unnamed: 0,product,consumer_complaint_narrative
0,Debt collection,XXXX has claimed I owe them {$27.00} for XXXX ...
1,Consumer Loan,Due to inconsistencies in the amount owed that...
2,Mortgage,In XX/XX/XXXX my wages that I earned at my job...
3,Mortgage,I have an open and current mortgage with Chase...
4,Mortgage,XXXX was submitted XX/XX/XXXX. At the time I s...


In [4]:
len(df)

66806

In [5]:
df.rename(columns = {'product':'Product','consumer_complaint_narrative':'Consumer complaint narrative'},inplace = True)

# Reduce Number of Observations

In [6]:
#df = df[:1000]

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66806 entries, 0 to 66805
Data columns (total 2 columns):
Product                         66806 non-null object
Consumer complaint narrative    66806 non-null object
dtypes: object(2)
memory usage: 1.0+ MB


In [8]:
df.Product.value_counts()

Debt collection            17552
Mortgage                   14919
Credit reporting           12526
Credit card                 7929
Bank account or service     5711
Consumer Loan               3678
Student loan                2128
Prepaid card                 861
Payday loan                  726
Money transfers              666
Other financial service      110
Name: Product, dtype: int64

In [9]:
def NB_topic_classification(review_data, feature):
    

    X_train, X_test, y_train, y_test = train_test_split(review_data, feature , \
                                                        test_size=0.20, random_state=0)

    text_clf = Pipeline([('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])

    parameters = {'tfidf__min_df':[1, 2, 3],
                  'tfidf__max_df': [0.995 , 0.999, 1.0],
                  'tfidf__stop_words':[None,"english"],
                  'clf__alpha': [0.5, 1.0, 2.0 , 5.0]}

    metric =  "f1_macro"

    gs_clf = GridSearchCV(text_clf, param_grid=parameters, scoring=metric, cv=2)


    gs_clf = gs_clf.fit(X_train, y_train)

    for param_name in gs_clf.best_params_:
        print("{} : {}".format(param_name , gs_clf.best_params_[param_name]))
    print("best f1 score:", gs_clf.best_score_)

    clf_alpha = gs_clf.best_params_["clf__alpha"]
    tfidf_min_df = gs_clf.best_params_["tfidf__min_df"]
    tfidf_max_df = gs_clf.best_params_["tfidf__max_df"]
    tfidf_stop_words = gs_clf.best_params_["tfidf__stop_words"]

    classifier = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words=tfidf_stop_words,\
                                  min_df=tfidf_min_df, max_df = tfidf_max_df)),
        ('clf', MultinomialNB(alpha = clf_alpha ))])

    clf = classifier.fit(X_train,  y_train)

    labels=sorted(np.unique(feature))
    labels = list(map(str, labels))

    predicted = classifier.predict(X_test)
    

    
    print(classification_report(y_test, predicted))
##########################################
       
    
    ##########################
    
    return predicted


#categ_data = pd.read_csv("consumer_complaints_small.csv" , header = 0)   
categ_data = df
categ_data.head()

#classes = categ_data.label.values
classes = categ_data.Product.values

classes = [i.split(",") for i in classes]
print(classes, )


mlb = MultiLabelBinarizer()
Y=mlb.fit_transform(classes)

classes_label = mlb.classes_

[['Debt collection'], ['Consumer Loan'], ['Mortgage'], ['Mortgage'], ['Mortgage'], ['Mortgage'], ['Mortgage'], ['Mortgage'], ['Credit card'], ['Consumer Loan'], ['Mortgage'], ['Credit card'], ['Consumer Loan'], ['Debt collection'], ['Debt collection'], ['Debt collection'], ['Debt collection'], ['Mortgage'], ['Credit reporting'], ['Student loan'], ['Credit reporting'], ['Credit reporting'], ['Credit reporting'], ['Debt collection'], ['Credit reporting'], ['Credit reporting'], ['Credit reporting'], ['Credit reporting'], ['Debt collection'], ['Credit reporting'], ['Credit reporting'], ['Debt collection'], ['Credit reporting'], ['Debt collection'], ['Mortgage'], ['Consumer Loan'], ['Credit card'], ['Debt collection'], ['Debt collection'], ['Bank account or service'], ['Debt collection'], ['Debt collection'], ['Credit reporting'], ['Debt collection'], ['Debt collection'], ['Mortgage'], ['Credit card'], ['Student loan'], ['Mortgage'], ['Credit card'], ['Consumer Loan'], ['Debt collection'], 

In [10]:
categ_data.rename(columns={'Consumer complaint narrative':'Consumer_complaint_narrative'},inplace =True)

In [11]:
categ_data.head()

Unnamed: 0,Product,Consumer_complaint_narrative
0,Debt collection,XXXX has claimed I owe them {$27.00} for XXXX ...
1,Consumer Loan,Due to inconsistencies in the amount owed that...
2,Mortgage,In XX/XX/XXXX my wages that I earned at my job...
3,Mortgage,I have an open and current mortgage with Chase...
4,Mortgage,XXXX was submitted XX/XX/XXXX. At the time I s...


In [12]:
classes[:5]

[['Debt collection'],
 ['Consumer Loan'],
 ['Mortgage'],
 ['Mortgage'],
 ['Mortgage']]

In [13]:
classes_label

array(['Bank account or service', 'Consumer Loan', 'Credit card',
       'Credit reporting', 'Debt collection', 'Money transfers',
       'Mortgage', 'Other financial service', 'Payday loan',
       'Prepaid card', 'Student loan'], dtype=object)

In [14]:
categ_data['Product'].unique()

array(['Debt collection', 'Consumer Loan', 'Mortgage', 'Credit card',
       'Credit reporting', 'Student loan', 'Bank account or service',
       'Payday loan', 'Money transfers', 'Other financial service',
       'Prepaid card'], dtype=object)

In [15]:
len(Y)

66806

In [16]:
# # ambience = Y[:,0]
# # food = Y[:,2]
# # miscellaneous = Y[:,1]
# # service = Y[:,4]
# # price = Y[:,3]

# Bank_account_or_service = Y[:,0]


Debt_collection = Y[:,0]
Mortgage = Y[:,1]
Credit_reporting = Y[:,2]
Credit_card = Y[:,3]
Bank_account_or_service = Y[:,4]
Consumer_Loan = Y[:,5]
Student_loan = Y[:,6]
Prepaid_card = Y[:,7]
Payday_loan = Y[:,8]
Money_transfers = Y[:,9]
Other_financial_service = Y[:,10]


# Debt collection =             17552
# Mortgage                   14919
# Credit reporting           12526
# Credit card                 7929
# Bank account or service     5711
# Consumer Loan               3678
# Student loan                2128
# Prepaid card                 861
# Payday loan                  726
# Money transfers              666
# Other financial service      110


predicted_Debt_collection =  KNN_topic_classification(categ_data.Consumer_complaint_narrative, Debt_collection)
predicted_Mortgage= KNN_topic_classification(categ_data.Consumer_complaint_narrative, Mortgage)
predicted_Credit_reporting =  KNN_topic_classification(categ_data.Consumer_complaint_narrative, Credit_reporting)
predicted_Credit_card =  KNN_topic_classification(categ_data.Consumer_complaint_narrative, Credit_card)
predicted_Bank_account_or_service =  KNN_topic_classification(categ_data.Consumer_complaint_narrative, Bank_account_or_service)


predicted_Consumer_Loan =  KNN_topic_classification(categ_data.Consumer_complaint_narrative, Consumer_Loan)
predicted_Student_loan= KNN_topic_classification(categ_data.Consumer_complaint_narrative, Student_loan)
predicted_Prepaid_card =  KNN_topic_classification(categ_data.Consumer_complaint_narrative, Prepaid_card)
predicted_Payday_loan =  KNN_topic_classification(categ_data.Consumer_complaint_narrative, Payday_loan)
predicted_Money_transfers =  KNN_topic_classification(categ_data.Consumer_complaint_narrative, Money_transfers)
predicted_Other_financial_service =  KNN_topic_classification(categ_data.Consumer_complaint_narrative, Other_financial_service)





X_train, X_test, y_train, y_test = train_test_split(categ_data.Consumer_complaint_narrative, Y , \
                                                    test_size=0.25, random_state=0)

zip_all = list(zip(predicted_Debt_collection, predicted_Mortgage, predicted_Credit_reporting, predicted_Credit_card, predicted_Bank_account_or_service,predicted_Consumer_Loan,predicted_Student_loan,predicted_Prepaid_card,predicted_Payday_loan,predicted_Money_transfers,predicted_Other_financial_service))

print(classification_report\
      (y_test, np.array(list(zip_all)), target_names=mlb.classes_)) 

NameError: name 'KNN_topic_classification' is not defined

In [None]:
print(classification_report\
      (y_test, np.array(list(zip_all)), target_names=mlb.classes_)) 

In [None]:
zip_all

In [None]:
best_param

In [None]:
len(categ_data.Consumer_complaint_narrative)

In [None]:
gs_clf

In [None]:
clf_k = gs_clf.best_params_["clf__n_neighbors"]
    tfidf_min_df = gs_clf.best_params_["tfidf__min_df"]
    tfidf_stop_words = gs_clf.best_params_["tfidf__stop_words"]

# After finding best parameter, estimate the accuracy

In [None]:
def KNN_topic_classification(review_data, feature):
    

    X_train, X_test, y_train, y_test = train_test_split(review_data, feature , \
                                                        test_size=0.25, random_state=0)

    text_clf = Pipeline([('tfidf', TfidfVectorizer()), ('clf', KNeighborsClassifier())])
    
    #k_range = list(range(1, 10))
    #k_range = list(range(3))

#     parameters = {'tfidf__min_df':[1, 2, 3],
#                   'tfidf__stop_words':[None,"english"],
#                   'clf__n_neighbors': k_range}

#     parameters = {'tfidf__min_df':[1,3,5],
#                   'tfidf__stop_words':[None, "english"],
#                   'clf__n_neighbors': k_range}

    parameters = {'tfidf__min_df':[3],
                   'tfidf__stop_words':[None],
                   'clf__n_neighbors': [3,5]}

    metric =  "f1_macro"

    gs_clf = GridSearchCV(text_clf, param_grid=parameters, scoring=metric, cv=2)


    gs_clf = gs_clf.fit(X_train, y_train)

    for param_name in gs_clf.best_params_:
        print("{} : {}".format(param_name , gs_clf.best_params_[param_name]))
    print("best f1 score:", gs_clf.best_score_)

    clf_k = gs_clf.best_params_["clf__n_neighbors"]
    tfidf_min_df = gs_clf.best_params_["tfidf__min_df"]
    tfidf_stop_words = gs_clf.best_params_["tfidf__stop_words"]

    classifier = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words=tfidf_stop_words,\
                                  min_df=tfidf_min_df)),
        ('clf', KNeighborsClassifier(n_neighbors= clf_k))])

    clf = classifier.fit(X_train,  y_train)

    labels=sorted(np.unique(feature))
    labels = list(map(str, labels))

    predicted = classifier.predict(X_test)
    
    return predicted


#categ_data = pd.read_csv("consumer_complaints_small.csv" , header = 0)   
categ_data = df
categ_data.head()

#classes = categ_data.label.values
classes = categ_data.Product.values

classes = [i.split(",") for i in classes]
print(classes, )


mlb = MultiLabelBinarizer()
Y=mlb.fit_transform(classes)

classes_label = mlb.classes_

In [None]:
categ_data.rename(columns={'Consumer complaint narrative':'Consumer_complaint_narrative'},inplace =True)

In [None]:
categ_data.head()

In [None]:
classes[:5]

In [None]:
classes_label

In [None]:
categ_data['Product'].unique()

In [None]:
len(Y)

In [None]:
# # ambience = Y[:,0]
# # food = Y[:,2]
# # miscellaneous = Y[:,1]
# # service = Y[:,4]
# # price = Y[:,3]


# Bank_account_or_service = Y[:,0]


Debt_collection = Y[:,0]
Mortgage = Y[:,1]
Credit_reporting = Y[:,2]
Credit_card = Y[:,3]
Bank_account_or_service = Y[:,4]
Consumer_Loan = Y[:,5]
Student_loan = Y[:,6]
Prepaid_card = Y[:,7]
Payday_loan = Y[:,8]
Money_transfers = Y[:,9]
Other_financial_service = Y[:,10]


# Debt collection =             17552
# Mortgage                   14919
# Credit reporting           12526
# Credit card                 7929
# Bank account or service     5711
# Consumer Loan               3678
# Student loan                2128
# Prepaid card                 861
# Payday loan                  726
# Money transfers              666
# Other financial service      110


predicted_Debt_collection =  KNN_topic_classification(categ_data.Consumer_complaint_narrative, Debt_collection)
predicted_Mortgage= KNN_topic_classification(categ_data.Consumer_complaint_narrative, Mortgage)
predicted_Credit_reporting =  KNN_topic_classification(categ_data.Consumer_complaint_narrative, Credit_reporting)
predicted_Credit_card =  KNN_topic_classification(categ_data.Consumer_complaint_narrative, Credit_card)
predicted_Bank_account_or_service =  KNN_topic_classification(categ_data.Consumer_complaint_narrative, Bank_account_or_service)


predicted_Consumer_Loan =  KNN_topic_classification(categ_data.Consumer_complaint_narrative, Consumer_Loan)
predicted_Student_loan= KNN_topic_classification(categ_data.Consumer_complaint_narrative, Student_loan)
predicted_Prepaid_card =  KNN_topic_classification(categ_data.Consumer_complaint_narrative, Prepaid_card)
predicted_Payday_loan =  KNN_topic_classification(categ_data.Consumer_complaint_narrative, Payday_loan)
predicted_Money_transfers =  KNN_topic_classification(categ_data.Consumer_complaint_narrative, Money_transfers)
predicted_Other_financial_service =  KNN_topic_classification(categ_data.Consumer_complaint_narrative, Other_financial_service)





X_train, X_test, y_train, y_test = train_test_split(categ_data.Consumer_complaint_narrative, Y , \
                                                    test_size=0.25, random_state=0)

zip_all = list(zip(predicted_Debt_collection, predicted_Mortgage, predicted_Credit_reporting, predicted_Credit_card, predicted_Bank_account_or_service,predicted_Consumer_Loan,predicted_Student_loan,predicted_Prepaid_card,predicted_Payday_loan,predicted_Money_transfers,predicted_Other_financial_service))

print(classification_report\
      (y_test, np.array(list(zip_all)), target_names=mlb.classes_)) 

In [None]:
print(classification_report\
      (y_test, np.array(list(zip_all)), target_names=mlb.classes_)) 

In [None]:
zip_all