In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# import nltk
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')
# from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.linear_model import PassiveAggressiveClassifier
from xgboost import XGBClassifier
import eli5
import pickle


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

># **Different Stages of Data Analysis**
>
>***
>
> ## **1. Data Exploration**
This is the first step in creating any model. Here we perform an initial analysis to check how many features are available, and decide which of them are important for building the model and which can be ignored.
>
>## **2. Data Cleaning** 
In this step, we either remove or correct the records that contain NULL values or inconsistent values.
>
>## **3. Data Preprocessing**
In this step, we process the records in such a way that they can be used as input by the model creation algorithm.
>
> Eg. Converting non-numeric data into numeric data.
>
>## **4. Model Preparation** 
>   In this step, we select a model and train our data on it. 
>
>## **5. Model Evaluation** 
In this step, we make predictions using the trained model and use a confusion matrix to evaluate its performance. We choose the model with the best evaluation stats.



In [None]:
# Read the bank complaints CSV from the Kaggle input directory into a DataFrame.
DATASET_CSV = '../input/bank-dataset/bank_dataset.csv'
data = pd.read_csv(DATASET_CSV)

 ### **Exploring the data**
 
 ***

In [None]:
# Plain-text preview of the first five rows of the raw frame.
first_rows = data.head()
print(first_rows)

In [None]:
# Overview of the raw frame: rank, shape, a numbered list of the column names,
# and per-column non-null / NaN counts so missing values are visible early.
column_names = data.columns

print("\nNumber of dimensions in dataset : ", data.ndim)
print("\nDimensions of dataset : ", data.shape)
print("\nNumber of Features in the dataset: ", len(column_names))
print("\nList of features in the dataset: ")

# One tab-indented, 1-based entry per column.
for position, column_name in enumerate(column_names, start=1):
    print("\t" + str(position) + ". " + column_name)

non_null_counts = data.count()      # fewer than the row count => missing values
nan_counts = data.isna().sum()      # explicit number of NaNs per column
print("\nNumber of elements in each feature of dataset:\n", non_null_counts)
print("\n\nCount of NaN values in each column:\n ", nan_counts)

### **Cleaning the data**

___

In [None]:
# Normalise the column labels in a single pass: trim surrounding whitespace,
# turn hyphens and spaces into underscores, then Title_Case the result.
data.columns = [
    label.strip().replace('-', "_").replace(' ', "_").title()
    for label in data.columns
]

# Shorter aliases for the two long free-text columns used later on.
short_names = {
    'Consumer_Complaint_Narrative': 'Complaint',
    'Company_Public_Response': 'Response',
}
data.rename(columns = short_names, inplace = True)
print(data.columns)


In [None]:
# Per-column NaN totals, now reported under the renamed column labels.
nan_counts_after_rename = data.isna().sum()
print("\n\nCount of NaN values in each column:\n ", nan_counts_after_rename)

In [None]:
# Keep only the records that actually contain a complaint narrative.
# .copy() turns the filtered result into an independent frame, so the column
# assignments performed on `data` in later cells neither raise
# SettingWithCopyWarning nor risk writing to a temporary view.
data = data[data['Complaint'].notnull()].copy()
print(data.shape)

### **Preprocessing the data**

***

In [None]:
# Force every complaint to a Python string, then lowercase it, so that later
# stopword matching is case-insensitive.
data['Complaint'] = data['Complaint'].astype(str).str.lower()

In [None]:
# Distinct values of the target column and how many there are.
unique_ele = data['Product'].unique()
product_count = len(unique_ele)

print("Number of unique elements in Product : ", product_count)
print("\n\nUnique elements in Product : ", unique_ele)

In [None]:
# Complaint volume per product: first in group-key order, then largest first.
temp_series = data.groupby('Product').size()
print(temp_series)

temp_series = temp_series.sort_values(ascending = False)
print("\nAfter sorting in Descending Order :\n\n", temp_series)

In [None]:
# Selecting the Product for Analysis
#
# The optional "keep only 5 comparable products" filter is currently disabled:
# product_list = list(temp_series[2:7].index)  # Considering only 5 products with comparable numbers.
# print("\n The Product list is : ", product_list)
# data = data.loc[data['Product'].isin(product_list)]

length = len(data['Product'])

# Re-number the remaining rows 0..length-1 with a plain integer index.
# The array must be assigned directly: wrapping it in a list makes pandas
# interpret it as a list of level arrays and build a one-level MultiIndex,
# which breaks scalar lookups such as data['Filtered_Text'][4].
data.index = np.arange(length)
print("\n The dimensions of dataset after filering out the Product list : ", data.shape)



#### There are 3 approaches to convert Categorical values to Numerical figures
>   **1. Label-Encoding** (We are using this here.) : It replaces each categorical value with a unique number. It should only be applied to the target variable; when used on training variables, it may cause the algorithm to infer a spurious ordering or relation between the different categorical values.
>
>   **2. One-Hot-Encoding :** It is applied to training variables and creates a new binary feature for each categorical value; similar functionality is provided by *pandas.get_dummies*. Its advantage is that the algorithm doesn't form any relationship between categorical values.
>
>   **3. Find & Replace** (Find Categorical Values & replace them with a Numerical value.)

In [None]:
# Learn the Product -> integer code mapping, then store the codes next to the
# original labels in a new 'Product_Encoding' column.
encoder = LabelEncoder().fit(data['Product'])
data['Product_Encoding'] = encoder.transform(data['Product'])
print(data.head())

# encoder.inverse_transform(data['Product_Encoding']) maps the codes back to product names.
# encoder.classes_ holds the list of products.

>## Some Important Terms of Natural Language Processing (NLP)
> - **Lemma :** The base form of a word is referred to as lemma. Eg: "playing", "played", "play" have the common lemma "play".
>
> - **Token :**  A token is a string of contiguous characters between two spaces, or between a space and punctuation marks.
>
> - **Document :** Document refers to a body of text. A collection of documents make up a corpus. Eg. Here in our dataset, each record in "Complaint" field is a document
>
> - **Corpus :** A corpus refers to a collection of text. Eg. Here in our dataset, the entire "Complaint" field is the corpus.
>
> - **Stopwords :**  Stopwords are those words which contribute little to overall meaning of the document. They are generally the most common words in a language. For instance, "the", "and", and "a" are stopwords of English language.
>
> - **Vocabulary :** The entire set of terms used in a body of text.
>
> ### Normalization Techniques
>>1.  **Lemmatization :** Lemmatization is the process of reducing a word to its base form (lemma) using a vocabulary and morphological analysis of the word.
>>2.  **Stemming :** Stemming is the process of reducing a word to its word stem by removing affixes (suffixes and prefixes); unlike a lemma, the stem need not be a valid dictionary word.

In [None]:
# Build the cleaned text column: split each complaint on whitespace, drop
# English stopwords, lemmatize what is left and re-join with single spaces.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))  # set => fast membership tests


def _lemmatize_without_stopwords(text):
    """Return `text` with stopwords removed and every remaining token lemmatized."""
    kept_lemmas = (
        lemmatizer.lemmatize(token)
        for token in text.split()
        if token not in stop_words
    )
    return " ".join(kept_lemmas)


data['Filtered_Text'] = data['Complaint'].apply(_lemmatize_without_stopwords)
print(data.head())

> - ### Train-Test Splitting :
>      It is used for dividing the dataset into training and testing sets.

In [None]:
# Dividing the Dataset into Testing and Training set.
#
# target : the integer product code produced by the LabelEncoder.
# X      : every remaining column except the CSV row counter, the raw product
#          label and its encoding (the latter two would leak the target).
target = data['Product_Encoding']
X = data.drop(columns = ['Unnamed:_0','Product','Product_Encoding'])
print("\n List of Input : ",X.columns)

# 70/30 split; the fixed random_state keeps the partition reproducible.
X_train, X_test, target_train, target_test = train_test_split(X, target, test_size = 0.30, random_state = 25)
print("\nOriginal Number of records in processed dataset:", len(X))
print("\nNumber of records in training features for training and testing purpose resp. are :",len(X_train), len(X_test), sep = ' ')
# Report the size of the *target* test split here (previously len(X_test) was printed again).
print("\nNumber of records in target feature for training and testing purpose resp. are :",len(target_train), len(target_test), sep = ' ')

> - ### TF-IDF Vectorizer
>It stands for Term Frequency * Inverse Document Frequency Vectorizer. It is used to create a vector of words ( based on ***Bag of Words*** Model)  where low value of TF-IDF for a word indicates it occurs very commonly whereas a high value indicates a rare word. 
>
>>**Term Frequency** is defined as the count of a term in a document.
>>
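>>**Inverse Document Frequency** is defined as the logarithm of the total number of documents divided by the number of documents containing the word, i.e. $\mathrm{idf}(t) = \log\frac{N}{\mathrm{df}(t)}$ (scikit-learn uses a smoothed variant of this formula by default).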
>

In [None]:
# Initializing vectorizer & transforming the text
#
# The vocabulary is learned from the training text only, so no information from
# the test split leaks into the features.
# max_features = 500 keeps the 500 terms with the highest term frequency across
# the corpus (i.e. the most frequent terms, not the rarest ones).
vectorizer = TfidfVectorizer(max_features = 500)
Y = vectorizer.fit_transform(X_train['Filtered_Text'])

# Persist the fitted vectorizer; the context manager guarantees the file is
# flushed and closed even if pickling fails.
with open("temp_vectorizer.pickle", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

print("Shape : ", Y.shape)



In [None]:
# Project the test complaints onto the vocabulary learned from the training set.
# (The former `.T` was dropped: transposing a Series is a no-op.)
Z = vectorizer.transform(X_test['Filtered_Text'])
print("Shape : ", Z.shape)

### **Training models & evaluating their performance**

***

In [None]:
# Training decision tree classifier
# criterion may be 'gini' or 'entropy'; the depth cap limits over-fitting and
# the fixed random_state makes the tree reproducible.
model_type_1 = DecisionTreeClassifier(criterion = 'gini', max_depth = 10, random_state = 20)
model = model_type_1.fit(Y,target_train)
output = model.predict(Z)

# The confusion matrix must be printed explicitly: it is not the last
# expression of the cell, so a bare call would be silently discarded.
print(confusion_matrix(target_test, output))
print("\n Accuracy Score : ",accuracy_score(target_test,output))


In [None]:
# Smoke-test the decision tree on a single complaint (the fifth row of `data`).
# .iloc selects by position, so this works whatever the index labels are.
W = vectorizer.transform([data['Filtered_Text'].iloc[4]])
ans = model.predict(W)
print(ans)

# Persist the fitted model in binary pickle format (not human-readable); the
# context manager closes the file handle deterministically.
filename = 'temp_pickle_dtc.sav'
with open(filename, "wb") as model_file:
    pickle.dump(model, model_file)

In [None]:
# Class distribution of the held-out targets (rendered as the cell's output).
target_test.value_counts()

In [None]:
# Random forest: 10 trees, seeded so the result is reproducible.
model_type_2 = RandomForestClassifier(n_estimators = 10, random_state = 20)
model = model_type_2.fit(Y, target_train)
output = model.predict(Z)

# Evaluate on the held-out split.
rf_confusion = confusion_matrix(target_test, output)
rf_accuracy = accuracy_score(target_test, output)
print(rf_confusion)
print("\n Accuracy Score : ", rf_accuracy)


In [None]:
# Persist the fitted random forest in binary pickle format (not human-readable).
# The context manager closes the file handle instead of leaving it to the GC.
filename = 'temp_pickle_rfc.sav'
with open(filename, "wb") as model_file:
    pickle.dump(model, model_file)

In [None]:
# Gaussian Naive Bayes needs dense input, so the sparse TF-IDF matrices are
# densified before fitting and predicting.
dense_train_features = Y.toarray()
dense_test_features = Z.toarray()

model_type_3 = GaussianNB()
model = model_type_3.fit(dense_train_features, target_train)
output = model.predict(dense_test_features)

nb_confusion = confusion_matrix(target_test, output)
nb_accuracy = accuracy_score(target_test, output)
print(nb_confusion)
print("\n Accuracy Score : ", nb_accuracy)


In [None]:
# Persist the fitted Naive Bayes model in binary pickle format (not human-readable).
# The context manager closes the file handle instead of leaving it to the GC.
filename = 'temp_pickle_nbc.sav'
with open(filename, "wb") as model_file:
    pickle.dump(model, model_file)

In [None]:
# Training using Passive Aggressive classifier
#
# The estimator shuffles the training data between epochs by default, so a
# fixed random_state is required for reproducible scores (same seed as the
# tree-based models above).
model_type_4 = PassiveAggressiveClassifier(random_state = 20)
model = model_type_4.fit(Y,target_train)
output = model.predict(Z)

# Evaluate on the held-out split.
print(confusion_matrix(target_test,output))
print("\n Accuracy Score : ",accuracy_score(target_test,output))


In [None]:
# Smoke-test the current model on a single complaint (the fifth row of `data`).
# .iloc selects by position, so this works whatever the index labels are.
W = vectorizer.transform([data['Filtered_Text'].iloc[4]])
ans = model.predict(W)
print(ans)

# Persist the fitted model in binary pickle format (not human-readable); the
# context manager closes the file handle deterministically.
filename = 'temp_pickle_pac.sav'
with open(filename, "wb") as model_file:
    pickle.dump(model, model_file)
print('Model dumped successfully into a file by Pickle.....\n')

In [None]:
# Gradient-boosted trees: 50 boosting rounds, each tree at most 7 levels deep.
model_type_5 = XGBClassifier(max_depth = 7, n_estimators = 50)
model = model_type_5.fit(Y, target_train)
output = model.predict(Z)

# Evaluate on the held-out split.
xgb_confusion = confusion_matrix(target_test, output)
xgb_accuracy = accuracy_score(target_test, output)
print(xgb_confusion)
print("\n Accuracy Score : ", xgb_accuracy)



In [None]:
# Show the fitted estimator's configuration, then persist it.
print(model)

# `pickle` is already imported in the notebook's first cell, so it is not
# re-imported here. The model is written in binary pickle format (not
# human-readable); the context manager closes the file handle deterministically.
filename = 'temp_pickle_xgboost.sav'
with open(filename, "wb") as model_file:
    pickle.dump(model, model_file)
print('Model dumped successfully into a file by Pickle.....\n')

> ### Analysing the Predictions

In [None]:
# Test-set features side by side with the true class and the class predicted
# by the most recently trained model.
# Actual_Class is aligned on the original row labels; Predicted_Class is
# positional (it is a plain array in test-row order).
New_output = X_test.copy(deep = True).assign(
    Actual_Class = target_test,
    Predicted_Class = output,
)

# Re-label the rows 1..n for readability.
row_total = New_output.shape[0]
New_output.index = np.arange(1, row_total + 1)

print("Shape : ", New_output.shape)

In [None]:
# Rows where the prediction disagrees with the ground truth.
is_misclassified = New_output['Actual_Class'].ne(New_output['Predicted_Class'])
data_2 = New_output[is_misclassified]
print("\nShape of data whose Actual & predicted class differs = " + str(data_2.shape))

# Keep at most five misclassified rows per actual class.
data_2 = data_2.groupby('Actual_Class').head(5)
print("\n\nTaking 5 samples of each wrongly predicted complaint" + str(data_2.shape))

class_columns = ['Actual_Class', 'Predicted_Class']
print(data_2[class_columns])
# print(data_2[['Complaint','Actual_Class', 'Predicted_Class']])

In [None]:
# Print the full text of the first five misclassified complaints, separated by
# blank lines, to see why the model gets them wrong.
s1 = data_2['Complaint'].head(5)
for complaint_text in s1:
    print(complaint_text, end = "\n"*4)

# Widen the pandas display limits so long complaints are not truncated in the
# cells that follow.
display_limits = (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.max_colwidth', 5000),
)
for option_name, option_value in display_limits:
    pd.set_option(option_name, option_value)

In [None]:
# Analysing top 5 complaints to understand how our model is making correct predictions
#
# Both comparisons need their own parentheses: `&` binds tighter than `==`, so
# without them the expression is evaluated as
# Actual == (Predicted & (Actual == 0)) and also lets wrong predictions through.
is_correct = New_output['Actual_Class'] == New_output['Predicted_Class']
is_class_zero = New_output['Actual_Class'] == 0
correct_predicted_complaint = New_output.loc[is_correct & is_class_zero]

# First five matching complaints, re-labelled 0..4 for the cells that follow.
data_3 = correct_predicted_complaint['Complaint'].head(5).reset_index(drop = True)
print(data_3)


In [None]:
# Inspect the model parameters: the ten most influential vocabulary terms.
# Left as the cell's last expression so the notebook renders eli5's rich output.
eli5.show_weights(
    model,
    vec = vectorizer,
    top = 10,
)



In [None]:
# Explain how the model arrived at its prediction for the first misclassified complaint.
misclassified_example = s1.iloc[0]
eli5.show_prediction(model, vec = vectorizer, doc = misclassified_example)


In [None]:
# Product name for every class code, in code order (position i <-> code i), to
# understand what the 'Predicted_Class' code represents.
# The codes are derived from the fitted encoder rather than hard-coded as
# [0..4], so the lookup stays correct however many products are present.
encoder.inverse_transform(np.arange(len(encoder.classes_)))

In [None]:
# Visualise how the model predicts for the first correctly classified complaint.
correct_example = data_3.iloc[0]
eli5.show_prediction(model, vec = vectorizer, doc = correct_example)

   ###    0: Checking or savings account
   ###    1: Credit card or prepaid card
   ###    2: Credit reporting
   ###    3: Mortgage
   ###    4: Student loan