# Importing Libraries and Data

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from tqdm import tqdm

In [None]:
# Load only the three columns this notebook uses from the complaints export.
complaints_csv = "drive/MyDrive/Colab/complaints.csv"
wanted_columns = ["Product", "Consumer complaint narrative", "Date received"]
dataset = pd.read_csv(complaints_csv, usecols=wanted_columns)
print(dataset.shape)

# Analysis

From the data analysis, we know that over the last 3 years complaints were registered for only 9 categories, so we will build the classifier for only these 9 classes.

1. Payday loan, title loan, or personal loan
2. Checking or savings account
3. Credit reporting, credit repair services, or other personal consumer reports
4. Credit card or prepaid card
5. Money transfer, virtual currency, or money service
6. Mortgage
7. Debt collection
8. Vehicle loan or lease
9. Student loan

In [None]:
# Keep complaints received from 2019-01-01 up to (but excluding) 2022-01-01.
# The column is compared against date strings, so this assumes ISO-style
# YYYY-MM-DD values -- TODO confirm against the CSV.
received = dataset["Date received"]
in_window = (received >= "2019-01-01") & (received < "2022-01-01")
data = dataset[in_window]
print(data.shape)

Since the data for the last 3 years has 1 million entries, we will start with a much smaller subset (December 2021 only) and work our way up.

In [None]:
# Narrow the working set to complaints received in December 2021
# (lower bound inclusive, upper bound exclusive). This replaces the
# three-year selection bound to `data` above.
received = dataset["Date received"]
in_december_2021 = (received >= "2021-12-01") & (received < "2022-01-01")
data = dataset[in_december_2021]
print(data.shape)

In [None]:
# Keep only the label column and the free-text narrative. The explicit copy
# makes `data` an independent frame, so the in-place clean-up in the following
# cells does not operate on a slice of `dataset` (which pandas may flag with
# SettingWithCopyWarning).
data = data[["Product", "Consumer complaint narrative"]].copy()

In [None]:
# Remove rows with a missing product or narrative, then exact duplicate rows.
data = data.dropna().drop_duplicates()
data.shape

In [None]:
# Renumber rows 0..n-1 after filtering; the old index is discarded.
data = data.reset_index(drop=True)

In [None]:
# Number of complaints per product, most frequent first.
product_counts = data["Product"].value_counts()
product_counts

In [None]:
# Horizontal bar chart of complaints per product. The counts are reversed so
# the most frequent product ends up at the top of the chart.
counts_ascending = data["Product"].value_counts().iloc[::-1]
counts_ascending.plot.barh(title="Complains")
plt.show()

In [None]:
import nltk
# Resources needed below: the English stop word list and the Punkt tokenizer
# models used by word_tokenize.
nltk.download('stopwords')
nltk.download("punkt")
# Recent NLTK releases load the Punkt models from the "punkt_tab" resource
# instead of "punkt"; fetch both so tokenisation works on either version.
nltk.download("punkt_tab")

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
# English stop words plus "xxxx", which is filtered out of the narratives as
# well. NOTE(review): "xxxx" looks like the dataset's redaction placeholder --
# confirm.
stop_words = stopwords.words("english") + ["xxxx"]

In [None]:
# Tokenise every narrative: lower-case it, keep purely alphabetic tokens and
# drop stop words. The result is one list of tokens per narrative, in row order.
# Membership is tested against a set so each lookup is O(1) instead of a scan
# of the whole stop word list for every token.
stop_word_set = set(stop_words)
processed_text = []
for text in tqdm(data["Consumer complaint narrative"]):
  tokens = [word for word in word_tokenize(text.lower()) if word.isalpha() and word not in stop_word_set]
  processed_text.append(tokens)

### Finding Most Used Words for a Product

In [None]:
# Work on a copy so the cleaned narratives written below do not alter `data`.
product_id_df = data.copy()

In [None]:
# Turn each token list back into a single space-separated string.
temp = [" ".join(tokens) for tokens in tqdm(processed_text)]

In [None]:
# Replace the raw narratives with their cleaned, space-joined token strings.
# `temp` has one entry per token list in `processed_text`, which was built
# from the rows of `data`, so the rows line up positionally.
product_id_df["Consumer complaint narrative"] = temp

In [None]:
# Cleaning can make narratives that differed before identical; drop those
# duplicate rows (the first occurrence is kept).
product_id_df = product_id_df.drop_duplicates()

In [None]:
# (rows, columns) remaining after de-duplicating the cleaned narratives.
product_id_df.shape

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF over unigrams and bigrams of the cleaned narratives.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),   # single words and word pairs
    min_df=10,            # ignore terms found in fewer than 10 narratives
    sublinear_tf=True,    # use 1 + log(tf) instead of the raw term count
    lowercase=False,      # text was already lower-cased during tokenisation
)

In [None]:
# Learn the vocabulary/IDF weights from the cleaned narratives and build the
# document-term matrix, one row per narrative.
features = tfidf.fit_transform(product_id_df["Consumer complaint narrative"])

In [None]:
# Convert the TF-IDF matrix to a dense NumPy array.
# NOTE(review): the dense form holds rows x vocabulary values in memory, so it
# may not fit when scaling beyond one month of data; GaussianNB below is
# presumably the reason a dense array is needed -- confirm.
features = features.toarray()

In [None]:
# Target vector: the product name for each row of `features`.
labels = product_id_df["Product"]

In [None]:
from sklearn.feature_selection import chi2

In [None]:
# For every product, print the N unigrams and N bigrams whose chi-squared
# statistic against the one-vs-rest indicator `labels == product` is highest.
N = 3
# The vocabulary and each term's n-gram size do not depend on the product,
# so compute them once instead of on every loop iteration.
feature_names = tfidf.get_feature_names_out()
ngram_sizes = np.array([len(name.split(' ')) for name in feature_names])
for product in product_id_df["Product"].unique():
  chi2_scores = chi2(features, labels == product)[0]
  # Ascending order, so the strongest terms are at the end.
  order = np.argsort(chi2_scores)
  ranked_names = feature_names[order]
  ranked_sizes = ngram_sizes[order]
  unigrams = list(ranked_names[ranked_sizes == 1])
  bigrams = list(ranked_names[ranked_sizes == 2])
  print("==>{}:".format(product))
  print(" Top {} unigrams".format(N), unigrams[-N:])
  print(" Top {} biigrams".format(N), bigrams[-N:])

In [None]:
from wordcloud import WordCloud

In [None]:
# Draw one word cloud per product from its cleaned narratives.
# Products are taken from `product_id_df` -- the same frame that is filtered
# below and that the chi-squared cell iterates over.
for product in product_id_df["Product"].unique():
  narratives = product_id_df.loc[product_id_df["Product"] == product, "Consumer complaint narrative"]
  text = " ".join(narratives)
  if not text.strip():
    # Nothing left after cleaning; WordCloud cannot draw a cloud from empty text.
    continue
  wordcloud = WordCloud().generate(text)
  plt.imshow(wordcloud)
  plt.title(product)
  plt.show()

# Creating Classification Model based on Narrative

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# 80/20 train/test split. The fixed random_state makes the split -- and with
# it the scores, reports and the pickled model below -- reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(features, labels, train_size=0.8, random_state=42)

In [None]:
from sklearn.naive_bayes import GaussianNB
# Unfitted Gaussian naive Bayes classifier with default settings.
nvb_model = GaussianNB()

In [None]:
# Fit naive Bayes on the training split.
nvb_model.fit(x_train, y_train)

In [None]:
# Mean accuracy of naive Bayes on the held-out test split.
nvb_model.score(x_test, y_test)

In [None]:
from sklearn.svm import LinearSVC
# Linear support vector classifier with default settings, fitted on the
# training split (fit returns the estimator itself).
svm_model = LinearSVC().fit(x_train, y_train)
svm_model

In [None]:
# Mean accuracy of the linear SVM on the held-out test split.
svm_model.score(x_test, y_test)

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Per-class precision/recall/F1 for naive Bayes on the test split.
# No target_names are passed: the report lists classes in sorted label order,
# while data["Product"].unique() is in order of first appearance, so using it
# as target_names attached the wrong product name to each row. Without
# target_names the report prints the actual label values.
nvb_predictions = nvb_model.predict(x_test)
nvb_report = classification_report(y_test, nvb_predictions)
print(nvb_report)

In [None]:
# Per-class precision/recall/F1 for the linear SVM on the test split.
svm_predictions = svm_model.predict(x_test)
svm_report = classification_report(y_test, svm_predictions)
print(svm_report)

In [None]:
import seaborn as sns
from sklearn.metrics import confusion_matrix

In [None]:
# Confusion matrix for naive Bayes on the test split.
# Rows/columns of the matrix follow the sorted set of labels seen in y_test and
# the predictions; the tick labels must use that same order.
# data["Product"].unique() is in order of appearance and mislabelled the axes.
nvb_predictions = nvb_model.predict(x_test)
nvb_class_names = np.union1d(y_test, nvb_predictions)
nvb_conf_mat = confusion_matrix(y_test, nvb_predictions, labels=nvb_class_names)
sns.heatmap(nvb_conf_mat, annot=True,  fmt='d', xticklabels=nvb_class_names, yticklabels=nvb_class_names)
plt.ylabel("Actual")
plt.xlabel("Predicted")

In [None]:
# Confusion matrix for the linear SVM on the test split.
# Rows/columns of the matrix follow the sorted set of labels seen in y_test and
# the predictions; the tick labels must use that same order.
# data["Product"].unique() is in order of appearance and mislabelled the axes.
svm_predictions = svm_model.predict(x_test)
svm_class_names = np.union1d(y_test, svm_predictions)
svm_conf_mat = confusion_matrix(y_test, svm_predictions, labels=svm_class_names)
sns.heatmap(svm_conf_mat, annot=True,  fmt='d', xticklabels=svm_class_names, yticklabels=svm_class_names)
plt.ylabel("Actual")
plt.xlabel("Predicted")

**This is a very simple model trained with Naive Bayes and SVM.**
The classification problem can be approached in many different ways.
1. This is an imbalanced multi-class dataset, so cross-validation (preferably stratified cross-validation) should be applied.
2. Different type of embeddings i.e. Word embedding, Sentence embedding.
3. Deep Learning can be used to create classification models.
4. Pre-trained Embedding and Pre-trained Models can be used.

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# Number of cross-validation folds used for every model comparison below.
CV = 5
# Fold-wise accuracy of a fresh Gaussian naive Bayes model on the full data.
nvb_accuracies = cross_val_score(
    estimator=GaussianNB(),
    X=features,
    y=labels,
    cv=CV,
    scoring="accuracy",
)

In [None]:
# Fold-wise accuracy of a fresh linear SVM on the full data.
svm_accuracies = cross_val_score(
    estimator=LinearSVC(),
    X=features,
    y=labels,
    cv=CV,
    scoring="accuracy",
)

In [None]:
from sklearn.linear_model import LogisticRegression
# Fold-wise accuracy of a default logistic regression on the full data.
lr_accuracies = cross_val_score(
    estimator=LogisticRegression(),
    X=features,
    y=labels,
    cv=CV,
    scoring="accuracy",
)

In [None]:
# Mean and spread of the naive Bayes cross-validation accuracies.
nvb_mean, nvb_std = nvb_accuracies.mean(), nvb_accuracies.std()
print("Naive bayes:\n     Accuracy={}\n     Std.Dev ={}".format(nvb_mean, nvb_std))

In [None]:
# Mean and spread of the SVM cross-validation accuracies.
svm_mean, svm_std = svm_accuracies.mean(), svm_accuracies.std()
print("SVM:\n     Accuracy={}\n     Std.Dev ={}".format(svm_mean, svm_std))

In [None]:
# Mean and spread of the logistic regression cross-validation accuracies.
lr_mean, lr_std = lr_accuracies.mean(), lr_accuracies.std()
print("Logistic Regression:\n     Accuracy={}\n     Std.Dev ={}".format(lr_mean, lr_std))

In [None]:
import pickle
# Persist the fitted SVM. The context manager closes (and thereby flushes) the
# file even if pickling fails; the bare open() used before never closed it.
with open("drive/MyDrive/Colab/classification_model", "wb") as model_file:
  pickle.dump(svm_model, model_file)

In [None]:
# Persist the fitted TF-IDF vectorizer next to the model; new text has to be
# transformed with this same vectorizer before it is passed to the classifier.
# The context manager guarantees the file handle is closed.
with open("drive/MyDrive/Colab/vectorizer", "wb") as vectorizer_file:
  pickle.dump(tfidf, vectorizer_file)

We trained models with 3 classification algorithms. SVM takes less time and provides higher accuracy than the other two. <br>LR took much longer and gave lower accuracy.