# Importing Libraries and Dataset

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from tqdm import tqdm

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Load the raw complaints table from a path relative to the working directory.
# NOTE(review): the "drive/MyDrive/Colab" prefix presumably refers to a mounted
# Google Drive in Colab - confirm before running elsewhere.
dataset = pd.read_csv("drive/MyDrive/Colab/complaints.csv")

In [None]:
# Show the column labels of the raw table.
dataset.columns

# Analysis

Data analysis showed that no dispute data is available for the last 3 years.

In [None]:
# Name of the label column that the models below predict.
Target_Column = "Consumer disputed?"

In [None]:
# Running list of columns to exclude from the feature matrix; later cells
# append to it as each column is reviewed.
columns_to_be_dropped = ["Complaint ID"]

In [None]:
dataset['Consumer disputed?'].value_counts()

In [None]:
dataset['Consumer disputed?'].isna().sum()

In [None]:
data = dataset.copy()

In [None]:
indices = data["Consumer disputed?"].dropna().index

In [None]:
data = data.loc[indices].reset_index(drop=True)

In [None]:
data.isna().sum()

### Checking Delay in Complaint Moving Forward

In [None]:
# Parse both date columns into datetime64 values.
# `infer_datetime_format` is deprecated since pandas 2.0 (format inference is
# the default behaviour there), so it is omitted to avoid the warning.
data["Date received"] = pd.to_datetime(data["Date received"])
data["Date sent to company"] = pd.to_datetime(data["Date sent to company"])

In [None]:
sum(data["Date received"] != data["Date sent to company"])

In [None]:
data["Day Difference"] = pd.to_numeric((data["Date sent to company"] - data["Date received"])/np.timedelta64(1, 'D'))

In [None]:
columns_to_be_dropped.extend(["Date received", "Date sent to company"])

### Checking Textual Data

In [None]:
# Cardinality of every column; a missing value counts as one distinct value.
for col in data.columns:
  print(f"{col} {data[col].nunique(dropna=False)}")

In [None]:
# Disabled experiment: the code sits inside a bare string literal, so the loop
# itself is never run. It appears intended to add a 0/1 "is missing" indicator
# for each sparse text column before dropping the column.
# NOTE(review): if this is re-enabled, `varname` should be built from the
# column name (col + "_isnan"); `data[col]+"_isnan"` appends the suffix to the
# column's values instead of producing a column label.
"""
for col in ["Consumer complaint narrative", "Sub-issue", "Sub-product", "Consumer consent provided?", "Company public response"]:
  varname = data[col]+"_isnan"
  data[varname] = np.where(pd.isnull(data[col]),1,0)
  columns_to_be_dropped.append(col)
"""

In [None]:
for col in ["Consumer complaint narrative", "Sub-issue", "Sub-product", "Consumer consent provided?", "Company public response"]:
  columns_to_be_dropped.append(col)

In [None]:
columns_to_be_dropped.append("ZIP code")
columns_to_be_dropped.append("Company")

### Checking Products

In [None]:
dataset["Product"].unique()

In [None]:
data["Product"].unique()

In [None]:
products_dropped = ["Consumer Loan", "Bank account or service", "Credit reporting", "Credit card", 
                    "Other financial service", "Money transfers", "Payday loan", "Prepaid card", "Virtual currency"]

for product in products_dropped:
  data.drop(data[data["Product"]==product].index, inplace=True)

del product, products_dropped

In [None]:
data["Product"].unique()

### Checking Whether Tags affect Dispute

In [None]:
# Frequency of each non-missing tag.
data["Tags"].value_counts()

In [None]:
# Number of rows without a tag.
data["Tags"].isna().sum()

In [None]:
# Fraction of rows without a tag.
data["Tags"].isna().sum()/len(data)

In [None]:
# Joint counts of (tag, dispute label) pairs.
data[["Tags", Target_Column]].value_counts()

In [None]:
# Share of disputed ("Yes") and undisputed ("No") complaints for every tag.
# The label column is addressed through Target_Column (the original used an
# undefined name, so every iteration failed and was silently swallowed).
# Missing tags are skipped up front: NaN never compares equal to anything, so
# it would select zero rows and make the ratio undefined.
tag = tag_outcomes = None  # keeps the trailing `del` valid if there is no tag
for tag in data["Tags"].dropna().unique():
  tag_outcomes = data.loc[data["Tags"] == tag, Target_Column]
  print(tag, "Yes", (tag_outcomes == "Yes").mean())
  print(tag, "No", (tag_outcomes == "No").mean())
del tag, tag_outcomes

Whether a consumer disputed shows no relation to the Tags column, as each tag has approximately the same (about 80%) chance of dispute. In addition, 85.88% of the Tags data is missing.<br>
Since this is also not a mandatory column, we will drop it.

In [None]:
# Queue the Tags column for removal from the feature matrix.
columns_to_be_dropped.append("Tags")

### Checking For Issues

In [None]:
data["Issue"].value_counts()[:20]

In [None]:
for issue in tqdm(data["Issue"].value_counts().index[:10]):
  varname = "Issue_{}".format(issue)
  data[varname] = np.where(data["Issue"]==issue, 1, 0)
del varname

In [None]:
columns_to_be_dropped.append("Issue")

### Checking for State

In [None]:
data["State"].value_counts()[:20]

In [None]:
for state in tqdm(data["State"].value_counts().index[:10]):
  varname = "State_{}".format(state)
  data[varname] = np.where(data["State"]==state, 1, 0)
del varname

In [None]:
columns_to_be_dropped.append("State")

### Categorizing all the columns Left

In [None]:
for col in ["Product", "Submitted via", "Company response to consumer", "Timely response?"]:
  temp = pd.get_dummies(data[col], drop_first=True, prefix=col)
  data = pd.concat([temp, data], axis=1)
  columns_to_be_dropped.append(col)
  del temp

In [None]:
data.info()

# Training Model

In [None]:
target = data[Target_Column]
features = data.drop(columns=columns_to_be_dropped)
features.drop(columns=[Target_Column], inplace=True)

In [None]:
features.drop(columns="Day Difference", inplace=True)

### Splitting Data

In [None]:
# 80/20 split. `stratify` keeps the class ratio of the imbalanced target the
# same in both parts, and `random_state` makes the split (and every score
# computed from it) reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
  features, target, train_size=0.8, stratify=target, random_state=42
)

In [None]:
# Shapes of the four split parts.
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

In [None]:
# Class counts in the training labels.
y_train.value_counts()

In [None]:
# Class counts in the test labels.
y_test.value_counts()

### Training

#### Naive Bayes

In [None]:
nvb_model = GaussianNB()
nvb_model.fit(x_train, y_train)

In [None]:
print("Training Accuracy:",nvb_model.score(x_train, y_train))
print("Testing Accuracy:",nvb_model.score(x_test, y_test))

#### SVM

In [None]:
svm_model = LinearSVC()
svm_model.fit(x_train, y_train)

In [None]:
print("Training Accuracy:", svm_model.score(x_train, y_train))
print("Testing Accuracy:",svm_model.score(x_test, y_test))

#### Logistic Regression

In [None]:
lr_model = LogisticRegression()
lr_model.fit(x_train, y_train)

In [None]:
print("Training Accuracy:", lr_model.score(x_train, y_train))
print("Testing Accuracy:",lr_model.score(x_test, y_test))

#### Decision Tree

In [None]:
decision_tree_model = DecisionTreeClassifier()
decision_tree_model.fit(x_train, y_train)

In [None]:
print("Training Accuracy:", decision_tree_model.score(x_train, y_train))
print("Testing Accuracy:",decision_tree_model.score(x_test, y_test))

### Results

In [None]:
y_pred = nvb_model.predict(x_test)

In [None]:
nvb_conf_mat = confusion_matrix(y_test, y_pred)
sns.heatmap(nvb_conf_mat, annot=True,  fmt='d', xticklabels=data[Target_Column].unique(), yticklabels=data[Target_Column].unique())
plt.ylabel("Actual")
plt.xlabel("Predicted")

In [None]:
nvb_report = classification_report(y_test, y_pred, target_names=data[Target_Column].unique())
print(nvb_report)

In [None]:
y_pred = svm_model.predict(x_test)

In [None]:
svm_conf_mat = confusion_matrix(y_test, y_pred)
sns.heatmap(svm_conf_mat, annot=True,  fmt='d', xticklabels=data[Target_Column].unique(), yticklabels=data[Target_Column].unique())
plt.ylabel("Actual")
plt.xlabel("Predicted")

In [None]:
svm_report = classification_report(y_test, y_pred, target_names=data[Target_Column].unique())
print(svm_report)

In [None]:
y_pred = lr_model.predict(x_test)

In [None]:
lr_conf_mat = confusion_matrix(y_test, y_pred)
sns.heatmap(lr_conf_mat, annot=True,  fmt='d', xticklabels=data[Target_Column].unique(), yticklabels=data[Target_Column].unique())
plt.ylabel("Actual")
plt.xlabel("Predicted")

In [None]:
lr_report = classification_report(y_test, y_pred, target_names=data[Target_Column].unique())
print(lr_report)

In [None]:
y_pred = decision_tree_model.predict(x_test)

In [None]:
decision_tree_conf_mat = confusion_matrix(y_test, y_pred)
sns.heatmap(decision_tree_conf_mat, annot=True,  fmt='d', xticklabels=data[Target_Column].unique(), yticklabels=data[Target_Column].unique())
plt.ylabel("Actual")
plt.xlabel("Predicted")

In [None]:
decision_tree_report = classification_report(y_test, y_pred, target_names=data[Target_Column].unique())
print(decision_tree_report)

### **Observations**
1. SVM and Logistic Regression do not identify even a single case of the "Yes" class. They are heavily biased towards the majority class, so they cannot be considered as models.
2. Except for Naive Bayes, which scores 0.86, all the models show the same 0.76 score for false negative recall and an accuracy of approximately 80%.
3. Decision Tree is the only model that can be considered, as it has good accuracy and recall and is still able to identify a handful of minority cases.

# Re-Training Models

In [None]:
data.drop(columns=columns_to_be_dropped, inplace=True)

In [None]:
data.drop(columns="Day Difference", inplace=True)

### Balancing Data

In [None]:
temp_yes_data = data[data["Consumer disputed?"]=="Yes"]
temp_no_data = data[data["Consumer disputed?"]=="No"].sample(len(temp_yes_data))

In [None]:
print(temp_yes_data.shape)
print(temp_no_data.shape)

In [None]:
training_data = pd.concat([temp_yes_data, temp_no_data], axis=0)

In [None]:
train_labels = training_data[Target_Column]
training_data.drop(columns=Target_Column, inplace=True)
train_features = training_data

In [None]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(train_features, train_labels, train_size=0.75)

In [None]:
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

### Training

In [None]:
nvb_model = GaussianNB()
nvb_model.fit(x_train, y_train)
nvb_model.score(x_train, y_train)

In [None]:
svm_model = LinearSVC()
svm_model.fit(x_train, y_train)
svm_model.score(x_train, y_train)

In [None]:
lr_model = LogisticRegression()
lr_model.fit(x_train, y_train)
lr_model.score(x_train, y_train)

In [None]:
decision_tree_model = DecisionTreeClassifier()
decision_tree_model.fit(x_train, y_train)
decision_tree_model.score(x_train, y_train)

### Validating

In [None]:
nvb_conf_mat = confusion_matrix(y_test, nvb_model.predict(x_test))
print("Testing Accuracy;", nvb_model.score(x_test, y_test))
print(nvb_conf_mat)

In [None]:
svm_conf_mat = confusion_matrix(y_test, svm_model.predict(x_test))
print("Testing Accuracy;", svm_model.score(x_test, y_test))
print(svm_conf_mat)

In [None]:
lr_conf_mat = confusion_matrix(y_test, lr_model.predict(x_test))
print("Testing Accuracy;", lr_model.score(x_test, y_test))
print(lr_conf_mat)

In [None]:
decision_tree_conf_mat = confusion_matrix(y_test, decision_tree_model.predict(x_test))
print("Testing Accuracy;", decision_tree_model.score(x_test, y_test))
print(decision_tree_conf_mat)

### Testing

In [None]:
# Build the evaluation set from rows the models were NOT fitted on. Scoring on
# the full frame would include every row of x_train and inflate the metrics.
# The variable names are kept because the cells below read them.
# NOTE(review): removing the training rows leaves fewer "Yes" rows than the
# full data has, so "Yes" precision here is on the pessimistic side.
temp = data.drop(index=x_train.index)
train_labels = temp[Target_Column]
temp = temp.drop(columns=Target_Column)
train_features = temp

In [None]:
y_pred = nvb_model.predict(train_features)

In [None]:
nvb_report = classification_report(train_labels, y_pred, target_names=data[Target_Column].unique())
print(nvb_report)

In [None]:
nvb_conf_mat = confusion_matrix(train_labels, y_pred)
sns.heatmap(nvb_conf_mat, annot=True,  fmt='d', xticklabels=data[Target_Column].unique(), yticklabels=data[Target_Column].unique())
plt.ylabel("Actual")
plt.xlabel("Predicted")

In [None]:
y_pred = svm_model.predict(train_features)

In [None]:
svm_report = classification_report(train_labels, y_pred, target_names=data[Target_Column].unique())
print(svm_report)

In [None]:
svm_conf_mat = confusion_matrix(train_labels, y_pred)
sns.heatmap(svm_conf_mat, annot=True,  fmt='d', xticklabels=data[Target_Column].unique(), yticklabels=data[Target_Column].unique())
plt.ylabel("Actual")
plt.xlabel("Predicted")

In [None]:
y_pred = lr_model.predict(train_features)

In [None]:
lr_report = classification_report(train_labels, y_pred, target_names=data[Target_Column].unique())
print(lr_report)

In [None]:
lr_conf_mat = confusion_matrix(train_labels, y_pred)
sns.heatmap(lr_conf_mat, annot=True,  fmt='d', xticklabels=data[Target_Column].unique(), yticklabels=data[Target_Column].unique())
plt.ylabel("Actual")
plt.xlabel("Predicted")

In [None]:
y_pred = decision_tree_model.predict(train_features)

In [None]:
decision_tree_report = classification_report(train_labels, y_pred, target_names=data[Target_Column].unique())
print(decision_tree_report)

In [None]:
decision_tree_conf_mat = confusion_matrix(train_labels, y_pred)
sns.heatmap(decision_tree_conf_mat, annot=True,  fmt='d', xticklabels=data[Target_Column].unique(), yticklabels=data[Target_Column].unique())
plt.ylabel("Actual")
plt.xlabel("Predicted")

### **Observations**

1. Training accuracy increased by approximately 15%.
2. All models give a recall score above 83%.
3. Accuracy dropped for all models except Naive Bayes, reaching a maximum of 56%.
4. The models are able to predict both the majority and minority classes.

# Re-Training 2 Models

In [None]:
#data.drop(columns=columns_to_be_dropped, inplace=True)
#data.drop(columns="Day Difference", inplace=True)

### Balancing Ratio

In [None]:
train_size = int(len(data[data["Consumer disputed?"]=="Yes"])*0.8)
train_size

In [None]:
temp_yes_data = data[data["Consumer disputed?"]=="Yes"].sample(train_size)
temp_no_data = data[data["Consumer disputed?"]=="No"].sample(2*len(temp_yes_data))

In [None]:
print(temp_yes_data.shape)
print(temp_no_data.shape)

In [None]:
training_data = pd.concat([temp_yes_data, temp_no_data], axis=0)

In [None]:
train_labels = training_data[Target_Column]
training_data.drop(columns=Target_Column, inplace=True)
train_features = training_data

### Training

In [None]:
nvb_model = GaussianNB()
nvb_model.fit(train_features, train_labels)
nvb_model.score(train_features, train_labels)

In [None]:
svm_model = LinearSVC()
svm_model.fit(train_features, train_labels)
svm_model.score(train_features, train_labels)

In [None]:
lr_model = LogisticRegression()
lr_model.fit(train_features, train_labels)
lr_model.score(train_features, train_labels)

In [None]:
decision_tree_model = DecisionTreeClassifier()
decision_tree_model.fit(train_features, train_labels)
decision_tree_model.score(train_features, train_labels)

### Testing

In [None]:
# Build the evaluation set from rows the models were NOT fitted on. Scoring on
# the full frame would include every sampled training row and inflate the
# metrics. The variable names are kept because the cells below read them.
# NOTE(review): removing the training rows leaves fewer "Yes" rows than the
# full data has, so "Yes" precision here is on the pessimistic side.
temp = data.drop(index=training_data.index)
train_labels = temp[Target_Column]
temp = temp.drop(columns=Target_Column)
train_features = temp

In [None]:
y_pred = nvb_model.predict(train_features)

In [None]:
nvb_report = classification_report(train_labels, y_pred, target_names=data[Target_Column].unique())
print(nvb_report)

In [None]:
nvb_conf_mat = confusion_matrix(train_labels, y_pred)
sns.heatmap(nvb_conf_mat, annot=True,  fmt='d', xticklabels=data[Target_Column].unique(), yticklabels=data[Target_Column].unique())
plt.ylabel("Actual")
plt.xlabel("Predicted")

In [None]:
y_pred = svm_model.predict(train_features)

In [None]:
svm_report = classification_report(train_labels, y_pred, target_names=data[Target_Column].unique())
print(svm_report)

In [None]:
svm_conf_mat = confusion_matrix(train_labels, y_pred)
sns.heatmap(svm_conf_mat, annot=True,  fmt='d', xticklabels=data[Target_Column].unique(), yticklabels=data[Target_Column].unique())
plt.ylabel("Actual")
plt.xlabel("Predicted")

In [None]:
y_pred = lr_model.predict(train_features)

In [None]:
lr_report = classification_report(train_labels, y_pred, target_names=data[Target_Column].unique())
print(lr_report)

In [None]:
lr_conf_mat = confusion_matrix(train_labels, y_pred)
sns.heatmap(lr_conf_mat, annot=True,  fmt='d', xticklabels=data[Target_Column].unique(), yticklabels=data[Target_Column].unique())
plt.ylabel("Actual")
plt.xlabel("Predicted")

In [None]:
y_pred = decision_tree_model.predict(train_features)

In [None]:
decision_tree_report = classification_report(train_labels, y_pred, target_names=data[Target_Column].unique())
print(decision_tree_report)

In [None]:
decision_tree_conf_mat = confusion_matrix(train_labels, y_pred)
sns.heatmap(decision_tree_conf_mat, annot=True,  fmt='d', xticklabels=data[Target_Column].unique(), yticklabels=data[Target_Column].unique())
plt.ylabel("Actual")
plt.xlabel("Predicted")

### **Observations**

1. All models are able to give a recall rate of 80%, and, except for Naive Bayes, 80% accuracy is achieved by SVM, Logistic Regression and Decision Tree.
2. The SVM and Decision Tree models take much less time to train, so they are preferred over Logistic Regression.
3. All models performed better with this training data distribution.

# Improving Model

Many steps can be taken to improve the accuracy and recall of the model. Pre-trained models and deep learning can be used. Feature selection, feature extraction and manual updates to the data can also be done.