In [1]:
# Original class distribution
import matplotlib.pyplot as plt
import numpy as np
def plot_class_distribution(list_,labels,message = "Original Class Distribution"):
    unique, counts = np.unique(list_, return_counts=True)
    colors =['green','yellow']
    if len(unique) == 3 :
        colors.append('red')
        
    plt.bar(unique, counts, color=colors)
    plt.title(message)
    plt.xlabel('Class')
    plt.ylabel('Number of Samples')
    plt.show()
    
def plot_class_distribution_pie(list_, labels, message="Original Class Distribution"):
    unique, counts = np.unique(list_, return_counts=True)
    plt.pie(counts, labels=labels, colors=['green','yellow', 'red'], autopct='%1.1f%%')
    plt.title(message)
    plt.show()  

In [2]:
import pandas as pd
def read_csv_data(csv_file):
    data = pd.read_csv(csv_file)
    names = data['name'].tolist()
    labels = data['label'].tolist()
    texts = data['text'].tolist()
    return names, labels, texts

In [3]:
def count_uinque_labels(labels):
    unique = [0,0,0]
    for label in labels:
        if label.lower() =='approved':
            unique[0] += 1
        elif label.lower() == 'reject':
            unique[1] += 1
        else:
            unique[2] += 1        
    return {'approved':unique[0],'reject':unique[1],'pending':unique[2]}


In [4]:
csv_file = 'Dataset.csv'
names, labels, texts = read_csv_data(csv_file)

print(count_uinque_labels(labels))


{'approved': 198, 'reject': 8, 'pending': 30}


In [5]:
from Data import Data

def addToData1(names, labels, texts, data_obj, vectorization_technique):
    for name, label, text in zip(names, labels, texts):
        vector = vectorization_technique(text.strip()).numpy().tolist()
        data_obj.add_data(name.strip(), vector, label.strip())
    return data_obj

In [6]:
from USE import apply_USE 
obj = Data()
addToData1(names, labels, texts, obj, apply_USE)

In [None]:
X = obj.get_column(obj.col_document_vector)
y = obj.get_column(obj.col_decision)
print("Total number: ",len(X))
# set pending to reject
for i in range(len(y)):
    if y[i] == 'Pending':
        y[i]='Reject'  
labels_count = count_uinque_labels(y)        
print(labels_count)


Total number:  236
{'approved': 198, 'reject': 38, 'pending': 0}


In [None]:
from imblearn.over_sampling import SMOTE
from collections import Counter

approvals = labels_count['approved']
rejects = labels_count['reject']
pending = labels_count['pending']
# class_proportions = 'auto'
upsample_class_proportions = {'Approved':approvals, 'Reject':rejects*2 }

smote = SMOTE(random_state=42, sampling_strategy=upsample_class_proportions)

X_resampled, y_resampled = smote.fit_resample(X, y)




In [None]:
# original_labels = y
# resampled_labels = y_resampled
# plot_class_distribution_pie(original_labels, labels=['approved', 'pending','rejected'], message="Original Class Distribution")
# plot_class_distribution_pie(resampled_labels, labels=['approved', 'pending','rejected'], message="Resampled Class Distribution")

# plot_class_distribution(original_labels, labels=['Class 0', 'Class 1'], message="Original Class Distribution")
# plot_class_distribution(resampled_labels, labels=['Class 0', 'Class 1'], message="Resampled Class Distribution")
print("Before Resampling: ",Counter(y))
print("After Resampling: ",Counter(y_resampled))



Before Resampling:  Counter({'Approved': 198, 'Reject': 38})
After Resampling:  Counter({'Approved': 198, 'Reject': 76})


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import f1_score, accuracy_score, classification_report

class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)

Classification Report:
               precision    recall  f1-score   support

    Approved       0.81      0.97      0.89        36
      Reject       0.92      0.58      0.71        19

    accuracy                           0.84        55
   macro avg       0.87      0.78      0.80        55
weighted avg       0.85      0.84      0.83        55



### **Using auto upsampling for rejected(rejection is upsampled to be equal to approvals)**
`
**Classification Report:**

|           | Precision | Recall | F1-Score | Support |
|:---------:|:---------:|:------:|:--------:|:-------:|
| Approved  |   0.94    |  0.81  |   0.87   |    42   |
|   Reject  |   0.82    |  0.95  |   0.88   |    38   |
| **Macro Avg** |   0.88    |  0.88  |   0.87   |    80   |
| **Weighted Avg** |   0.88    |  0.88  |   0.87   |    80   |

- Accuracy: 0.88


### **Upsampling rejections to its double**

**Classification Report:**

|           | Precision | Recall | F1-Score | Support |
|:---------:|:---------:|:------:|:--------:|:-------:|
| Approved  |   0.81    |  0.97  |   0.89   |    36   |
|   Reject  |   0.92    |  0.58  |   0.71   |    19   |
| **Macro Avg** |   0.87    |  0.78  |   0.80   |    55   |
| **Weighted Avg** |   0.85    |  0.84  |   0.83   |    55   |

- Accuracy: 0.84

