In [1]:
# Original class distribution
import matplotlib.pyplot as plt
import numpy as np
def plot_class_distribution(list_, labels, message="Original Class Distribution"):
    """Show a bar chart of how many samples fall into each distinct class.

    Args:
        list_: Sequence of class values, one entry per sample.
        labels: Accepted for signature compatibility; this function does not
            read it.
        message: Title drawn above the chart.
    """
    classes, sample_counts = np.unique(list_, return_counts=True)
    # Two bar colours by default; a third is supplied only when there are
    # exactly three distinct classes.
    palette = ['green', 'yellow', 'red'] if len(classes) == 3 else ['green', 'yellow']

    plt.bar(classes, sample_counts, color=palette)
    plt.xlabel('Class')
    plt.ylabel('Number of Samples')
    plt.title(message)
    plt.show()
    
def plot_class_distribution_pie(list_, labels, message="Original Class Distribution"):
    """Show a pie chart of the share of samples in each distinct class.

    Args:
        list_: Sequence of class values, one entry per sample.
        labels: Wedge labels handed to ``plt.pie``; they are paired
            positionally with the counts returned by ``np.unique``.
        message: Title drawn above the chart.
    """
    _, wedge_sizes = np.unique(list_, return_counts=True)
    wedge_colors = ['green', 'yellow', 'red']
    plt.pie(wedge_sizes, labels=labels, colors=wedge_colors, autopct='%1.1f%%')
    plt.title(message)
    plt.show()

In [2]:
import pandas as pd
def read_csv_data(csv_file):
    """Load the name, label and text columns of a CSV file as Python lists.

    Args:
        csv_file: Path or file-like object accepted by ``pd.read_csv``.

    Returns:
        A ``(names, labels, texts)`` tuple of lists taken from the
        ``'name'``, ``'label'`` and ``'text'`` columns respectively.
    """
    frame = pd.read_csv(csv_file)
    names, labels, texts = (
        frame[column].tolist() for column in ('name', 'label', 'text')
    )
    return names, labels, texts

In [3]:
def count_uinque_labels(labels):
    """Tally decision labels into approved / reject / pending buckets.

    Matching ignores letter case and surrounding whitespace, so a raw value
    such as ``' Approved\\n'`` is counted as approved rather than falling
    through to the catch-all bucket. Any label that is neither ``'approved'``
    nor ``'reject'`` after normalisation is counted as pending.

    The function name keeps its existing spelling because other cells call it
    by this name.

    Args:
        labels: Iterable of label strings.

    Returns:
        dict with the keys ``'approved'``, ``'reject'`` and ``'pending'``
        (in that order) mapped to their counts.
    """
    tally = {'approved': 0, 'reject': 0, 'pending': 0}
    for label in labels:
        # Normalise once per label instead of lower-casing in every comparison.
        key = label.strip().lower()
        # Anything unrecognised lands in the catch-all 'pending' bucket.
        tally[key if key in ('approved', 'reject') else 'pending'] += 1
    return tally


In [4]:
# Load the raw dataset from a relative path and unpack its three columns.
csv_file = 'Dataset.csv'
names, labels, texts = read_csv_data(csv_file)

# Sanity check: class balance of the labels exactly as read from the CSV.
print(count_uinque_labels(labels))


{'approved': 198, 'reject': 8, 'pending': 30}


In [5]:
from Data import Data

def addToData1(names, labels, texts, data_obj, vectorization_technique):
    """Vectorize each text and append one row per document to ``data_obj``.

    The three input sequences are walked in parallel. For every document the
    whitespace-stripped text is passed to ``vectorization_technique``; the
    result is converted with ``.numpy().tolist()`` and stored through
    ``data_obj.add_data`` together with the stripped name and label.

    Args:
        names: Document names (strings).
        labels: Decision labels (strings), aligned with ``names``.
        texts: Document texts (strings), aligned with ``names``.
        data_obj: Container exposing ``add_data(name, vector, label)``.
        vectorization_technique: Callable taking a string and returning an
            object that supports ``.numpy().tolist()``.

    Returns:
        The same ``data_obj`` that was passed in, after all rows were added.
    """
    for raw_name, raw_label, raw_text in zip(names, labels, texts):
        encoded = vectorization_technique(raw_text.strip())
        vector = encoded.numpy().tolist()
        data_obj.add_data(raw_name.strip(), vector, raw_label.strip())
    return data_obj

In [6]:
from USE import apply_USE 
# Vectorize every document's text with apply_USE and collect the rows in a
# fresh Data container (addToData1 fills `obj` and returns it).
obj = Data()
addToData1(names, labels, texts, obj, apply_USE)

<Data.Data at 0x2f9f4662050>

In [7]:
# Pull the document-vector and decision columns back out of the Data container.
X = obj.get_column(obj.col_document_vector)
y = obj.get_column(obj.col_decision)
print("Total number: ",len(X))
# NOTE(review): the earlier comment here read "set pending to reject", but no
# relabelling is performed in this cell; all three classes are kept as-is.

# Per-class counts of the stored labels; read by the resampling cell.
labels_count = count_uinque_labels(y)        
print(labels_count)


Total number:  236
{'approved': 198, 'reject': 8, 'pending': 30}


In [8]:
from imblearn.over_sampling import SMOTE
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler

# Original per-class counts (labels_count uses lower-case keys).
# NOTE(review): `approvals` is not read anywhere else in the visible notebook.
approvals = labels_count['approved']
rejects = labels_count['reject']
pending = labels_count['pending']
# class_proportions = 'auto'
# NOTE(review): up_ratio and down_ratio are not read anywhere else in the
# visible notebook; the targets below are hard-coded absolute counts.
up_ratio=1.3
down_ratio = 1.5/3
# Target sample counts per class. Keys are capitalised, unlike labels_count,
# to match the label values present in y.
# Under-sampling: cap 'Approved' at 100, keep the other classes at their
# original counts.
downsample_class_proportions = {'Approved':100, 'Reject':rejects,'Pending':pending }
# Over-sampling: raise 'Reject' to 50; 'Approved' and 'Pending' targets are
# 100 and 30.
upsample_class_proportions = {'Approved':100, 'Reject':50 , 'Pending':30 }
# downsample_class_proportions = 'auto'
# upsample_class_proportions = 'auto'
# Both samplers are seeded so repeated runs give the same resampled set.
rus = RandomUnderSampler(random_state=42,sampling_strategy=downsample_class_proportions)
smote = SMOTE(random_state=42, sampling_strategy=upsample_class_proportions)

# Step 1: random under-sampling of the full data set.
X_resampled, y_resampled = rus.fit_resample(X, y)
# Step 2: SMOTE over-sampling applied to the under-sampled data (the names
# X_resampled / y_resampled are rebound to the final result).
X_resampled, y_resampled = smote.fit_resample(X_resampled, y_resampled )




In [9]:
# Optional visual comparison of the class distributions (currently disabled).
# original_labels = y
# resampled_labels = y_resampled
# plot_class_distribution_pie(original_labels, labels=['approved', 'pending','rejected'], message="Original Class Distribution")
# plot_class_distribution_pie(resampled_labels, labels=['approved', 'pending','rejected'], message="Resampled Class Distribution")

# plot_class_distribution(original_labels, labels=['Class 0', 'Class 1'], message="Original Class Distribution")
# plot_class_distribution(resampled_labels, labels=['Class 0', 'Class 1'], message="Resampled Class Distribution")
# Textual before/after comparison of the per-class sample counts.
print("Before Resampling: ",Counter(y))
print("After Resampling: ",Counter(y_resampled))



Before Resampling:  Counter({'Approved': 198, 'Pending': 30, 'Reject': 8})
After Resampling:  Counter({'Approved': 100, 'Reject': 50, 'Pending': 30})


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
# Hold out 20% of the resampled data for evaluation (seeded split).
# NOTE(review): the split is taken from X_resampled, which already contains
# the SMOTE-generated samples, so synthetic points can land in the test set
# and the reported scores may be optimistic. Consider splitting first and
# resampling only the training portion. TODO confirm the intended protocol.
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)
# Logistic regression with library-default settings, fitted on the training split.
model = LogisticRegression()
model.fit(X_train, y_train)
# Predictions on the held-out split, consumed by the evaluation cell.
y_pred = model.predict(X_test)

In [11]:
from sklearn.metrics import f1_score, accuracy_score, classification_report

# Text report of the metrics returned by classification_report for the
# held-out predictions.
class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)

Classification Report:
               precision    recall  f1-score   support

    Approved       0.87      0.95      0.91        21
     Pending       1.00      0.50      0.67         6
      Reject       0.90      1.00      0.95         9

    accuracy                           0.89        36
   macro avg       0.92      0.82      0.84        36
weighted avg       0.90      0.89      0.88        36



# Approved × 1/3, Rejected × 1.3, Pending × 1
```
Classification Report:
               precision    recall  f1-score   support

    Approved       0.67      0.86      0.75        14
     Pending       0.50      0.33      0.40         6
      Reject       0.00      0.00      0.00         2

    accuracy                           0.64        22
   macro avg       0.39      0.40      0.38        22
weighted avg       0.56      0.64      0.59        22

```

# Approved × 1.5/3, Rejected auto, Pending auto

```
Classification Report:
               precision    recall  f1-score   support

    Approved       0.90      0.75      0.82        24
     Pending       0.76      0.84      0.80        19
      Reject       0.89      1.00      0.94        17

    accuracy                           0.85        60
   macro avg       0.85      0.86      0.85        60
weighted avg       0.85      0.85      0.85        60

```

# Approved = 100, Rejected = 50, Pending = 30

```
Classification Report:
               precision    recall  f1-score   support

    Approved       0.87      0.95      0.91        21
     Pending       1.00      0.50      0.67         6
      Reject       0.90      1.00      0.95         9

    accuracy                           0.89        36
   macro avg       0.92      0.82      0.84        36
weighted avg       0.90      0.89      0.88        36
```

