**Down Sampling + Binary**

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from imblearn.under_sampling import RandomUnderSampler
import ast

# Load the CSV file
csv_file_path = "C:\\Users\\DELL\\Desktop\\USE\\AI-enabled-Reports-Analysis-Software\\Dataset.csv"
df = pd.read_csv(csv_file_path)

df['Embedding'] = df['Embedding'].apply(ast.literal_eval)

X = df['Embedding'].tolist()
y = df['label'].tolist()

# Calculate the count of "1" and "0" labels in the original dataset
original_counts = pd.Series(y).value_counts()

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=15, stratify=y)

# Convert "Approved" class to 1, and others to 0
y_train_binary = ['1' if label == 'Approved' else '0' for label in y_train]

# Apply undersampling to the "Approved" class in the training data
undersampler = RandomUnderSampler(sampling_strategy={'1': 60})
X_train_final, y_train_final = undersampler.fit_resample(X_train, y_train_binary)

# Calculate the count of "1" and "0" labels in the undersampled dataset
undersampled_counts = pd.Series(y_train_final).value_counts()

model = LogisticRegression()
model.fit(X_train_final, y_train_final)

# Convert back to original labels for prediction
y_test_binary = ['1' if label == 'Approved' else '0' for label in y_test]
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test_binary, y_pred)
classification_rep = classification_report(y_test_binary, y_pred)

print("Original Label Counts:\n", original_counts)
print("Undersampled Label Counts:\n", undersampled_counts)
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)


Original Label Counts:
 Approved    198
Pending      30
Reject        8
Name: count, dtype: int64
Undersampled Label Counts:
 1    60
0    30
Name: count, dtype: int64
Accuracy: 0.8333333333333334
Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.12      0.20         8
           1       0.85      0.97      0.91        40

    accuracy                           0.83        48
   macro avg       0.67      0.55      0.55        48
weighted avg       0.79      0.83      0.79        48



**DownSampling + Multinomial**


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from imblearn.under_sampling import RandomUnderSampler
import ast

# Load the CSV file
csv_file_path = "C:\\Users\\DELL\\Desktop\\USE\\AI-enabled-Reports-Analysis-Software\\Dataset.csv"
df = pd.read_csv(csv_file_path)

df['Embedding'] = df['Embedding'].apply(ast.literal_eval)

X = df['Embedding'].tolist()
y = df['label'].tolist()

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply undersampling only to the "Approved" class in the training data
undersampler = RandomUnderSampler(sampling_strategy={'Approved': 50})
X_train_final, y_train_final = undersampler.fit_resample(X_train, y_train)

model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1300)
model.fit(X_train_final, y_train_final)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)


Accuracy: 0.8333333333333334
Classification Report:
               precision    recall  f1-score   support

    Approved       0.84      0.97      0.90        39
     Pending       0.67      0.29      0.40         7
      Reject       0.00      0.00      0.00         2

    accuracy                           0.83        48
   macro avg       0.50      0.42      0.43        48
weighted avg       0.78      0.83      0.79        48



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**UpSampling + Binary** 

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
import ast

# Load the CSV file
csv_file_path = "C:\\Users\\DELL\\Desktop\\USE\\AI-enabled-Reports-Analysis-Software\\Dataset.csv"
df = pd.read_csv(csv_file_path)

df['Embedding'] = df['Embedding'].apply(ast.literal_eval)

X = df['Embedding'].tolist()
y = df['label'].tolist()

# Combine 'Pending' and 'Reject' labels into one class (0), and keep 'Approve' as another class (1)
y_binary = [0 if label in ['Reject', 'Pending'] else 1 for label in y]

# Split the original data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.2, random_state=42)

# Apply SMOTE oversampling to the 'Pending' and 'Reject' class in the training data only
oversampler = SMOTE(sampling_strategy={0: 90})
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)

# Train a logistic regression model for binary classification
model = LogisticRegression(solver='lbfgs', max_iter=1300)
model.fit(X_train_resampled, y_train_resampled)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)


Accuracy: 0.8541666666666666
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.33      0.46         9
           1       0.86      0.97      0.92        39

    accuracy                           0.85        48
   macro avg       0.81      0.65      0.69        48
weighted avg       0.84      0.85      0.83        48

