In [1]:
# Import the necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE

# Load the dataset from a local file
df = pd.read_csv("creditcard.csv")

# Split the dataset into features (X) and class labels (y)
X = df.drop("Class", axis=1)
y = df["Class"]

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a classifier on the training set using all available features
clf = LogisticRegression(max_iter=1500)
clf.fit(X_train, y_train)

# Use the trained classifier to predict the class labels of the testing set
y_pred = clf.predict(X_test)

# Calculate the classification performance metrics such as AUC-Roc, accuracy, precision, recall, and F1-score.
auc_roc = roc_auc_score(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the performance metrics of the full feature set
print("Performance metrics of the full feature set (without SMOTE):")
print(f"AUC-ROC: {auc_roc}")
print(f"Accuracy: {acc}")
print(f"Precision: {prec}")
print(f"Recall: {rec}")
print(f"F1-score: {f1}")

# Apply SMOTE to the training set
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Train a classifier on the SMOTE training set using all available features
clf_smote = LogisticRegression(max_iter=1500)
clf_smote.fit(X_train_smote, y_train_smote)

# Use the trained classifier to predict the class labels of the testing set
y_pred_smote = clf_smote.predict(X_test)

# Calculate the classification performance metrics such as AUC-Roc, accuracy, precision, recall, and F1-score.
auc_roc_smote = roc_auc_score(y_test, y_pred_smote)
acc_smote = accuracy_score(y_test, y_pred_smote)
prec_smote = precision_score(y_test, y_pred_smote)
rec_smote = recall_score(y_test, y_pred_smote)
f1_smote = f1_score(y_test, y_pred_smote)

# Print the performance metrics of the full feature set after applying SMOTE
print("\nPerformance metrics of the full feature set (with SMOTE):")
print(f"AUC-ROC: {auc_roc_smote}")
print(f"Accuracy: {acc_smote}")
print(f"Precision: {prec_smote}")
print(f"Recall: {rec_smote}")
print(f"F1-score: {f1_smote}")


# Initialize a list to store the performance metrics of each feature subset
perf_list = []

# Loop over all features in the dataset
for i in range(X.shape[1]):
    # Remove one feature at a time from the training set
    X_train_reduced = X_train.drop(X.columns[i], axis=1)

    # Train the classifier using the remaining features
    clf_reduced = LogisticRegression()
    clf_reduced.fit(X_train_reduced, y_train)

    # Use the trained classifier to predict the class labels of the testing set
    y_pred_reduced = clf_reduced.predict(X_test.drop(X.columns[i], axis=1))

    # Calculate the classification performance metrics such as accuracy, precision, recall, and F1-score
    auc_roc_reduced = roc_auc_score(y_test, y_pred_reduced)
    acc_reduced = accuracy_score(y_test, y_pred_reduced)
    prec_reduced = precision_score(y_test, y_pred_reduced)
    rec_reduced = recall_score(y_test, y_pred_reduced)
    f1_reduced = f1_score(y_test, y_pred_reduced)

    # Append the performance metrics of the reduced feature set to the list
    perf_list.append([auc_roc_reduced, acc_reduced, prec_reduced, rec_reduced, f1_reduced])

# Convert the list to a numpy array for easier manipulation
perf_array = np.array(perf_list)

# Identify the feature(s) that resulted in the highest classification performance metric(s) when removed and retain only those features
best_features = []
for j in range(4):
    # Find the index of the maximum value in each column of the performance array
    max_index = np.argmax(perf_array[:,j])

    # Append the corresponding feature name to the best features list
    best_features.append(X.columns[max_index])

# Remove any duplicates from the best features list
best_features = list(set(best_features))

# Print the best features list
print("Best features to retain(for LR):")
print(best_features)
# Train the classifier on the reduced feature set and evaluate its performance on the testing set
# Without SMOTE
X_train_best = X_train[best_features]
X_test_best = X_test[best_features]
clf_best = LogisticRegression()
clf_best.fit(X_train_best, y_train)
y_pred_best = clf_best.predict(X_test_best)
auc_roc_best = roc_auc_score(y_test, y_pred_best)
acc_best = accuracy_score(y_test, y_pred_best)
prec_best = precision_score(y_test, y_pred_best)
rec_best = recall_score(y_test, y_pred_best)
f1_best = f1_score(y_test, y_pred_best)

# With SMOTE
X_train_best_smote, y_train_best_smote = smote.fit_resample(X_train_best, y_train)
clf_best_smote = LogisticRegression()
clf_best_smote.fit(X_train_best_smote, y_train_best_smote)
y_pred_best_smote = clf_best_smote.predict(X_test_best)
auc_roc_best_smote = roc_auc_score(y_test, y_pred_best_smote)
acc_best_smote = accuracy_score(y_test, y_pred_best_smote)
prec_best_smote = precision_score(y_test, y_pred_best_smote)
rec_best_smote = recall_score(y_test, y_pred_best_smote)
f1_best_smote = f1_score(y_test, y_pred_best_smote)

# Print the performance metrics of the reduced feature set without SMOTE.
print("\nPerformance metrics of the reduced feature set (without SMOTE):")
print(f"AUC-ROC: {auc_roc_best}")
print(f"Accuracy: {acc_best}")
print(f"Precision: {prec_best}")
print(f"Recall: {rec_best}")
print(f"F1-score: {f1_best}")


# Print the performance metrics of the reduced feature set with SMOTE.
print("\nPerformance metrics of the reduced feature set (with SMOTE):")
print(f"AUC-ROC: {auc_roc_best_smote}")
print(f"Accuracy: {acc_best_smote}")
print(f"Precision: {prec_best_smote}")
print(f"Recall: {rec_best_smote}")
print(f"F1-score: {f1_best_smote}")



Performance metrics of the full feature set (without SMOTE):
AUC-ROC: 0.7601073596292767
Accuracy: 0.9989817773252344
Precision: 0.8225806451612904
Recall: 0.5204081632653061
F1-score: 0.6375000000000001

Performance metrics of the full feature set (with SMOTE):
AUC-ROC: 0.9391491191299255
Accuracy: 0.9801973245321443
Precision: 0.07296849087893864
Recall: 0.8979591836734694
F1-score: 0.13496932515337423


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best features to retain(for LR):
['Time', 'V12', 'V5']

Performance metrics of the reduced feature set (without SMOTE):
AUC-ROC: 0.7142065781011335
Accuracy: 0.9988588883817282
Precision: 0.8235294117647058
Recall: 0.42857142857142855
F1-score: 0.5637583892617449

Performance metrics of the reduced feature set (with SMOTE):
AUC-ROC: 0.8362614917942417
Accuracy: 0.8662968294652575
Precision: 0.010291818655549765
Recall: 0.8061224489795918
F1-score: 0.020324157447903266
