In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, precision_score, recall_score
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv("creditcard.csv")

In [4]:
df['Class'].value_counts()

Class
0    284315
1       492
Name: count, dtype: int64

In [5]:
# Define the feature columns and target column (assuming 'class' is the target variable)
X = df.drop('Class', axis=1)  # Features (drop the target column)
y = df['Class']  # Target variable (fraud labels)

In [6]:
# Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# Initialize the Logistic Regression model
model = LogisticRegression(max_iter=200, random_state=42)

In [8]:
# Train the model
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [9]:
# Predict on the test data
y_pred = model.predict(X_test)

In [10]:
# Generate classification report as a dictionary
report = classification_report(y_test, y_pred, output_dict=True)

# Extract precision and recall for the '1' class (fraudulent transactions)
precision = report['1']['precision']
recall = report['1']['recall']

# Store results in a DataFrame
results_df = pd.DataFrame({"Precision": [precision], "Recall": [recall]}, index=["Logistic Regression (Fraud Class)"])

# Print the results DataFrame
print(results_df)

                                   Precision    Recall
Logistic Regression (Fraud Class)   0.658228  0.530612


In [11]:
results_df

Unnamed: 0,Precision,Recall
Logistic Regression (Fraud Class),0.658228,0.530612


In [12]:
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_res, y_res = smote.fit_resample(X, y)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

In [14]:
log_reg = LogisticRegression(max_iter=200, random_state=42)
log_reg.fit(X_train, y_train)

In [15]:
# Predict on the test set
y_pred_after_balancing = model.predict(X_test)

In [16]:
# Compute Precision & Recall
precision = precision_score(y_test, y_pred_after_balancing)
recall = recall_score(y_test, y_pred_after_balancing)

# Store results
results_df.loc["Logistic Regression (Balanced)"] = [precision, recall]

# Print results
print(results_df)

                                   Precision    Recall
Logistic Regression (Fraud Class)   0.658228  0.530612
Logistic Regression (Balanced)      0.999260  0.663823


In [17]:
from sklearn.metrics import confusion_matrix

In [18]:
# Assuming you have actual and predicted values
cm = confusion_matrix(y_test, y_pred_after_balancing)

TN, FP, FN, TP = cm.ravel()

print(f"False Positives (FP): {FP}")
print(f"False Negatives (FN): {FN}")

False Positives (FP): 28
False Negatives (FN): 19154


In [19]:
# Define costs
FP_cost = 5
FN_cost = 50

# Compute total cost
total_cost = (FP * FP_cost) + (FN * FN_cost)

print(f"Total Cost of Misclassification: £{total_cost}")

Total Cost of Misclassification: £957840


In [20]:
y_pred_prob = model.predict_proba(X_test)[:, 1]  # Get probability scores
threshold = 0.3  # Adjust threshold
y_pred_adjusted = (y_pred_prob >= threshold).astype(int)

# Recompute confusion matrix with new threshold
cm_adjusted = confusion_matrix(y_test, y_pred_adjusted)
TN, FP, FN, TP = cm_adjusted.ravel()

# Recalculate cost
total_cost_adjusted = (FP * FP_cost) + (FN * FN_cost)
print(f"Total Cost with Threshold {threshold}: £{total_cost_adjusted}")

Total Cost with Threshold 0.3: £841260


In [21]:
import time

In [22]:
start_pred_time = time.time()
y_pred_after_balancing = log_reg.predict(X_test)
end_pred_time = time.time()

pred_time = end_pred_time - start_pred_time
print(f"Prediction Time: {pred_time:.4f} seconds")

Prediction Time: 0.0457 seconds
