In [151]:
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt

In [152]:
df = pd.read_csv("creditcard.csv")

In [174]:
df['Class'].value_counts()

Class
0    284315
1       492
Name: count, dtype: int64

In [175]:
# Scaling the features
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [176]:
# Define the feature columns and target column (assuming 'class' is the target variable)
X = df.drop('Class', axis=1)  # Features (drop the target column)
y = df['Class']  # Target variable (fraud labels)

In [177]:
# Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [178]:
# Initialize the Logistic Regression model
model = LogisticRegression(max_iter=200, random_state=42)

In [179]:
# Train the model
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [180]:
# Predict on the test data
y_pred = model.predict(X_test)

In [182]:
# Generate classification report as a dictionary
report = classification_report(y_test, y_pred, output_dict=True)

# Extract precision and recall for the '1' class (fraudulent transactions)
precision = report['1']['precision']
recall = report['1']['recall']

# Store results in a DataFrame
results_df = pd.DataFrame({"Precision": [precision], "Recall": [recall]}, index=["Logistic Regression (Fraud Class)"])

# Print the results DataFrame
print(results_df)

                                   Precision    Recall
Logistic Regression (Fraud Class)   0.658228  0.530612


In [183]:
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_res, y_res = smote.fit_resample(X, y)

In [184]:
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

In [186]:
log_reg = LogisticRegression(max_iter=200, random_state=42)
log_reg.fit(X_train, y_train)

In [189]:
# Predict on the test set
y_pred_after_balancing = model.predict(X_test)

In [190]:
# Generate classification report as a dictionary
report_post_balancing = classification_report(y_test, y_pred_after_balancing, output_dict=True)

# Extract precision and recall for the class '1' (fraudulent transactions)
precision = report_post_balancing['1']['precision']
recall = report_post_balancing['1']['recall']

# Store results in a DataFrame
results_after_data_balancing_df = pd.DataFrame({"Precision": [precision], "Recall": [recall]}, index=["Logistic Regression (Fraud Class After Data Balancing)"])

# Print the results DataFrame
print(results_after_data_balancing_df)

                                                    Precision    Recall
Logistic Regression (Fraud Class After Data Bal...    0.99926  0.663823


In [None]:
# Define hyperparameters
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear'],  # Use 'liblinear' instead of 'saga'
    'class_weight': ['balanced', None]
}

# Initialize Logistic Regression
log_reg = LogisticRegression(max_iter=200)

# Apply GridSearchCV with 5-fold cross-validation and scoring based on recall
# Set n_jobs=-1 for parallel execution
grid_search = GridSearchCV(log_reg, param_grid, cv=5, scoring='recall', n_jobs=-1)

# Fit the model to the training data
grid_search.fit(X_train, y_train)

# Output the best hyperparameters found
print("Best Hyperparameters:", grid_search.best_params_)