# 1. Import Libraries


In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns


# 2. Load and Explore Dataset


In [6]:
# Load the dataset
data_path = 'creditcard.csv'  # Update this path with the actual path to your dataset
df = pd.read_csv(data_path)

# Print the first 5 rows of the dataset
print("First 5 rows of the dataset:")
print(df.head())

# Check the distribution of the target variable
print("Distribution of target variable:")
print(df['Class'].value_counts())


First 5 rows of the dataset:
   Time        V1        V2        V3        V4        V5        V6        V7  \
0     0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1     0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2     1 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3     1 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4     2 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

   

# 3. Pre-process and Normalize the Data


In [9]:
# Check for missing values
print("Missing values in the dataset:")
print(df.isnull().sum())

# Drop rows with missing values in the target variable
df = df.dropna(subset=['Class'])

# Fill or drop missing values in the features as required
# Example: Filling missing values with the mean of the column
df.fillna(df.mean(), inplace=True)

# Separate features and target
X = df.drop('Class', axis=1)
y = df['Class']

# Normalize the feature data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


Missing values in the dataset:
Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       1
V20       1
V21       1
V22       1
V23       1
V24       1
V25       1
V26       1
V27       1
V28       1
Amount    1
Class     1
dtype: int64


# 4. Handle Class Imbalance


In [10]:
# Handle class imbalance using SMOTE
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_scaled, y)

# Check the distribution after resampling
print("Distribution after resampling:")
print(pd.Series(y_res).value_counts())


Distribution after resampling:
Class
0.0    96919
1.0    96919
Name: count, dtype: int64


# 5. Split the Dataset


In [11]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)


## Logistic Regression


In [16]:
# Train a Logistic Regression model
logreg = LogisticRegression(random_state=42)
logreg.fit(X_train, y_train)

# Predict on the test set
y_pred_logreg = logreg.predict(X_test)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Random Forest Classifier


In [15]:
# Train a Random Forest model
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Predict on the test set
y_pred_rf = rf.predict(X_test)


## Logistic Regression Evaluation


In [13]:
# Evaluate Logistic Regression model
print("Logistic Regression:")
print("Precision:", precision_score(y_test, y_pred_logreg))
print("Recall:", recall_score(y_test, y_pred_logreg))
print("F1 Score:", f1_score(y_test, y_pred_logreg))
print("Classification Report:\n", classification_report(y_test, y_pred_logreg))


Logistic Regression:
Precision: 0.9775705913692062
Recall: 0.9485628618693135
F1 Score: 0.9628482972136224
Classification Report:
               precision    recall  f1-score   support

         0.0       0.95      0.98      0.96     19424
         1.0       0.98      0.95      0.96     19344

    accuracy                           0.96     38768
   macro avg       0.96      0.96      0.96     38768
weighted avg       0.96      0.96      0.96     38768



## Random Forest Evaluation


In [17]:
# Evaluate Random Forest model
print("Random Forest:")
print("Precision:", precision_score(y_test, y_pred_rf))
print("Recall:", recall_score(y_test, y_pred_rf))
print("F1 Score:", f1_score(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))


Random Forest:
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
Classification Report:
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     19424
         1.0       1.00      1.00      1.00     19344

    accuracy                           1.00     38768
   macro avg       1.00      1.00      1.00     38768
weighted avg       1.00      1.00      1.00     38768

