# Importing required libraries


In [154]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix 
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import GridSearchCV



# Loading the dataset

In [155]:
credit_data = pd.read_csv('creditcard.csv')
#display all rows
#pd.set_option('display.max_rows', None)

# Basic data exploration

In [156]:

print(f"First 5 rows:\n{credit_data.head(5)}\n")


First 5 rows:
   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26       

In [157]:

print(f"Last 5 rows:\n{credit_data.tail()}\n")


Last 5 rows:
            Time         V1         V2        V3        V4        V5  \
284802  172786.0 -11.881118  10.071785 -9.834783 -2.066656 -5.364473   
284803  172787.0  -0.732789  -0.055080  2.035030 -0.738589  0.868229   
284804  172788.0   1.919565  -0.301254 -3.249640 -0.557828  2.630515   
284805  172788.0  -0.240440   0.530483  0.702510  0.689799 -0.377961   
284806  172792.0  -0.533413  -0.189733  0.703337 -0.506271 -0.012546   

              V6        V7        V8        V9  ...       V21       V22  \
284802 -2.606837 -4.918215  7.305334  1.914428  ...  0.213454  0.111864   
284803  1.058415  0.024330  0.294869  0.584800  ...  0.214205  0.924384   
284804  3.031260 -0.296827  0.708417  0.432454  ...  0.232045  0.578229   
284805  0.623708 -0.686180  0.679145  0.392087  ...  0.265245  0.800049   
284806 -0.649617  1.577006 -0.414650  0.486180  ...  0.261057  0.643078   

             V23       V24       V25       V26       V27       V28  Amount  \
284802  1.014480 -0.50934

In [158]:

print(f"Dataset information:\n{credit_data.info()}\n")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [159]:

print(f"Missing values per column:\n{credit_data.isnull().sum()}\n")


Missing values per column:
Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64



In [160]:
print(f"Class distribution:\n{credit_data['Class'].value_counts()}\n")


Class distribution:
0    284315
1       492
Name: Class, dtype: int64



# Note: The target variable 'Class' is imbalanced, with 0 representing legitimate transactions and 1 representing fraudulent transactions.


# Separating data based on class

In [161]:
legit_data = credit_data[credit_data['Class'] == 0]
fraud_data = credit_data[credit_data['Class'] == 1]

# Analyzing statistical measures

In [162]:
print(f"Legitimate transactions - Amount statistics:\n{legit_data['Amount'].describe()}\n")
print(f"Fraudulent transactions - Amount statistics:\n{fraud_data['Amount'].describe()}\n")
print(f"Mean values by class:\n{credit_data.groupby('Class').mean()}\n")

Legitimate transactions - Amount statistics:
count    284315.000000
mean         88.291022
std         250.105092
min           0.000000
25%           5.650000
50%          22.000000
75%          77.050000
max       25691.160000
Name: Amount, dtype: float64

Fraudulent transactions - Amount statistics:
count     492.000000
mean      122.211321
std       256.683288
min         0.000000
25%         1.000000
50%         9.250000
75%       105.890000
max      2125.870000
Name: Amount, dtype: float64

Mean values by class:
               Time        V1        V2        V3        V4        V5  \
Class                                                                   
0      94838.202258  0.008258 -0.006271  0.012171 -0.007860  0.005453   
1      80746.806911 -4.771948  3.623778 -7.033281  4.542029 -3.151225   

             V6        V7        V8        V9  ...       V20       V21  \
Class                                          ...                       
0      0.002419  0.009637 -0.000987


# Under-sampling legitimate transactions to match fraudulent transactions


In [163]:

num_fraud = fraud_data.shape[0]
legit_sample = legit_data.sample(n=num_fraud, random_state=42)
new_dataset = pd.concat([legit_sample, fraud_data], axis=0)



# Checking the new dataset


In [164]:
print(f"New dataset head:\n{new_dataset.head()}\n")
print(f"New dataset tail:\n{new_dataset.tail()}\n")
print(f"New dataset class distribution:\n{new_dataset['Class'].value_counts()}\n")
print(f"New dataset mean values by class:\n{new_dataset.groupby('Class').mean()}\n")


New dataset head:
            Time        V1        V2        V3        V4        V5        V6  \
138028   82450.0  1.314539  0.590643 -0.666593  0.716564  0.301978 -1.125467   
63099    50554.0 -0.798672  1.185093  0.904547  0.694584  0.219041 -0.319295   
73411    55125.0 -0.391128 -0.245540  1.122074 -1.308725 -0.639891  0.008678   
164247  116572.0 -0.060302  1.065093 -0.987421 -0.029567  0.176376 -1.348539   
148999   90434.0  1.848433  0.373364  0.269272  3.866438  0.088062  0.970447   

              V7        V8        V9  ...       V21       V22       V23  \
138028  0.388881 -0.288390 -0.132137  ... -0.170307 -0.429655 -0.141341   
63099   0.495236  0.139269 -0.760214  ...  0.202287  0.578699 -0.092245   
73411  -0.701304 -0.027315 -2.628854  ... -0.133485  0.117403 -0.191748   
164247  0.775644  0.134843 -0.149734  ...  0.355576  0.907570 -0.018454   
148999 -0.721945  0.235983  0.683491  ...  0.103563  0.620954  0.197077   

             V24       V25       V26       V27    


# Feature Scaling


In [165]:

scaler = StandardScaler()
X = new_dataset.drop('Class', axis=1)
X_scaled = scaler.fit_transform(X)



# Feature Selection


In [166]:

selector = SelectKBest(f_classif, k=10)
X_selected = selector.fit_transform(X_scaled, new_dataset['Class'])



# Splitting data into features and target


In [167]:

y = new_dataset['Class']



# Splitting data into train and test sets


In [168]:

X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, stratify=y, random_state=42)
print(f"Original dataset shape: {X_selected.shape}")
print(f"Training set shape: {X_train.shape}, Test set shape: {X_test.shape}")


Original dataset shape: (984, 10)
Training set shape: (787, 10), Test set shape: (197, 10)



# Hyperparameter Tuning


In [169]:

param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5, scoring='roc_auc')
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_



# Evaluating the model


In [170]:

y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Training accuracy: {train_accuracy:.4f}")
print(f"Test accuracy: {test_accuracy:.4f}")

print("Classification Report:")
print(classification_report(y_test, y_test_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_test_pred))

Training accuracy: 0.9365
Test accuracy: 0.9594
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.99      0.96        99
           1       0.99      0.93      0.96        98

    accuracy                           0.96       197
   macro avg       0.96      0.96      0.96       197
weighted avg       0.96      0.96      0.96       197

Confusion Matrix:
[[98  1]
 [ 7 91]]


<!-- README.md -->
<style>
h1 {
  color: #333;
  font-family: Arial, sans-serif;
  text-align: center;
}

h2 {
  color: #666;
  font-family: Arial, sans-serif;
}

code {
  background-color: #f5f5f5;
  padding: 2px 4px;
  font-family: Consolas, monospace;
  border-radius: 4px;
}

p {
  font-family: Arial, sans-serif;
  line-height: 1.5;
}

.highlighted {
  color: #e83e8c;
  font-weight: bold;
}
</style>

# Predictive Modeling for Credit Card Fraud Detection

This project focuses on building a machine learning model to detect fraudulent credit card transactions. The dataset used is highly imbalanced, with a significantly higher number of legitimate transactions compared to fraudulent ones.

## Data Preprocessing

The code starts by loading the dataset and performing exploratory data analysis (EDA) to understand the data distribution, missing values, and class imbalance. To address the imbalance, the legitimate transactions are undersampled to match the number of fraudulent transactions, creating a balanced dataset.

## Feature Engineering

The code then applies feature scaling using <code class="highlighted">StandardScaler</code> to ensure that all features are on a similar scale. Additionally, feature selection is performed using <code class="highlighted">SelectKBest</code> and <code class="highlighted">f_classif</code> to identify the top 10 most relevant features for the classification task.

## Model Training and Evaluation

The data is split into training and testing sets, and a Logistic Regression model is trained. To find the optimal hyperparameters, a <code class="highlighted">GridSearchCV</code> is employed with a range of regularization strengths (<code class="highlighted">C</code> values) for the Logistic Regression model.

The trained model is then evaluated on both the training and testing sets using various metrics, including accuracy, classification report, and confusion matrix. The classification report provides valuable insights into the model's performance for each class, including precision, recall, and F1-score. The confusion matrix helps identify the number of true positives, true negatives, false positives, and false negatives.

## Conclusion

The provided code demonstrates a complete pipeline for credit card fraud detection, from data exploration and preprocessing to model training, hyperparameter tuning, and evaluation. By addressing the class imbalance, performing feature engineering, and leveraging appropriate evaluation metrics, this approach aims to build a robust and reliable model for identifying fraudulent transactions.





<div style="background-color: #f0f0f0; padding: 20px; border-radius: 5px; box-shadow: 0px 0px 10px rgba(0, 0, 0, 0.1); text-align: center;">
  <h2 style="color: #333;">Thank you for visiting!</h2>
  <p style="color: #666;">Your feedback and contributions are highly appreciated.</p>
</div>
