# Credit card data

### About the dataset


This dataset can be found in the following [link](https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud)

This dataset has information about the credit card fraud



In [56]:
# Credit card classification

# import statement
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder


def load_data(file_name):
    # load dataset
    dataset = pd.read_csv(file_name)
    
    # replace the cells with single and extra spaces with a Nan value
    dataset.replace(r'^\s*$', np.nan, regex=True, inplace = True)
    
    # convert the dataset to dataframe
    dataset = pd.DataFrame(dataset)
    
    return dataset

dataset = load_data('creditcard.csv')

# print the shape of the dataset
print(dataset.shape)

# print the datatypes of the dataset
print(dataset.dtypes)

# head and tail of the dataset
print(dataset.head(2))
print(dataset.tail(2))


(284807, 31)
Time      float64
V1        float64
V2        float64
V3        float64
V4        float64
V5        float64
V6        float64
V7        float64
V8        float64
V9        float64
V10       float64
V11       float64
V12       float64
V13       float64
V14       float64
V15       float64
V16       float64
V17       float64
V18       float64
V19       float64
V20       float64
V21       float64
V22       float64
V23       float64
V24       float64
V25       float64
V26       float64
V27       float64
V28       float64
Amount    float64
Class       int64
dtype: object
   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102

## Check for the missing values

we will check for the missing values in the dataset.

- If there are any missing values, we will replace them with the mean of the column

In [57]:
dataset.isnull().sum()  # There are no missing values in the dataset

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

## Accounting for the bias of the dataset

We will check for the bias in the dataset and try to remove it.

- In this dataset there is a bias of 99.8% of non-fraudulent transactions and 0.2% of fraudulent transactions.
- We will try to remove this bias by using the following methods:
    - Keep all the fraudulent transactions and randomly select the roughly twice of the number of fraudulent transactions from the non-fraudulent transactions.


In [58]:
# check for the bias in the dataset
dataset['Class'].value_counts()  # The dataset is biased

# create a new dataframe with all the rows with class 1 and 
# 1% of the rows with class 0
df1 = dataset[dataset['Class'] == 1]
df0 = dataset[dataset['Class'] == 0].sample(frac = 0.01)

# concatenate the two dataframes
dataset = pd.concat([df1, df0], axis = 0)

counts = dataset['Class'].value_counts()

print(counts)


Class
0    2843
1     492
Name: count, dtype: int64


## Scaling the data 

We will scale the data using the `minmax_scaler` from `sklearn.preprocessing`

In [59]:
def scale_data(dataset):
    # get the column names of each column without the last column
    columns = dataset.columns[:-1]
    # for each column, get the max and min values
    for column in columns:
        max_value = dataset[column].max()
        min_value = dataset[column].min()
        dataset[column] = (dataset[column] - min_value)/(max_value - min_value)
        
    return dataset

dataset = scale_data(dataset)

# print the head of the dataset
print(dataset.head(2))


         Time        V1        V2        V3        V4        V5        V6  \
541  0.002145  0.874410  0.607334  0.845728  0.495592  0.650089  0.252074   
623  0.002528  0.854614  0.507549  0.923101  0.389371  0.706774  0.270385   

          V7        V8        V9  ...       V21       V22       V23       V24  \
541  0.61194  0.695084  0.498968  ...  0.466293  0.513187  0.609497  0.484688   
623  0.65465  0.671179  0.615903  ...  0.469182  0.540466  0.669223  0.377722   

          V25       V26       V27       V28    Amount  Class  
541  0.603535  0.361456  0.694520  0.183784  0.000000      1  
623  0.632958  0.284087  0.647086  0.193047  0.124519      1  

[2 rows x 31 columns]


## Divide the data into dependent and independent variables

We will divide the data into dependent and independent variables

- The dependent variable is `Class`
- The independent variables are the rest of the columns

In [60]:
# function to divide the dataset into dependent and independent variables
def divide_dataset(dataset):
    # divide the dataset into x and y
    dataset_columns_length = len(dataset.columns)
    print(dataset_columns_length)

    x = dataset.iloc[:, 0:(dataset_columns_length-1)].values
    y = dataset.iloc[:, (dataset_columns_length-1)].values
    
    return x, y

x, y = divide_dataset(dataset)

# print the shape of x and y
print(x.shape)
print(y.shape)
print(x[0])
print(y[0])

31
(3335, 30)
(3335,)
[0.0021454  0.87441016 0.60733428 0.8457275  0.49559176 0.65008857
 0.25207422 0.61194017 0.69508431 0.49896826 0.59968213 0.42903751
 0.73290202 0.3979715  0.62048809 0.54682354 0.67822092 0.70003809
 0.71351337 0.46001643 0.69868558 0.46629259 0.51318723 0.60949692
 0.484688   0.60353476 0.36145596 0.69452029 0.18378381 0.        ]
1


## Split the data into train and test sets

We will split the data into train and test sets using the `train_test_split` from `sklearn.model_selection`

- The train set will have 70% of the data.

In [61]:
# import train_test_split
from sklearn.model_selection import train_test_split

# funciton to split the dataset into training and testing set
def split_dataset(x, y):
    # split the dataset into training and testing set
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3)
    
    return x_train, x_test, y_train, y_test

x_train, x_test, y_train, y_test = split_dataset(x, y)

# print the shape of x_train, x_test, y_train, y_test
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)
print(x_train.dtype)
print(x_test.dtype)
print(y_train.dtype)
print(y_test.dtype)

(2334, 30)
(1001, 30)
(2334,)
(1001,)
float64
float64
int64
int64


### Model

- import the custom model from `custom_logistic_regression.py`

In [62]:
from custom_logistic_regression import CustomLogisticRegression

# create an object of CustomLogisticRegression
classifier = CustomLogisticRegression(learning_rate=0.01, early_stopping_threshold=0.01, verbose=True, num_features=20)

classifier.fit(x_train, y_train)

# predict the values
y_pred = classifier.predict(x_test)

# print the accuracy
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

loss: 0.6817225298714803 	


[[855   0]
 [ 68  78]]
              precision    recall  f1-score   support

           0       0.93      1.00      0.96       855
           1       1.00      0.53      0.70       146

    accuracy                           0.93      1001
   macro avg       0.96      0.77      0.83      1001
weighted avg       0.94      0.93      0.92      1001

0.932067932067932
