In [2]:

import pandas as pd

# load dataset
fraud_df = pd.read_csv("/content/sample_data/creditcard.csv")

In [4]:
print(f"Dataset Shape :- \n {fraud_df.shape}")

Dataset Shape :- 
 (21878, 31)


In [5]:
print(f"Columns or Feature names :- \n {fraud_df.columns}")

Columns or Feature names :- 
 Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Class'],
      dtype='object')


In [6]:
print(f"Unique values of target variable :- \n {fraud_df['Class'].unique()}")

Unique values of target variable :- 
 [ 0.  1. nan]


In [7]:
print(f"Number of samples under each target value :- \n {fraud_df['Class'].value_counts()}")

Number of samples under each target value :- 
 Class
0.0    21791
1.0       86
Name: count, dtype: int64


In [8]:
fraud_df = fraud_df.drop(['Time'], axis=1)
print(f"list of feature names after removing Time column :- \n{fraud_df.columns}")

list of feature names after removing Time column :- 
Index(['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11',
       'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21',
       'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount', 'Class'],
      dtype='object')


In [9]:

print(f"Dataset info :- \n {fraud_df.info()}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21878 entries, 0 to 21877
Data columns (total 30 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   V1      21878 non-null  float64
 1   V2      21878 non-null  float64
 2   V3      21878 non-null  float64
 3   V4      21878 non-null  float64
 4   V5      21878 non-null  float64
 5   V6      21878 non-null  float64
 6   V7      21878 non-null  float64
 7   V8      21878 non-null  float64
 8   V9      21878 non-null  float64
 9   V10     21878 non-null  float64
 10  V11     21878 non-null  float64
 11  V12     21878 non-null  float64
 12  V13     21878 non-null  float64
 13  V14     21878 non-null  float64
 14  V15     21878 non-null  float64
 15  V16     21878 non-null  float64
 16  V17     21878 non-null  float64
 17  V18     21878 non-null  float64
 18  V19     21878 non-null  float64
 19  V20     21878 non-null  float64
 20  V21     21878 non-null  float64
 21  V22     21878 non-null  float64
 22

In [10]:
print(f"few values of Amount column :- \n {fraud_df['Amount'][0:4]}")

few values of Amount column :- 
 0    149.62
1      2.69
2    378.66
3    123.50
Name: Amount, dtype: float64


In [11]:
from sklearn.preprocessing import StandardScaler
fraud_df['norm_amount'] = StandardScaler().fit_transform(
fraud_df['Amount'].values.reshape(-1,1))
fraud_df = fraud_df.drop(['Amount'], axis=1)
print(f"few values of Amount column after applying StandardScaler:- \n {fraud_df['norm_amount'][0:4]}")

few values of Amount column after applying StandardScaler:- 
 0    0.379919
1   -0.336709
2    1.497025
3    0.252523
Name: norm_amount, dtype: float64


In [12]:
X = fraud_df.drop(['Class'], axis=1)
y = fraud_df[['Class']]

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(15314, 29)
(6564, 29)
(15314, 1)
(6564, 1)


In [19]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer # Import SimpleImputer to handle NaNs

def decision_tree_classification(X_train, y_train, X_test, y_test):
    # initialize object for DecisionTreeClassifier class
    dt_classifier = DecisionTreeClassifier()

    # Handle missing values using SimpleImputer
    imputer = SimpleImputer(strategy='mean') # Replace NaNs with mean of column
    X_train = imputer.fit_transform(X_train)
    X_test = imputer.transform(X_test)

    # Handle missing values in y_train and y_test using a strategy suitable for classification
    imputer_y = SimpleImputer(strategy='most_frequent') # Replace NaNs with most frequent class
    y_train = imputer_y.fit_transform(y_train)
    y_test = imputer_y.transform(y_test)

    # train model by using fit method
    print("Model training starts........")
    dt_classifier.fit(X_train, y_train.ravel()) # Use ravel() to flatten y_train
    print("Model training completed")
    acc_score = dt_classifier.score(X_test, y_test)
    print(f'Accuracy of model on test dataset :- {acc_score}')
    # predict result using test dataset
    y_pred = dt_classifier.predict(X_test)
    # confusion matrix
    print(f"Confusion Matrix :- \n {confusion_matrix(y_test, y_pred)}")
    # classification report for f1-score
    print(f"Classification Report :- \n {classification_report(y_test, y_pred)}")



# calling decision_tree_classification method to train and evaluate model
decision_tree_classification(X_train, y_train, X_test, y_test)

Model training starts........
Model training completed
Accuracy of model on test dataset :- 0.9986288848263254
Confusion Matrix :- 
 [[6531    4]
 [   5   24]]
Classification Report :- 
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      6535
         1.0       0.86      0.83      0.84        29

    accuracy                           1.00      6564
   macro avg       0.93      0.91      0.92      6564
weighted avg       1.00      1.00      1.00      6564



In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer # Import SimpleImputer

def random_forest_classifier(X_train, y_train, X_test, y_test):
     # initialize object for DecisionTreeClassifier class
     rf_classifier = RandomForestClassifier(n_estimators=50)

     # Handle missing values using SimpleImputer
     imputer = SimpleImputer(strategy='mean') # Replace NaNs with mean of column
     X_train = imputer.fit_transform(X_train)
     X_test = imputer.transform(X_test)

     # Handle missing values in y_train and y_test
     imputer_y = SimpleImputer(strategy='most_frequent') # Use most frequent for classification target
     y_train = imputer_y.fit_transform(y_train)
     y_test = imputer_y.transform(y_test)

     # train model by using fit method
     print("Model training starts........")
     rf_classifier.fit(X_train, y_train.ravel()) # Remove .values since y_train is already a NumPy array
     acc_score = rf_classifier.score(X_test, y_test)
     print(f'Accuracy of model on test dataset :- {acc_score}')
     # predict result using test dataset
     y_pred = rf_classifier.predict(X_test)
     # confusion matrix
     print(f"Confusion Matrix :- \n {confusion_matrix(y_test, y_pred)}")
     # classification report for f1-score
     print(f"Classification Report :- \n {classification_report(y_test, y_pred)}")


# calling random_forest_classifier
random_forest_classifier(X_train, y_train, X_test, y_test)

Model training starts........
Accuracy of model on test dataset :- 0.9995429616087751
Confusion Matrix :- 
 [[6535    0]
 [   3   26]]
Classification Report :- 
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      6535
         1.0       1.00      0.90      0.95        29

    accuracy                           1.00      6564
   macro avg       1.00      0.95      0.97      6564
weighted avg       1.00      1.00      1.00      6564



In [24]:
print(f"Number of samples under each target value :- \n {fraud_df['Class'].value_counts()}")

Number of samples under each target value :- 
 Class
0.0    21791
1.0       86
Name: count, dtype: int64


In [26]:
class_val = fraud_df['Class'].value_counts().to_frame()
print(f"Number of samples for each class :- \n {class_val}")
non_fraud = class_val.iloc[0,0]  # Access the first row and first column (count for class 0)
fraud = class_val.iloc[1,0]   # Access the second row and first column (count for class 1)
print(f"Non Fraudulent Numbers :- {non_fraud}")
print(f"Fraudulent Numbers :- {fraud}")

Number of samples for each class :- 
        count
Class       
0.0    21791
1.0       86
Non Fraudulent Numbers :- 21791
Fraudulent Numbers :- 86


In [28]:
import numpy as np  # Import NumPy library

nonfraud_indexies = fraud_df[fraud_df.Class == 0].index
fraud_indices = np.array(fraud_df[fraud_df['Class'] == 1].index)  # Now np.array is recognized
# take random samples from non fraudulent that are equal to fraudulent samples
random_normal_indexies = np.random.choice(nonfraud_indexies, fraud, replace=False)
random_normal_indexies = np.array(random_normal_indexies)

In [29]:
nonfraud_indexies = fraud_df[fraud_df.Class == 0].index
fraud_indices = np.array(fraud_df[fraud_df['Class'] == 1].index)
# take random samples from non fraudulent that are equal to fraudulent samples
random_normal_indexies = np.random.choice(nonfraud_indexies, fraud, replace=False)
random_normal_indexies = np.array(random_normal_indexies)


## Undersampling techniques

# concatenate both indices of fraud and non fraud
under_sample_indices = np.concatenate([fraud_indices, random_normal_indexies])

#extract all features from whole data for under sample indices only
under_sample_data = fraud_df.iloc[under_sample_indices, :]

# now we have to divide under sampling data to all features & target
x_undersample_data = under_sample_data.drop(['Class'], axis=1)
y_undersample_data = under_sample_data[['Class']]
# now split dataset to train and test datasets as before
X_train_sample, X_test_sample, y_train_sample, y_test_sample = train_test_split(
x_undersample_data, y_undersample_data, test_size=0.2, random_state=0)

In [30]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

def decision_tree_classification(X_train, y_train, X_test, y_test):
 # initialize object for DecisionTreeClassifier class
 dt_classifier = DecisionTreeClassifier()
 # train model by using fit method
 print("Model training start........")
 dt_classifier.fit(X_train, y_train.values.ravel())
 print("Model training completed")
 acc_score = dt_classifier.score(X_test, y_test)
 print(f'Accuracy of model on test dataset :- {acc_score}')
 # predict result using test dataset
 y_pred = dt_classifier.predict(X_test)
 # confusion matrix
 print(f"Confusion Matrix :- \n {confusion_matrix(y_test, y_pred)}")
 # classification report for f1-score
 print(f"Classification Report :- \n {classification_report(y_test, y_pred)}")
 print(f"AROC score :- \n {roc_auc_score(y_test, y_pred)}")

# calling decision tree classifier function
decision_tree_classification(X_train_sample, y_train_sample,
X_test_sample, y_test_sample)

Model training start........
Model training completed
Accuracy of model on test dataset :- 0.9714285714285714
Confusion Matrix :- 
 [[17  1]
 [ 0 17]]
Classification Report :- 
               precision    recall  f1-score   support

         0.0       1.00      0.94      0.97        18
         1.0       0.94      1.00      0.97        17

    accuracy                           0.97        35
   macro avg       0.97      0.97      0.97        35
weighted avg       0.97      0.97      0.97        35

AROC score :- 
 0.9722222222222222
