In [None]:
!mkdir ~/ .kaggle
!cp kaggle.json
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d mlg-ulb/creditcardfraud
!unzip creditcardfraud.zip


In [None]:
import pandas as pd
df = pd.read_csv('creditcard.csv')
df.head()

In [None]:
df['Class'].value_counts()

In [None]:
df.hist(bins=30, figsize=(30,30))

In [None]:
df.describe()

In [None]:
from sklearn.preprocessing import RobustScaler
new_df = df.copy() #  copy the old df here
new_df['Amount'] = RobustScaler().fit_transform(new_df['Amount'].to_numpy().reshape(-1,1))
new_df['Amount'].hist()
time = new_df['Time']
new_df['Time'] = (time - time.min()) / (time.max() - time.min())
new_df


In [None]:
new_df['Amount'].describe

In [None]:
new_df = new_df.sample(frac=1, random_state=1)
new_df

In [None]:
train_np, test_np, val_np = new_df[:240000], new_df[240000:262000], new_df[262000:]
train_np['Class'].value_counts(), test_np['Class'].value_counts(), val_np['Class'].value_counts()


In [None]:
import pandas as pd
train, test, val = train_np, test_np, val_np
train.shape, test.shape, val.shape

In [None]:
x_train, y_train = train_np.iloc[:, :-1], train_np.iloc[:, -1]
x_test, y_test = test_np.iloc[:, :-1], test_np.iloc[:, -1]
x_val, y_val = val_np.iloc[:, :-1], val_np.iloc[:, -1]
x_train.shape, y_train.shape, x_test.shape, y_test.shape, x_val.shape, y_val.shape


In [None]:
from sklearn.linear_model import LogisticRegression
logistic_model = LogisticRegression()
logistic_model.fit(x_train, y_train)
logistic_model.score(x_train, y_train)

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_val, logistic_model.predict(x_val), target_names=['Not Fraud', 'Fraud']))


In [63]:
#fraud is the positive class
#not fraud is negative class

                 #Predicted Fraud (+)   Predicted Not Fraud (-)
#Fraud(+)        TP                     FN
#Not Fraud(-)    FP                     TN



Precision measures the proportion of correctly identified fraud cases (true positives) out of all the cases predicted as fraud (true positives + false positives).
It answers the question: "Of all the transactions the model flagged as fraud, how many were actually fraudulent?"

Recall measures the proportion of correctly identified fraud cases (true positives) out of all the actual fraud cases (true positives + false negatives).
This answers the question: "Of all the actual fraudulent transactions, how many did the model correctly identify?"

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import InputLayer, Dense, BatchNormalization
from tensorflow.keras.callbacks import ModelCheckpoint


shallow_nn = Sequential()
# The InputLayer expects a shape tuple, not just the number of features.
# Assuming you want a variable batch size, use (None, x_train.shape[1])
#x_train.shape[1], is basically the number of columns that x_train has
shallow_nn.add(InputLayer(input_shape=(x_train.shape[1],)))  # Fixed: Providing input_shape as a tuple
shallow_nn.add(Dense(2, 'relu'))
shallow_nn.add(BatchNormalization())
shallow_nn.add(Dense(1, activation='sigmoid')) # Outputs a sigmoid which is a probability btw 1 and 0.
#If it's 1 than 100% confident that its fraud

checkpoint = ModelCheckpoint('shallow_nn.keras', save_best_only=True) #Added .keras extension to the filepath
shallow_nn.compile(optimizer='adam', loss='binary_crossentrophy', metrics=['accuracy'])
shallow_nn
shallow_nn.summary

In [None]:
shallow_nn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [None]:
shallow_nn.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=5, callbacks=[checkpoint])

Epoch 1/5
[1m7500/7500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2ms/step - accuracy: 0.9511 - loss: 0.1718 - val_accuracy: 0.9989 - val_loss: 0.0102
Epoch 2/5
[1m7500/7500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2ms/step - accuracy: 0.9993 - loss: 0.0035 - val_accuracy: 0.9989 - val_loss: 0.0134
Epoch 3/5
[1m7500/7500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 2ms/step - accuracy: 0.9994 - loss: 0.0033 - val_accuracy: 0.9988 - val_loss: 0.0122
Epoch 4/5
[1m7500/7500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 2ms/step - accuracy: 0.9993 - loss: 0.0037 - val_accuracy: 0.9988 - val_loss: 0.0127
Epoch 5/5
[1m7500/7500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 1ms/step - accuracy: 0.9994 - loss: 0.0030 - val_accuracy: 0.9987 - val_loss: 0.0110


<keras.src.callbacks.history.History at 0x7d09a227e090>

In [None]:
def neural_net_predictions(model, x): # x is the input data
  return(shallow_nn.predict(x).flatten() > 0.5).astype(int)

neural_net_predictions(shallow_nn, x_val)

In [53]:
print(classification_report(y_val, neural_net_predictions(shallow_nn, x_val), target_names=['Not Fraud', 'Fraud']))

[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 711us/step
              precision    recall  f1-score   support

   Not Fraud       1.00      1.00      1.00     22771
       Fraud       0.56      0.75      0.64        36

    accuracy                           1.00     22807
   macro avg       0.78      0.87      0.82     22807
weighted avg       1.00      1.00      1.00     22807



if we go by f-1 which is a balance between precision and recall.

In [54]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(max_depth=2, n_jobs=-1)
rf.fit(x_train,y_train)
print(classification_report(y_val,rf.predict(x_val), target_names=['Not Fraud', 'Fraud']))

              precision    recall  f1-score   support

   Not Fraud       1.00      1.00      1.00     22771
       Fraud       0.80      0.44      0.57        36

    accuracy                           1.00     22807
   macro avg       0.90      0.72      0.79     22807
weighted avg       1.00      1.00      1.00     22807



In [None]:
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier(n_estimators=50, max_depth=1, random_state=0)
gbc.fit(x_train, y_train)
print(classification_report(y_val, gbc.predict(x_val), target_names=['Not Fraud', 'Fraud']))

In [56]:
from sklearn.svm import LinearSVC
svc = LinearSVC(class_weight='balanced')
svc.fit(x_train, y_train)
print(classification_report(y_val, svc.predict(x_val), target_names=['Not Fraud', 'Fraud']))

              precision    recall  f1-score   support

   Not Fraud       1.00      0.98      0.99     22771
       Fraud       0.07      0.97      0.14        36

    accuracy                           0.98     22807
   macro avg       0.54      0.98      0.56     22807
weighted avg       1.00      0.98      0.99     22807



In [57]:
new_df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
169876,0.693938,-0.611712,-0.769705,-0.149759,-0.224877,2.028577,-2.019887,0.292491,-0.52302,0.358468,...,-0.075208,0.045536,0.380739,0.02344,-2.220686,-0.201146,0.066501,0.22118,-0.282401,0
127467,0.453377,-0.814682,1.319219,1.329415,0.027273,-0.284871,-0.653985,0.321552,0.435975,-0.704298,...,-0.128619,-0.368565,0.09066,0.401147,-0.261034,0.080621,0.162427,0.059456,-0.279746,0
137900,0.47677,-0.318193,1.118618,0.969864,-0.127052,0.569563,-0.532484,0.706252,-0.064966,-0.463271,...,-0.305402,-0.774704,-0.123884,-0.495687,-0.018148,0.121679,0.24905,0.092516,-0.294977,0
21513,0.183556,-1.328271,1.018378,1.775426,-1.574193,-0.117696,-0.457733,0.681867,-0.031641,0.383872,...,-0.220815,-0.419013,-0.239197,0.009967,0.232829,0.814177,0.098797,-0.004273,-0.084119,0
134700,0.468326,1.276712,0.61712,-0.578014,0.879173,0.061706,-1.472002,0.373692,-0.287204,-0.084482,...,-0.160161,-0.430404,-0.076738,0.258708,0.55217,0.370701,-0.034255,0.041709,-0.296793,0


In [58]:
not_frauds = new_df.query('Class == 0')
frauds = new_df.query('Class == 1')
not_frauds['Class'].value_counts(), frauds['Class'].value_counts()


(Class
 0    284315
 Name: count, dtype: int64,
 Class
 1    492
 Name: count, dtype: int64)

In [59]:
balanced_df = pd.concat([frauds, not_frauds.sample(len(frauds), random_state=1)])
balanced_df['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
1,492
0,492


In [60]:
balanced_df = balanced_df.sample(frac=1, random_state=1)
balanced_df

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
18372,0.170309,-1.762593,0.256143,1.683125,-1.279233,-1.902762,1.004210,-1.009748,-2.432546,0.458860,...,2.493579,0.320829,-0.535481,0.499401,-0.915196,-0.423434,0.107049,0.175922,2.906449,0
96341,0.380388,1.227614,-0.668974,-0.271785,-0.589440,-0.604795,-0.350285,-0.486365,-0.010809,-0.794944,...,-0.026055,-0.295255,-0.180459,-0.436539,0.494649,-0.283738,-0.001128,0.035075,1.062111,1
248296,0.890522,-0.613696,3.698772,-5.534941,5.620486,1.649263,-2.335145,-0.907188,0.706362,-3.747646,...,0.319261,-0.471379,-0.075890,-0.667909,-0.642848,0.070600,0.488410,0.292345,-0.307413,1
264328,0.933932,-0.011624,0.640413,0.868046,-0.505279,0.261938,0.223098,0.239049,0.150877,0.225142,...,0.069401,0.268024,0.261459,0.683742,-1.567901,-0.816674,0.185781,0.283021,-0.272619,0
208904,0.794730,-0.679341,1.217389,-0.316778,-1.086725,0.855349,-0.980760,0.970589,0.133116,-0.357671,...,-0.083048,-0.137032,-0.238920,-0.617244,0.039020,-0.081848,0.234633,0.128382,-0.307273,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81557,0.341393,-4.502731,-3.876484,1.341248,0.113400,0.189428,-0.560985,-0.140478,0.684651,0.475363,...,-0.140218,0.049411,2.313731,0.252330,0.307219,0.859051,0.184033,-0.308269,4.227625,0
276071,0.965803,2.091900,-0.757459,-1.192258,-0.755458,-0.620324,-0.322077,-1.082511,0.117200,-0.140927,...,0.288253,0.831939,0.142007,0.592615,-0.196143,-0.136676,0.020182,-0.015470,-0.028645,1
175971,0.709373,1.972989,0.157281,-1.715078,1.207451,0.681612,-0.615282,0.601791,-0.291935,-0.132265,...,0.098640,0.467533,-0.078973,-0.371882,0.486038,-0.490665,-0.018374,-0.070911,0.075735,0
27738,0.200727,-2.439237,2.591458,-2.840126,1.286244,-1.777016,-1.436139,-2.206056,-2.282725,-0.292885,...,1.774460,-0.771390,0.065727,0.103916,-0.057578,0.242652,-0.268649,-0.743713,1.443443,1


In [64]:
balanced_df_np = balanced_df.to_numpy()
x_train_balanced, y_train_balanced = balanced_df_np[:700, :-1], balanced_df_np[:700, -1]
x_test_balanced, y_test_balanced = balanced_df_np[700:842, :-1], balanced_df_np[700:842, -1]
x_val_balanced, y_val_balanced = balanced_df_np[842:, :-1], balanced_df_np[842:, -1].astype(int)
x_train_balanced.shape, x_test_balanced.shape, y_test_balanced.shape, x_val_balanced.shape, y_val_balanced.shape
# check for inbalance


((700, 30), (142, 30), (142,), (142, 30), (142,))

In [66]:
pd.Series(y_train).value_counts(), pd.Series(y_test_balanced).value_counts(), pd.Series(y_val_balanced).value_counts()

(Class
 0    239589
 1       411
 Name: count, dtype: int64,
 0.0    73
 1.0    69
 Name: count, dtype: int64,
 0    72
 1    70
 Name: count, dtype: int64)

In [67]:
logistic_model_balanced = LogisticRegression()
logistic_model_balanced.fit(x_train_balanced, y_train_balanced)
logistic_model_balanced.score(x_train_balanced, y_train_balanced)
print(classification_report(y_val_balanced, logistic_model_balanced.predict(x_val_balanced), target_names=['Not Fraud', 'Fraud']))

              precision    recall  f1-score   support

   Not Fraud       0.96      0.93      0.94        72
       Fraud       0.93      0.96      0.94        70

    accuracy                           0.94       142
   macro avg       0.94      0.94      0.94       142
weighted avg       0.94      0.94      0.94       142



In [68]:
shallow_nn_balanced = Sequential()
# The InputLayer expects a shape tuple, not just the number of features.
# Assuming we want a variable batch size, use (None, x_train.shape[1])
#x_train.shape[1], is basically the number of columns that x_train has
shallow_nn_balanced.add(InputLayer(input_shape=(x_train_balanced.shape[1],)))  # Fixed: Providing input_shape as a tuple
shallow_nn_balanced.add(Dense(2, 'relu'))
shallow_nn_balanced.add(BatchNormalization())
shallow_nn_balanced.add(Dense(1, activation='sigmoid')) # Outputs a sigmoid which is a probability btw 1 and 0.
#If it's 1 than 100% confident that its fraud

shallow_nn_balanced = Sequential()
# The InputLayer expects a shape tuple, not just the number of features.
# Assuming we want a variable batch size, use (None, x_train.shape[1])
#x_train.shape[1], is basically the number of columns that x_train has
shallow_nn_balanced.add(InputLayer(input_shape=(x_train_balanced.shape[1],)))  # Fixed: Providing input_shape as a tuple
shallow_nn_balanced.add(Dense(2, 'relu'))
shallow_nn_balanced.add(BatchNormalization())
shallow_nn_balanced.add(Dense(1, activation='sigmoid')) # Outputs a sigmoid which is a probability btw 1 and 0.
#If it's 1 than 100% confident that its fraud

checkpoint = ModelCheckpoint('shallow_nn_balanced.keras', save_best_only=True)
shallow_nn_balanced.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
shallow_nn_balanced.fit(x_train_balanced,y_train_balanced, validation_data=(x_val_balanced, y_val_balanced), epochs=40, callbacks=[checkpoint])
shallow_nn_balanced.summary

Epoch 1/40




[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 20ms/step - accuracy: 0.3602 - loss: 0.9554 - val_accuracy: 0.3239 - val_loss: 0.8260
Epoch 2/40
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.3455 - loss: 0.8933 - val_accuracy: 0.3662 - val_loss: 0.7926
Epoch 3/40
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.4321 - loss: 0.8246 - val_accuracy: 0.4437 - val_loss: 0.7459
Epoch 4/40
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.5254 - loss: 0.7734 - val_accuracy: 0.5211 - val_loss: 0.6469
Epoch 5/40
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.6221 - loss: 0.6102 - val_accuracy: 0.6620 - val_loss: 0.5520
Epoch 6/40
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.6657 - loss: 0.5430 - val_accuracy: 0.7113 - val_loss: 0.4870
Epoch 7/40
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━

In [70]:
print(classification_report(y_val_balanced, neural_net_predictions(shallow_nn_balanced, x_val_balanced), target_names=['Not Fraud', 'Fraud']))

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step 
              precision    recall  f1-score   support

   Not Fraud       0.89      1.00      0.94        72
       Fraud       1.00      0.87      0.93        70

    accuracy                           0.94       142
   macro avg       0.94      0.94      0.94       142
weighted avg       0.94      0.94      0.94       142



In [81]:
shallow_nn_balanced_1relu = Sequential() #to help model overfitting  only use relu 1 instead of 2
#instead of regularization
shallow_nn_balanced_1relu.add(InputLayer(input_shape=(x_train_balanced.shape[1],)))  # Fixed: Providing input_shape as a tuple
shallow_nn_balanced_1relu.add(Dense(1,'relu'))
shallow_nn_balanced_1relu.add(BatchNormalization())
shallow_nn_balanced_1relu.add(Dense(1, activation='sigmoid')) # Outputs a sigmoid which is a probability btw 1 and 0.

checkpoint = ModelCheckpoint('shallow_nn_balanced_1relu.keras', save_best_only=True)
shallow_nn_balanced_1relu.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
shallow_nn_balanced_1relu.fit(x_train_balanced, y_train_balanced, validation_data=(x_val_balanced, y_val_balanced), epochs=40, callbacks=[checkpoint])


Epoch 1/40




[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step - accuracy: 0.1991 - loss: 1.1963 - val_accuracy: 0.1479 - val_loss: 2.0525
Epoch 2/40
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.1858 - loss: 1.1753 - val_accuracy: 0.1338 - val_loss: 1.5778
Epoch 3/40
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.1896 - loss: 1.1374 - val_accuracy: 0.1197 - val_loss: 1.3661
Epoch 4/40
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.2312 - loss: 1.1017 - val_accuracy: 0.1268 - val_loss: 1.2333
Epoch 5/40
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.2270 - loss: 1.0744 - val_accuracy: 0.1620 - val_loss: 1.1227
Epoch 6/40
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.2499 - loss: 1.0509 - val_accuracy: 0.2324 - val_loss: 1.0186
Epoch 7/40
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7d09a27b3650>

In [72]:
print(classification_report(y_val_balanced, neural_net_predictions(shallow_nn_balanced_1relu, x_val_balanced), target_names=['Not Fraud', 'Fraud']))

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
              precision    recall  f1-score   support

   Not Fraud       0.89      1.00      0.94        72
       Fraud       1.00      0.87      0.93        70

    accuracy                           0.94       142
   macro avg       0.94      0.94      0.94       142
weighted avg       0.94      0.94      0.94       142



In [82]:
rf_b = RandomForestClassifier(max_depth=2, n_jobs=-1)
rf_b.fit(x_train_balanced, y_train_balanced)
print(classification_report(y_val_balanced, rf_b.predict(x_val_balanced), target_names=['Not Fraud', 'Fraud']))


              precision    recall  f1-score   support

   Not Fraud       0.92      0.97      0.95        72
       Fraud       0.97      0.91      0.94        70

    accuracy                           0.94       142
   macro avg       0.95      0.94      0.94       142
weighted avg       0.95      0.94      0.94       142



In [74]:
gbc_b = GradientBoostingClassifier(n_estimators=50, max_depth=1, random_state=0)
gbc_b.fit(x_train_balanced, y_train_balanced)
print(classification_report(y_val_balanced, gbc_b.predict(x_val_balanced), target_names=['Not Fraud', 'Fraud']))

              precision    recall  f1-score   support

   Not Fraud       0.93      0.94      0.94        72
       Fraud       0.94      0.93      0.94        70

    accuracy                           0.94       142
   macro avg       0.94      0.94      0.94       142
weighted avg       0.94      0.94      0.94       142



In [75]:
svc_b = LinearSVC(class_weight='balanced')
svc_b.fit(x_train_balanced, y_train_balanced)
print(classification_report(y_val_balanced, svc_b.predict(x_val_balanced), target_names=['Not Fraud', 'Fraud']))

              precision    recall  f1-score   support

   Not Fraud       0.96      0.93      0.94        72
       Fraud       0.93      0.96      0.94        70

    accuracy                           0.94       142
   macro avg       0.94      0.94      0.94       142
weighted avg       0.94      0.94      0.94       142

