In [4]:
import pandas as pd
import numpy as np



df = pd.read_csv('model_smote.csv')

In [103]:
df.sample(100)

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
6312870,0.789524,1,0.935006,6.745723e+05,0.00,7.154180e+05,1.389990e+06,1
6937705,-1.099984,1,0.017448,1.908129e+05,0.00,0.000000e+00,1.908129e+05,1
4565183,-0.673457,1,-0.292594,2.735106e+04,0.00,2.060976e+05,2.334487e+05,1
5804812,0.745262,1,-0.102915,1.273546e+05,0.00,0.000000e+00,1.273546e+05,1
7496,-1.832698,2,-0.344143,1.015206e+05,101347.09,0.000000e+00,0.000000e+00,0
...,...,...,...,...,...,...,...,...
3262336,0.754298,2,-0.325447,5.090000e+03,0.00,0.000000e+00,0.000000e+00,0
8284647,-0.537459,4,2.036125,1.255109e+06,0.00,0.000000e+00,0.000000e+00,1
4304845,1.530397,1,0.059632,9.782030e+05,765149.39,0.000000e+00,2.130536e+05,0
2147779,0.055809,3,-0.130085,1.982110e+05,311240.83,1.614891e+05,4.845925e+04,0


# RANDOM FOREST CLASSIFIER

In [113]:
import warnings
warnings.filterwarnings('ignore')
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression 
from xgboost import XGBClassifier

In [9]:

X = df.iloc[:, :7]# Features
y = df.iloc[:, -1]# Target Variable

assert X.shape[0] == y.shape[0]

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Create the random forest classifier
clf = RandomForestClassifier()

# Train the classifier on the training data
clf.fit(X_train, y_train)

# Make predictions with the classifier on the test data
predictions = clf.predict(X_test)

# Ensure that y_true and y_pred have the same number of rows
assert y_test.shape[0] == predictions.shape[0], "y_true and y_pred must have the same number of rows"

# Generate the confusion matrix
confusion_matrix = confusion_matrix(y_test, predictions)

# Print the confusion matrix values
print("True positives:", confusion_matrix[0][0])
print("True negatives:", confusion_matrix[1][1])
print("False positives:", confusion_matrix[0][1])
print("False negatives:", confusion_matrix[1][0])

True positives: 884321
True negatives: 783624
False positives: 2939
False negatives: 205


In [11]:
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
f1 = f1_score(y_test, predictions)

# Print the results
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 score:", f1)

Accuracy: 0.9981185921276485
Precision: 0.9962634906549126
Recall: 0.9997384633638204
F1 score: 0.9979979521036786


In [12]:
from joblib import dump, load
dump(clf, 'with_smote.pkl')

['with_smote.pkl']

In [29]:
clf = load('with_smote.pkl')
# Make predictions with the loaded model
predictions = clf.predict(X_test)

In [30]:
predictions

array([0, 1, 0, ..., 1, 1, 1], dtype=int64)

In [111]:
features = np.array([[1, 1, -1000,181.0, 0.0, 0.0,00.60]])

# Make predictions with the loaded model
predictions = clf.predict(features)

# Print the predicted target variable
print(predictions)

[1]




# LOGISTIC REGRESSION

In [63]:
def evaluate_model(y_test, y_pred):
    print("Accuracy Score: ", accuracy_score(y_test, y_pred))
    print("Precision Score: ", precision_score(y_test, y_pred))
    print("Recall Score: ", recall_score(y_test, y_pred))
    print("F1 Score: ", f1_score(y_test, y_pred))
    print("Confusion Matrix: ", confusion_matrix(y_test, y_pred))

In [66]:
X = df.iloc[:, :7]# Features
y = df.iloc[:, -1]# Target


# setting up testing and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=27)

lr = LogisticRegression()
model1 = lr.fit(X_train, y_train)
 
# Predict on training set
lr_pred = model1.predict(X_test)
evaluate_model(y_test, lr_pred)

Accuracy Score:  0.880521011211373
Precision Score:  0.8239718330828347
Recall Score:  0.9480567947181932
F1 Score:  0.8816698362034592
Confusion Matrix:  [[909503 198633]
 [ 50942 929783]]


# XGBoost Classifier

In [115]:
X = df.iloc[:, :7]# Features
y = df.iloc[:, -1]# Target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train a classifier on the training data
clf = XGBClassifier()
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Calculate the evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the evaluation metrics
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1 score:', f1)
print("Confusion Matrix: ", confusion_matrix(y_test, y_pred))


Accuracy: 0.9773808132263478
Precision: 0.975758816230544
Recall: 0.9760804573385973
F1 score: 0.9759196102831563
Confusion Matrix:  [[1301019   28543]
 [  28155 1148916]]
