In [11]:
# Create engine and connection to database
import pandas as pd
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, precision_recall_curve, ConfusionMatrixDisplay
import xgboost as xgb
from xgboost import XGBClassifier


DATABASE_URL = "sqlite:///Database/Fraud_Transaction_Database.db"

# Build the SQLAlchemy engine for the SQLite file, then open the connection
# that the query cell hands to pandas.
engine = create_engine(DATABASE_URL)
conn = engine.connect()

In [13]:
# Pull every row of the modeling_df table into a DataFrame
modeling_query = "SELECT * FROM modeling_df"
modeling_data = pd.read_sql(sql=modeling_query, con=conn)

In [15]:
# Preview the first five rows to sanity-check the loaded columns
modeling_data.head(n=5)

Unnamed: 0,trxn_amt,qty,cust_age,fraudulent,acct_age_days,trxn_hour,year,month,day,pmt_method_PayPal,...,pmt_method_credit card,pmt_method_debit card,product_cat_clothing,product_cat_electronics,product_cat_health & beauty,product_cat_home & garden,product_cat_toys & games,cust_device_desktop,cust_device_mobile,cust_device_tablet
0,42.32,1,40,0,282,23,2024,3,24,True,...,False,False,False,True,False,False,False,True,False,False
1,301.34,3,35,0,223,0,2024,1,22,False,...,True,False,False,True,False,False,False,False,False,True
2,340.32,5,29,0,360,8,2024,1,22,False,...,False,True,False,False,False,False,True,True,False,False
3,95.77,5,45,0,325,20,2024,1,16,False,...,True,False,False,True,False,False,False,False,True,False
4,77.45,5,42,0,116,15,2024,1,16,False,...,True,False,True,False,False,False,False,True,False,False


In [17]:
# Separate the label column from the predictor columns
target_col = 'fraudulent'
y = modeling_data[target_col]
X = modeling_data.drop(columns=[target_col])

# Report both shapes so a row-count mismatch is visible immediately
print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

Features shape: (23634, 20)
Target shape: (23634,)


In [27]:
# Convert categorical data
# The one-hot indicator columns arrive with object dtype and must become 0/1
# integers before modeling. The lookup accepts the strings 'True'/'False' as
# well as real booleans, in case a column holds Python bools instead of text.
BOOL_TO_INT = {'True': 1, 'False': 0, True: 1, False: 0}

bool_columns = X.select_dtypes(include=['object']).columns
for col in bool_columns:
    # .map avoids the downcasting FutureWarning that .replace emits on object
    # columns in pandas >= 2.2, and turns anything unexpected into NaN so it
    # can be reported by name instead of failing inside astype(int).
    as_int = X[col].map(BOOL_TO_INT)
    if as_int.isna().any():
        unexpected = X.loc[as_int.isna(), col].unique().tolist()
        raise ValueError(f"Column {col!r} has non-boolean values: {unexpected}")
    X[col] = as_int.astype(int)

X.head()

Unnamed: 0,trxn_amt,qty,cust_age,acct_age_days,trxn_hour,year,month,day,pmt_method_PayPal,pmt_method_bank transfer,pmt_method_credit card,pmt_method_debit card,product_cat_clothing,product_cat_electronics,product_cat_health & beauty,product_cat_home & garden,product_cat_toys & games,cust_device_desktop,cust_device_mobile,cust_device_tablet
0,42.32,1,40,282,23,2024,3,24,1,0,0,0,0,1,0,0,0,1,0,0
1,301.34,3,35,223,0,2024,1,22,0,0,1,0,0,1,0,0,0,0,0,1
2,340.32,5,29,360,8,2024,1,22,0,0,0,1,0,0,0,0,1,1,0,0
3,95.77,5,45,325,20,2024,1,16,0,0,1,0,0,1,0,0,0,0,1,0
4,77.45,5,42,116,15,2024,1,16,0,0,1,0,1,0,0,0,0,1,0,0


In [29]:
# Hold out 20% of the rows for testing; stratifying on y keeps the fraud /
# non-fraud proportions the same in both splits
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [31]:
# Standardize the features
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply those
# same statistics to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [105]:
# Configure the XGBoost classifier, then fit it on the scaled training split
xgb_params = dict(
    n_estimators=150,
    max_depth=5,
    scale_pos_weight=3,  # extra weight on the positive (fraud) class
    learning_rate=0.05,
    random_state=42,
)
model = XGBClassifier(**xgb_params)

model.fit(X_train_scaled, y_train)

In [107]:
# Predict class labels for the scaled test split with the fitted XGBoost model
y_pred = model.predict(X_test_scaled)

In [109]:
# Evaluate the model: raw confusion counts first, then per-class metrics
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(cm)
print(report)

[[4392   91]
 [ 161   83]]
              precision    recall  f1-score   support

           0       0.96      0.98      0.97      4483
           1       0.48      0.34      0.40       244

    accuracy                           0.95      4727
   macro avg       0.72      0.66      0.68      4727
weighted avg       0.94      0.95      0.94      4727



## XGBoost model using SMOTE

In [112]:
from imblearn.over_sampling import SMOTE

# Oversample the training split with SMOTE; only X_train / y_train are
# resampled, so the test split is left as it was
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_train, y_train)
X_train_resampled, y_train_resampled = resampled

In [199]:
# Fit a second XGBoost classifier, this time on the SMOTE-resampled training data
# NOTE(review): scale_pos_weight=3 is carried over from the non-SMOTE model.
# The training data here has already been oversampled, so the fraud class is
# effectively up-weighted twice — confirm this is intended.
smote_xgb_params = dict(
    n_estimators=150,
    max_depth=5,
    scale_pos_weight=3,
    learning_rate=0.05,
    random_state=42,
)
model = XGBClassifier(**smote_xgb_params)

model.fit(X_train_resampled, y_train_resampled)

In [201]:
# Predict labels for the (unscaled) test split with the SMOTE-trained model
smote_y_pred = model.predict(X_test)

In [203]:
# Per-class precision / recall / F1 for the SMOTE-trained XGBoost model
smote_report = classification_report(y_test, smote_y_pred)
print(smote_report)

              precision    recall  f1-score   support

           0       0.97      0.96      0.96      4483
           1       0.35      0.38      0.37       244

    accuracy                           0.93      4727
   macro avg       0.66      0.67      0.67      4727
weighted avg       0.93      0.93      0.93      4727



## SMOTE and Random Forest

In [206]:
from sklearn.ensemble import RandomForestClassifier

smote = SMOTE(random_state=42)
X_resampled: X
y_resampled: y
X_resampled, y_resampled = smote.fit_resample(X, y)

In [208]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [210]:
# Initialize and train the random forest
rf_params = {"n_estimators": 100, "random_state": 42}
model = RandomForestClassifier(**rf_params)
model.fit(X_train, y_train)

In [212]:
# Predict labels for the held-out split with the fitted random forest
y_pred = model.predict(X_test)

In [214]:
# Per-class precision / recall / F1 for the random forest
rf_report = classification_report(y_test, y_pred)
print(rf_report)

              precision    recall  f1-score   support

           0       0.96      0.99      0.98      4555
           1       0.99      0.96      0.98      4410

    accuracy                           0.98      8965
   macro avg       0.98      0.98      0.98      8965
weighted avg       0.98      0.98      0.98      8965

