In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("../dataset/synthetic_credit_card_fraud.csv")


In [3]:
df

Unnamed: 0,transaction_id,amount,transaction_type,merchant_category,customer_age,card_type,country,hour,device,is_fraud
0,TXN100000,abc,Online,Electronics,26.0,Debit,canada,1,web,1
1,TXN100001,5265.36,ATM,food,94.0,credit,UK,-2,web,1
2,TXN100002,326.89,online,food,90.0,credit,UAE,17,,0
3,TXN100003,abc,pos,Electronics,2.0,CREDIT,usa,-5,,1
4,TXN100004,abc,ONLINE,Electronics,60.0,Debit,UK,-4,,0
...,...,...,...,...,...,...,...,...,...,...
4995,TXN104995,abc,Online,grocry,6.0,debit,usa,-5,POS,1
4996,TXN104996,655.09,POS,Travel,4.0,credit,USA,-1,POS,1
4997,TXN104997,979.19,ONLINE,electronics,95.0,Credit,UK,8,POS,0
4998,TXN104998,abc,pos,Travel,,debit,UK,24,,0


In [4]:
# Exploratory Data Analysis(EDA)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   transaction_id     5000 non-null   object 
 1   amount             3717 non-null   object 
 2   transaction_type   5000 non-null   object 
 3   merchant_category  5000 non-null   object 
 4   customer_age       3733 non-null   float64
 5   card_type          5000 non-null   object 
 6   country            5000 non-null   object 
 7   hour               5000 non-null   int64  
 8   device             2937 non-null   object 
 9   is_fraud           5000 non-null   int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 390.8+ KB


In [6]:
df.describe()

Unnamed: 0,customer_age,hour,is_fraud
count,3733.0,5000.0,5000.0
mean,47.643718,11.9218,0.4794
std,36.374887,12.933845,0.499625
min,1.0,-5.0,0.0
25%,8.0,-2.0,0.0
50%,48.0,12.0,0.0
75%,87.0,25.0,1.0
max,95.0,30.0,1.0


In [7]:
# Data Cleaning 

In [8]:
df.drop(columns=["transaction_id"],inplace=True)

In [9]:
df

Unnamed: 0,amount,transaction_type,merchant_category,customer_age,card_type,country,hour,device,is_fraud
0,abc,Online,Electronics,26.0,Debit,canada,1,web,1
1,5265.36,ATM,food,94.0,credit,UK,-2,web,1
2,326.89,online,food,90.0,credit,UAE,17,,0
3,abc,pos,Electronics,2.0,CREDIT,usa,-5,,1
4,abc,ONLINE,Electronics,60.0,Debit,UK,-4,,0
...,...,...,...,...,...,...,...,...,...
4995,abc,Online,grocry,6.0,debit,usa,-5,POS,1
4996,655.09,POS,Travel,4.0,credit,USA,-1,POS,1
4997,979.19,ONLINE,electronics,95.0,Credit,UK,8,POS,0
4998,abc,pos,Travel,,debit,UK,24,,0


In [10]:
# Fixing the amount column like converting to numeric 

In [11]:
df["amount"] = pd.to_numeric(df["amount"],errors =  "coerce")

In [12]:
# Filling up the amount missing values to median 

In [13]:
median_amount = df["amount"].median()
df["amount"] = df["amount"].fillna(median_amount)

In [14]:
# Fix the Case-unconsistency 

In [15]:
cat_cols = [
    "transaction_type",
    "merchant_category",
    "card_type",
    "country",
    "device"
]

for col in cat_cols:
    df[col] = df[col].str.lower()


In [16]:
# Handle missing device

In [17]:
df["device"] = df["device"].fillna("unknown")

In [18]:
# Fix hour

In [19]:
hour_median = df["hour"].median()
df.loc[(df["hour"] < 0) | (df["hour"] > 23), "hour"] = hour_median


In [20]:
# Fix customer_age

In [25]:
age_median = df["customer_age"].median()
df.loc[(df["customer_age"] < 18) | (df["customer_age"] > 90), "customer_age"] = age_median
df["customer_age"].fillna(age_median,inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["customer_age"].fillna(age_median,inplace=True)


In [26]:
# checking my data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   amount             5000 non-null   float64
 1   transaction_type   5000 non-null   object 
 2   merchant_category  5000 non-null   object 
 3   customer_age       5000 non-null   float64
 4   card_type          5000 non-null   object 
 5   country            5000 non-null   object 
 6   hour               5000 non-null   int64  
 7   device             5000 non-null   object 
 8   is_fraud           5000 non-null   int64  
dtypes: float64(2), int64(2), object(5)
memory usage: 351.7+ KB


In [27]:
df.describe()

Unnamed: 0,amount,customer_age,hour,is_fraud
count,5000.0,5000.0,5000.0,5000.0
mean,5260.00191,53.4684,11.864,0.4794
std,3593.13125,16.148807,4.031272,0.499625
min,5.85,18.0,0.0,0.0
25%,5078.71,48.0,12.0,0.0
50%,5078.71,48.0,12.0,0.0
75%,5078.71,50.0,12.0,1.0
max,14993.24,90.0,23.0,1.0


In [28]:
df.isna().sum()

amount               0
transaction_type     0
merchant_category    0
customer_age         0
card_type            0
country              0
hour                 0
device               0
is_fraud             0
dtype: int64

In [30]:
df

Unnamed: 0,amount,transaction_type,merchant_category,customer_age,card_type,country,hour,device,is_fraud
0,5078.71,online,electronics,26.0,debit,canada,1,web,1
1,5265.36,atm,food,48.0,credit,uk,12,web,1
2,326.89,online,food,90.0,credit,uae,17,unknown,0
3,5078.71,pos,electronics,48.0,credit,usa,12,unknown,1
4,5078.71,online,electronics,60.0,debit,uk,12,unknown,0
...,...,...,...,...,...,...,...,...,...
4995,5078.71,online,grocry,48.0,debit,usa,12,pos,1
4996,655.09,pos,travel,48.0,credit,usa,12,pos,1
4997,979.19,online,electronics,48.0,credit,uk,8,pos,0
4998,5078.71,pos,travel,48.0,debit,uk,12,unknown,0


In [31]:
# FEATURE ENCODING

In [32]:
categorical_cols = [
    "transaction_type",
    "merchant_category",
    "card_type",
    "country",
    "device"
]


In [33]:
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)


In [34]:
# CHECKING THE ENCODING

In [35]:
df_encoded.head()

Unnamed: 0,amount,customer_age,hour,is_fraud,transaction_type_online,transaction_type_pos,merchant_category_entertainment,merchant_category_food,merchant_category_groceries,merchant_category_grocry,merchant_category_travel,card_type_debit,country_india,country_uae,country_uk,country_usa,device_pos,device_unknown,device_web
0,5078.71,26.0,1,1,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True
1,5265.36,48.0,12,1,False,False,False,True,False,False,False,False,False,False,True,False,False,False,True
2,326.89,90.0,17,0,True,False,False,True,False,False,False,False,False,True,False,False,False,True,False
3,5078.71,48.0,12,1,False,True,False,False,False,False,False,False,False,False,False,True,False,True,False
4,5078.71,60.0,12,0,True,False,False,False,False,False,False,True,False,False,True,False,False,True,False


In [36]:
df.tail()

Unnamed: 0,amount,transaction_type,merchant_category,customer_age,card_type,country,hour,device,is_fraud
4995,5078.71,online,grocry,48.0,debit,usa,12,pos,1
4996,655.09,pos,travel,48.0,credit,usa,12,pos,1
4997,979.19,online,electronics,48.0,credit,uk,8,pos,0
4998,5078.71,pos,travel,48.0,debit,uk,12,unknown,0
4999,359.38,pos,travel,48.0,credit,uk,2,unknown,1


In [39]:
# Split Features & Target

In [40]:
X = df_encoded.drop(columns=["is_fraud"])
y = df_encoded["is_fraud"]

In [49]:
# Train / Test Split

In [50]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# scale
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# model
model = LogisticRegression(
    max_iter=1000,
    solver="lbfgs"
)

model.fit(X_train, y_train)


In [51]:
df.tail()

Unnamed: 0,amount,transaction_type,merchant_category,customer_age,card_type,country,hour,device,is_fraud
4995,5078.71,online,grocry,48.0,debit,usa,12,pos,1
4996,655.09,pos,travel,48.0,credit,usa,12,pos,1
4997,979.19,online,electronics,48.0,credit,uk,8,pos,0
4998,5078.71,pos,travel,48.0,debit,uk,12,unknown,0
4999,359.38,pos,travel,48.0,credit,uk,2,unknown,1


In [52]:
df_encoded.tail()

Unnamed: 0,amount,customer_age,hour,is_fraud,transaction_type_online,transaction_type_pos,merchant_category_entertainment,merchant_category_food,merchant_category_groceries,merchant_category_grocry,merchant_category_travel,card_type_debit,country_india,country_uae,country_uk,country_usa,device_pos,device_unknown,device_web
4995,5078.71,48.0,12,1,True,False,False,False,False,True,False,True,False,False,False,True,True,False,False
4996,655.09,48.0,12,1,False,True,False,False,False,False,True,False,False,False,False,True,True,False,False
4997,979.19,48.0,8,0,True,False,False,False,False,False,False,False,False,False,True,False,True,False,False
4998,5078.71,48.0,12,0,False,True,False,False,False,False,True,True,False,False,True,False,False,True,False
4999,359.38,48.0,2,1,False,True,False,False,False,False,True,False,False,False,True,False,False,True,False


In [53]:
# MODEL EVALUATION

In [54]:
from sklearn.metrics import confusion_matrix, classification_report

y_pred = model.predict(X_test)

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Confusion Matrix:
[[376 122]
 [143 359]]

Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.76      0.74       498
           1       0.75      0.72      0.73       502

    accuracy                           0.73      1000
   macro avg       0.74      0.74      0.73      1000
weighted avg       0.74      0.73      0.73      1000



In [55]:
# Train Random Forest:

In [56]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    class_weight="balanced"
)

rf.fit(X_train, y_train)


In [67]:
from sklearn.metrics import confusion_matrix, classification_report

# 1. Get probabilities for fraud (class 1)
y_probs = rf.predict_proba(X_test)[:, 1]

# # 2. Choose a threshold
# threshold = 0.35   # you can change this later

# # 3. Convert probabilities to class labels (0 or 1)
# y_pred_rf = (y_probs >= threshold).astype(int)

# # 4. Evaluate the model
# print("Confusion Matrix:")
# print(confusion_matrix(y_test, y_pred_rf))

# print("\nClassification Report:")
# print(classification_report(y_test, y_pred_rf))



Threshold: 0.5
[[418  80]
 [138 364]]
              precision    recall  f1-score   support

           0       0.75      0.84      0.79       498
           1       0.82      0.73      0.77       502

    accuracy                           0.78      1000
   macro avg       0.79      0.78      0.78      1000
weighted avg       0.79      0.78      0.78      1000


Threshold: 0.4
[[373 125]
 [103 399]]
              precision    recall  f1-score   support

           0       0.78      0.75      0.77       498
           1       0.76      0.79      0.78       502

    accuracy                           0.77      1000
   macro avg       0.77      0.77      0.77      1000
weighted avg       0.77      0.77      0.77      1000


Threshold: 0.35
[[354 144]
 [ 85 417]]
              precision    recall  f1-score   support

           0       0.81      0.71      0.76       498
           1       0.74      0.83      0.78       502

    accuracy                           0.77      1000
   macro a

In [72]:
from xgboost import XGBClassifier

xgb = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    scale_pos_weight=1.5,
    random_state=42,
    eval_metric="logloss"   # IMPORTANT: avoids warning
)

xgb.fit(X_train, y_train)


In [73]:
y_probs_xgb = xgb.predict_proba(X_test)[:, 1]


In [71]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-3.1.2-py3-none-macosx_12_0_arm64.whl.metadata (2.1 kB)
Downloading xgboost-3.1.2-py3-none-macosx_12_0_arm64.whl (2.2 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m[31m2.5 MB/s[0m eta [36m0:00:01[0m
Installing collected packages: xgboost
Successfully installed xgboost-3.1.2
Note: you may need to restart the kernel to use updated packages.


In [76]:
# Final production 

THRESHOLD = 0.35

def predict_fraud_xgb(model, X):
    probs = model.predict_proba(X)[:, 1]
    return (probs >= THRESHOLD).astype(int)



In [77]:
# Save the model 

import joblib

# Save trained XGBoost model
joblib.dump(xgb, "fraud_model.pkl")

# Save the scaler used during training
joblib.dump(scaler, "scaler.pkl")

print("Model and scaler saved successfully!")


Model and scaler saved successfully!


In [78]:
import os
os.listdir(".")

['scaler.pkl', 'fraud_model.pkl', '.ipynb_checkpoints', 'Credit_Card.ipynb']