In [65]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix,recall_score, precision_score,f1_score
from imblearn.over_sampling import SMOTE

# Load the dataset


In [None]:
df = pd.read_csv("./data/credit_card_transactions.csv")
df.head()

##  downsample dataset to 100,000 rows while preserving the original ratio of fraudulent vs non-fraudulent transactions.


In [None]:
# Define desired number of rows
n_rows = 100000

# Use stratified sampling
df = df.groupby("is_fraud", group_keys=False).apply(
    lambda x: x.sample(frac=n_rows / len(df), random_state=42)
)

# Check the new shape
print(df.shape)
# Check the ratio of fraud vs non-fraud
print(df["is_fraud"].value_counts(normalize=True))

(100000, 18)
is_fraud
0    0.99421
1    0.00579
Name: proportion, dtype: float64


  df = df.groupby("is_fraud", group_keys=False).apply(


In [None]:
df.value_counts()

Unnamed: 0  trans_date_trans_time  cc_num               merchant                             category       amt     first        last       gender  street                         city                      state  zip    lat      long       city_pop  job                               dob         trans_num                         unix_time   merch_lat  merch_long   is_fraud  merch_zipcode
0           2019-01-01 00:00:18    2703186189652095     fraud_Rippin, Kub and Mann           misc_net       4.97    Jennifer     Banks      F       561 Perry Cove                 Moravian Falls            NC     28654  36.0788  -81.1781   3495      Psychologist, counselling         1988-03-09  0b242abb623afc578575680df30655b9  1325376018  36.011293  -82.048315   0         28705.0          1
2           2019-01-01 00:00:51    38859492057661       fraud_Lind-Buckridge                 entertainment  220.11  Edward       Sanchez    M       594 White Dale Suite 530       Malad City                ID     83252

trans_date_trans_time – The date and time of the transaction, e.g., 2023-01-01 12:45:00.

cc_num – The credit card number used in the transaction. Sensitive info, usually dropped for modeling.

merchant – Name of the merchant/store where the transaction occurred.

category – Type of merchant or transaction category, e.g., "grocery", "electronics".

amt – Transaction amount in dollars (or local currency).

first – Cardholder’s first name. Usually dropped as it's non-predictive.

last – Cardholder’s last name. Same as above.

gender – Cardholder’s gender, e.g., male/female.

street – Cardholder’s street address. Often dropped for privacy.

city – Cardholder’s city.

state – Cardholder’s state.

zip – Cardholder’s ZIP/postal code.

lat – Latitude of cardholder’s location.

long – Longitude of cardholder’s location.

city_pop – Population of the cardholder’s city. Can be useful for demographic-based fraud analysis.

job – Cardholder’s occupation.

dob – Cardholder’s date of birth. Can be used to calculate age.

trans_num – Transaction number or ID, unique per transaction.

unix_time – Transaction time in Unix timestamp format (seconds since Jan 1, 1970).

merch_lat – Latitude of the merchant’s location.

merch_long – Longitude of the merchant’s location.

is_fraud – Target variable: 1 if fraudulent transaction, 0 otherwise.

merch_zipcode – ZIP code of the merchant.

# remove columns 'Unnamed: 0'

In [None]:
df = df.drop(columns=["Unnamed: 0"])
df.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud,merch_zipcode
0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,...,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0,28705.0
1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,...,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0,
2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,...,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0,83236.0
3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,...,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0,
4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,...,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0,22844.0


In [None]:
df.loc[0,:]

trans_date_trans_time                 2019-01-01 00:00:18
cc_num                                   2703186189652095
merchant                       fraud_Rippin, Kub and Mann
category                                         misc_net
amt                                                  4.97
first                                            Jennifer
last                                                Banks
gender                                                  F
street                                     561 Perry Cove
city                                       Moravian Falls
state                                                  NC
zip                                                 28654
lat                                               36.0788
long                                             -81.1781
city_pop                                             3495
job                             Psychologist, counselling
dob                                            1988-03-09
trans_num     

In [None]:
# dissplay summary statistics
print(df.describe())

             cc_num           amt           zip           lat          long  \
count  1.296675e+06  1.296675e+06  1.296675e+06  1.296675e+06  1.296675e+06   
mean   4.171920e+17  7.035104e+01  4.880067e+04  3.853762e+01 -9.022634e+01   
std    1.308806e+18  1.603160e+02  2.689322e+04  5.075808e+00  1.375908e+01   
min    6.041621e+10  1.000000e+00  1.257000e+03  2.002710e+01 -1.656723e+02   
25%    1.800429e+14  9.650000e+00  2.623700e+04  3.462050e+01 -9.679800e+01   
50%    3.521417e+15  4.752000e+01  4.817400e+04  3.935430e+01 -8.747690e+01   
75%    4.642255e+15  8.314000e+01  7.204200e+04  4.194040e+01 -8.015800e+01   
max    4.992346e+18  2.894890e+04  9.978300e+04  6.669330e+01 -6.795030e+01   

           city_pop     unix_time     merch_lat    merch_long      is_fraud  \
count  1.296675e+06  1.296675e+06  1.296675e+06  1.296675e+06  1.296675e+06   
mean   8.882444e+04  1.349244e+09  3.853734e+01 -9.022646e+01  5.788652e-03   
std    3.019564e+05  1.284128e+07  5.109788e+00  1.

In [None]:
# checking missing values
print(df.isnull().sum())

trans_date_trans_time         0
cc_num                        0
merchant                      0
category                      0
amt                           0
first                         0
last                          0
gender                        0
street                        0
city                          0
state                         0
zip                           0
lat                           0
long                          0
city_pop                      0
job                           0
dob                           0
trans_num                     0
unix_time                     0
merch_lat                     0
merch_long                    0
is_fraud                      0
merch_zipcode            195973
dtype: int64


In [None]:
df.columns

Index(['trans_date_trans_time', 'cc_num', 'merchant', 'category', 'amt',
       'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'lat',
       'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time', 'merch_lat',
       'merch_long', 'is_fraud', 'merch_zipcode'],
      dtype='object')

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 23 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   trans_date_trans_time  1296675 non-null  object 
 1   cc_num                 1296675 non-null  int64  
 2   merchant               1296675 non-null  object 
 3   category               1296675 non-null  object 
 4   amt                    1296675 non-null  float64
 5   first                  1296675 non-null  object 
 6   last                   1296675 non-null  object 
 7   gender                 1296675 non-null  object 
 8   street                 1296675 non-null  object 
 9   city                   1296675 non-null  object 
 10  state                  1296675 non-null  object 
 11  zip                    1296675 non-null  int64  
 12  lat                    1296675 non-null  float64
 13  long                   1296675 non-null  float64
 14  city_pop          

In [None]:
# Drop non-predictive columns
df = df.drop(
    ["cc_num", "trans_num", "first", "last", "street", "dob", "unix_time", "job"], axis=1)

In [None]:
df.head()

Unnamed: 0,trans_date_trans_time,merchant,category,amt,gender,city,state,zip,lat,long,city_pop,merch_lat,merch_long,is_fraud,merch_zipcode
0,2019-01-01 00:00:18,"fraud_Rippin, Kub and Mann",misc_net,4.97,F,Moravian Falls,NC,28654,36.0788,-81.1781,3495,36.011293,-82.048315,0,28705.0
1,2019-01-01 00:00:44,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,F,Orient,WA,99160,48.8878,-118.2105,149,49.159047,-118.186462,0,
2,2019-01-01 00:00:51,fraud_Lind-Buckridge,entertainment,220.11,M,Malad City,ID,83252,42.1808,-112.262,4154,43.150704,-112.154481,0,83236.0
3,2019-01-01 00:01:16,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,M,Boulder,MT,59632,46.2306,-112.1138,1939,47.034331,-112.561071,0,
4,2019-01-01 00:03:06,fraud_Keeling-Crist,misc_pos,41.96,M,Doe Hill,VA,24433,38.4207,-79.4629,99,38.674999,-78.632459,0,22844.0


In [None]:
# Handle missing values
df["merch_zipcode"] = df["merch_zipcode"].fillna(-1)  # Using -1 as a placeholder for missing values

# Feature Engineering

In [None]:
df["trans_date_trans_time"] = pd.to_datetime(df["trans_date_trans_time"])
df["trans_hour"] = df["trans_date_trans_time"].dt.hour
df["trans_dayofweek"] = df["trans_date_trans_time"].dt.dayofweek  # starting as monday=0
df["trans_month"] = df["trans_date_trans_time"].dt.month
df["trans_is_weekend"] = df["trans_dayofweek"].isin([5, 6]).astype(int)
df = df.drop("trans_date_trans_time", axis=1)

In [None]:
# List of categorical columns to encode
categorical_cols = [
    "merchant",
    "category",
    "gender",
    "city",
    "state",
    "zip",
    "merch_zipcode",
]

# Use LabelEncoder to transform categorical columns
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

In [None]:
df

Unnamed: 0,merchant,category,amt,gender,city,state,zip,lat,long,city_pop,merch_lat,merch_long,is_fraud,merch_zipcode,trans_hour,trans_dayofweek,trans_month,trans_is_weekend
0,514,8,4.97,0,526,27,265,36.0788,-81.1781,3495,36.011293,-82.048315,0,7823,0,1,1,0
1,241,4,107.23,0,612,47,965,48.8878,-118.2105,149,49.159047,-118.186462,0,0,0,1,1,0
2,390,0,220.11,1,468,13,858,42.1808,-112.2620,4154,43.150704,-112.154481,0,25054,0,1,1,0
3,360,2,45.00,1,84,26,614,46.2306,-112.1138,1939,47.034331,-112.561071,0,0,0,1,1,0
4,297,9,41.96,1,216,45,231,38.4207,-79.4629,99,38.674999,-78.632459,0,5965,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1296670,499,0,15.56,1,330,44,868,37.7175,-112.4777,258,36.841266,-111.690765,0,0,12,6,6,1
1296671,2,1,51.70,1,813,20,203,39.2667,-77.5101,100,38.906881,-78.246528,0,5892,12,6,6,1
1296672,599,1,105.93,1,346,32,886,32.9396,-105.8189,899,33.619513,-105.130529,0,25880,12,6,6,1
1296673,509,1,74.90,1,471,41,601,43.3526,-102.5411,1126,42.788940,-103.241160,0,21356,12,6,6,1


In [None]:
# Separate features and target
X = df.drop("is_fraud", axis=1)
y = df["is_fraud"]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:

X

Unnamed: 0,merchant,category,amt,gender,city,state,zip,lat,long,city_pop,merch_lat,merch_long,merch_zipcode,trans_hour,trans_dayofweek,trans_month,trans_is_weekend
0,514,8,4.97,0,526,27,265,36.0788,-81.1781,3495,36.011293,-82.048315,7823,0,1,1,0
1,241,4,107.23,0,612,47,965,48.8878,-118.2105,149,49.159047,-118.186462,0,0,1,1,0
2,390,0,220.11,1,468,13,858,42.1808,-112.2620,4154,43.150704,-112.154481,25054,0,1,1,0
3,360,2,45.00,1,84,26,614,46.2306,-112.1138,1939,47.034331,-112.561071,0,0,1,1,0
4,297,9,41.96,1,216,45,231,38.4207,-79.4629,99,38.674999,-78.632459,5965,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1296670,499,0,15.56,1,330,44,868,37.7175,-112.4777,258,36.841266,-111.690765,0,12,6,6,1
1296671,2,1,51.70,1,813,20,203,39.2667,-77.5101,100,38.906881,-78.246528,5892,12,6,6,1
1296672,599,1,105.93,1,346,32,886,32.9396,-105.8189,899,33.619513,-105.130529,25880,12,6,6,1
1296673,509,1,74.90,1,471,41,601,43.3526,-102.5411,1126,42.788940,-103.241160,21356,12,6,6,1


In [None]:
y

0          0
1          0
2          0
3          0
4          0
          ..
1296670    0
1296671    0
1296672    0
1296673    0
1296674    0
Name: is_fraud, Length: 1296675, dtype: int64

# Model Training and MLflow Tracking

In [None]:
# Set up MLflow
mlflow.set_experiment("Credit Card Fraud Detection with RandomForest, XGBoost, LGBM")

def train_and_log_model(model, model_name, X_train, y_train, X_test, y_test, is_lgbm=False, cat_features=[]):
    """
    Trains a model using SMOTE, evaluates it, and logs the results to MLflow.
    Special handling for LightGBM's categorical features.
    """
    with mlflow.start_run(run_name=model_name):
        # Log model parameters
        mlflow.log_params(model.get_params())

        # Apply SMOTE to the training data
        smote = SMOTE(random_state=42)
        print(f"Applying SMOTE for {model_name}...")
        X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

        # Train the model
        print(f"Training {model_name}...")
        if is_lgbm:
            model.fit(X_train_resampled, y_train_resampled, categorical_feature=cat_features)
        else:
            model.fit(X_train_resampled, y_train_resampled)

        # Make predictions
        y_pred = model.predict(X_test)
        y_proba = model.predict_proba(X_test)[:, 1]

        # Calculate metrics
        recall = recall_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        roc_auc = roc_auc_score(y_test, y_proba)
        
        # Log metrics to MLflow
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("f1_score", f1)
        mlflow.log_metric("roc_auc", roc_auc)
        
        # Log the trained model as an artifact
        mlflow.sklearn.log_model(model, "model")

        print(f"--- {model_name} Results ---")
        print(f"Recall: {recall:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"ROC-AUC: {roc_auc:.4f}")
        print("-" * 20)

# Initialize and run models
print("Starting Random Forest training...")
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
train_and_log_model(rf_model, "Random Forest", X_train, y_train, X_test, y_test)

print("Starting XGBoost training...")
xgb_model = XGBClassifier(n_estimators=100, use_label_encoder=False, eval_metric='logloss', random_state=42, n_jobs=-1)
train_and_log_model(xgb_model, "XGBoost", X_train, y_train, X_test, y_test)

print("Starting LightGBM training...")
lgbm_model = LGBMClassifier(n_estimators=100, random_state=42, n_jobs=-1)
train_and_log_model(lgbm_model, "LightGBM", X_train, y_train, X_test, y_test, is_lgbm=True, cat_features=categorical_cols)

Starting Random Forest training...
Applying SMOTE for Random Forest...
Training Random Forest...


KeyboardInterrupt: 