In [2]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score


In [3]:
# Load the train / test splits from the CSV files in the working directory
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ("fraudTrain.csv", "fraudTest.csv")
)


In [4]:
#Explore the Data
print("Training shape:", train_df.shape)
print("Testing shape:", test_df.shape)

print(train_df.columns)
# Only the final expression of a cell is rendered automatically; a bare
# `train_df.head()` in the middle of the cell is evaluated and thrown away,
# so the preview is printed explicitly.
print(train_df.head())
# Class balance of the target; left as the last expression so it is displayed.
train_df['is_fraud'].value_counts()


Training shape: (1296675, 23)
Testing shape: (555719, 23)
Index(['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
       'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
       'merch_lat', 'merch_long', 'is_fraud'],
      dtype='object')


is_fraud
0    1289169
1       7506
Name: count, dtype: int64

In [5]:
#Drop unwanted columns
cols_to_drop = ['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'first', 'last',
                'street', 'city', 'state', 'zip', 'dob', 'trans_num', 'unix_time']

# Rebind instead of mutating in place, and tolerate columns that are already
# gone, so re-running this cell in a live kernel does not raise KeyError.
train_df = train_df.drop(columns=cols_to_drop, errors="ignore")
test_df = test_df.drop(columns=cols_to_drop, errors="ignore")


In [6]:
#Encode categorical columns
from sklearn.preprocessing import LabelEncoder

label_cols = ['merchant', 'category', 'gender', 'job']

# Keep every fitted encoder, keyed by column name. Without this the encoder is
# overwritten on each iteration and the category -> integer mapping is lost,
# so codes cannot be inverse-transformed or re-applied to new data.
label_encoders = {}

for col in label_cols:
    # Cast each split to string once so both share a single dtype.
    train_values = train_df[col].astype(str)
    test_values = test_df[col].astype(str)

    le = LabelEncoder()
    # Fit on the union of both splits: LabelEncoder.transform raises on labels
    # it has not seen, and some values may occur in only one split.
    le.fit(pd.concat([train_values, test_values], axis=0))

    train_df[col] = le.transform(train_values)
    test_df[col] = le.transform(test_values)

    label_encoders[col] = le


In [7]:
#Scale Numerical Columns
from sklearn.preprocessing import StandardScaler

scale_cols = ['amt', 'lat', 'long', 'city_pop', 'merch_lat', 'merch_long']

# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both splits.
scaler = StandardScaler().fit(train_df[scale_cols])

train_df[scale_cols] = scaler.transform(train_df[scale_cols])
test_df[scale_cols] = scaler.transform(test_df[scale_cols])


In [8]:
# Split each frame into the feature matrix (X) and the label vector (y)
target_col = 'is_fraud'

X_train, y_train = train_df.drop(columns=target_col), train_df[target_col]
X_test, y_test = test_df.drop(columns=target_col), test_df[target_col]


In [13]:
#Decision Tree
from sklearn.tree import DecisionTreeClassifier

# class_weight='balanced' weights classes inversely to their frequency, which
# matters here because fraud rows are a tiny fraction of the training data.
tree_model = DecisionTreeClassifier(class_weight='balanced', random_state=42)

# Train on X_train: it already holds the label-encoded and standard-scaled
# features. No `X_train_scaled` is defined anywhere in this notebook, so the
# previous reference failed with NameError on a fresh kernel.
tree_model.fit(X_train, y_train)


In [14]:
#Predict and Evaluate
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Score X_test, the held-out frame prepared the same way as X_train. No
# `X_test_scaled` is defined anywhere in this notebook, so the previous
# reference failed with NameError on a fresh kernel.
y_pred = tree_model.predict(X_test)

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
# Accuracy is dominated by the non-fraud majority class; read it together with
# the per-class precision/recall rows above.
print("Accuracy:", accuracy_score(y_test, y_pred))


Confusion Matrix:
 [[552592    982]
 [   961   1184]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.55      0.55      0.55      2145

    accuracy                           1.00    555719
   macro avg       0.77      0.78      0.77    555719
weighted avg       1.00      1.00      1.00    555719

Accuracy: 0.9965036286324563


In [15]:
import joblib

# Persist the fitted decision tree to disk. The dump call stays the last
# expression so the list of written file names is shown as the cell output.
model_path = "fraud_detection_model.pkl"
joblib.dump(tree_model, model_path)


['fraud_detection_model.pkl']