In [36]:
# Install if not already available
!pip install -q pandas scikit-learn matplotlib seaborn


In [37]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


# **Load DATASET**

In [38]:
df = pd.read_csv('fraudTrain.csv', encoding='latin1', on_bad_lines='skip')
df.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495.0,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376000.0,36.011293,-82.048315,0.0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149.0,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376000.0,49.159047,-118.186462,0.0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154.0,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376000.0,43.150704,-112.154481,0.0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939.0,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376000.0,47.034331,-112.561071,0.0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99.0,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376000.0,38.674999,-78.632459,0.0


In [39]:
# Basic info
print(df.shape)
print(df.columns)
df.head()
df.info()


(124361, 23)
Index(['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
       'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
       'merch_lat', 'merch_long', 'is_fraud'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 124361 entries, 0 to 124360
Data columns (total 23 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Unnamed: 0             124361 non-null  int64  
 1   trans_date_trans_time  124361 non-null  object 
 2   cc_num                 124361 non-null  int64  
 3   merchant               124361 non-null  object 
 4   category               124361 non-null  object 
 5   amt                    124361 non-null  float64
 6   first                  124361 non-null  object 
 7   last                   124361 non-null  object 
 8   gender                 124361 non-null  obj

# **Data Preprocessing**

In [40]:
# Check structure
df.info()

# Check missing values
print(df.isnull().sum())

# Check target variable
print(df['is_fraud'].value_counts())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 124361 entries, 0 to 124360
Data columns (total 23 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Unnamed: 0             124361 non-null  int64  
 1   trans_date_trans_time  124361 non-null  object 
 2   cc_num                 124361 non-null  int64  
 3   merchant               124361 non-null  object 
 4   category               124361 non-null  object 
 5   amt                    124361 non-null  float64
 6   first                  124361 non-null  object 
 7   last                   124361 non-null  object 
 8   gender                 124361 non-null  object 
 9   street                 124361 non-null  object 
 10  city                   124360 non-null  object 
 11  state                  124360 non-null  object 
 12  zip                    124360 non-null  float64
 13  lat                    124360 non-null  float64
 14  long                   124360 non-nu

# **Drop rows with missing**

In [41]:
df = df.dropna(subset=['is_fraud'])  # Remove rows where target is missing


# **3.3 Drop unnecessary columns
**

In [42]:
df = df.drop(['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'first', 'last',
              'street', 'city', 'state', 'job', 'dob', 'trans_num'], axis=1)


# **Encode categorical features**

In [43]:
le = LabelEncoder()

df['gender'] = le.fit_transform(df['gender'])       # M = 1, F = 0
df['category'] = le.fit_transform(df['category'])   # Encode category
df['merchant'] = le.fit_transform(df['merchant'])   # Encode merchant


# **Prepare Data for Training**

In [44]:
X = df.drop('is_fraud', axis=1)
y = df['is_fraud']

# Normalize numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.3, random_state=42, stratify=y)


# **Logistic Regression**

In [45]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

print("📘 Logistic Regression:")
print(confusion_matrix(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))
print("Accuracy:", accuracy_score(y_test, y_pred_lr))


📘 Logistic Regression:
[[36926    34]
 [  348     0]]
              precision    recall  f1-score   support

         0.0       0.99      1.00      0.99     36960
         1.0       0.00      0.00      0.00       348

    accuracy                           0.99     37308
   macro avg       0.50      0.50      0.50     37308
weighted avg       0.98      0.99      0.99     37308

Accuracy: 0.9897609091883778


# **Decision Tree Classifier**

In [46]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)

print("🌳 Decision Tree:")
print(confusion_matrix(y_test, y_pred_dt))
print(classification_report(y_test, y_pred_dt))
print("Accuracy:", accuracy_score(y_test, y_pred_dt))


🌳 Decision Tree:
[[36816   144]
 [   92   256]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     36960
         1.0       0.64      0.74      0.68       348

    accuracy                           0.99     37308
   macro avg       0.82      0.87      0.84     37308
weighted avg       0.99      0.99      0.99     37308

Accuracy: 0.9936742789750188


# **Random Forest Classifier**

In [47]:
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

print("🌲 Random Forest:")
print(confusion_matrix(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))
print("Accuracy:", accuracy_score(y_test, y_pred_rf))


🌲 Random Forest:
[[36912    48]
 [  129   219]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     36960
         1.0       0.82      0.63      0.71       348

    accuracy                           1.00     37308
   macro avg       0.91      0.81      0.85     37308
weighted avg       0.99      1.00      0.99     37308

Accuracy: 0.995255709231264


# **Compare Model Accuracies**

In [48]:
print("📊 Model Accuracy Comparison:")
print("Logistic Regression:", accuracy_score(y_test, y_pred_lr))
print("Decision Tree:", accuracy_score(y_test, y_pred_dt))
print("Random Forest:", accuracy_score(y_test, y_pred_rf))


📊 Model Accuracy Comparison:
Logistic Regression: 0.9897609091883778
Decision Tree: 0.9936742789750188
Random Forest: 0.995255709231264
