In [1]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.impute import SimpleImputer
from datetime import datetime
import ipaddress
import joblib

In [2]:
# stop warnings
import warnings 
warnings.filterwarnings('ignore')

In [3]:
# # Get the full path to the scripts directory
# sys.path.append(os.path.abspath(os.path.abspath('../scripts')))

# from Model_training import load_data, preprocess_data, split_data,train_model, evaluate_model, save_model

In [3]:
data = pd.read_csv('../data/fraud_data.csv')

In [None]:
# # Load the datasets
# data = load_data('../data/Fraud_Data.csv')

In [5]:
data.head()

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,1
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151112 entries, 0 to 151111
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   user_id         151112 non-null  int64  
 1   signup_time     151112 non-null  object 
 2   purchase_time   151112 non-null  object 
 3   purchase_value  151112 non-null  int64  
 4   device_id       151112 non-null  object 
 5   source          151112 non-null  object 
 6   browser         151112 non-null  object 
 7   sex             151112 non-null  object 
 8   age             151112 non-null  int64  
 9   ip_address      151112 non-null  float64
 10  class           151112 non-null  int64  
dtypes: float64(1), int64(4), object(6)
memory usage: 12.7+ MB


In [7]:
# Check the first few rows of the dataset
data.describe()

Unnamed: 0,user_id,purchase_value,age,ip_address,class
count,151112.0,151112.0,151112.0,151112.0,151112.0
mean,200171.04097,36.935372,33.140704,2152145000.0,0.093646
std,115369.285024,18.322762,8.617733,1248497000.0,0.291336
min,2.0,9.0,18.0,52093.5,0.0
25%,100642.5,22.0,27.0,1085934000.0,0.0
50%,199958.0,35.0,33.0,2154770000.0,0.0
75%,300054.0,49.0,39.0,3243258000.0,0.0
max,400000.0,154.0,76.0,4294850000.0,1.0


In [8]:
data.isnull().sum()

user_id           0
signup_time       0
purchase_time     0
purchase_value    0
device_id         0
source            0
browser           0
sex               0
age               0
ip_address        0
class             0
dtype: int64

In [9]:
data['signup_time'] = pd.to_datetime(data['signup_time'])
data['purchase_time'] = pd.to_datetime(data['purchase_time'])
data['time_to_purchase'] = (data['purchase_time'] - data['signup_time']).dt.total_seconds()
data.drop(['signup_time', 'purchase_time'], axis=1, inplace=True)

In [10]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
categorical_columns = ['device_id', 'source', 'browser', 'sex']
for col in categorical_columns:
    data[col] = label_encoder.fit_transform(data[col])


In [11]:
X = data.drop('class', axis=1)
y = data['class']


In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [16]:
# Initialize the scaler
scaler = StandardScaler()

# Fit and transform the training data; only fit the scaler on training data to avoid data leakage
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

numerical_columns = ['purchase_value', 'age', 'time_to_purchase']  # List of numerical columns to scale
X_train_scaled[numerical_columns] = scaler.fit_transform(X_train[numerical_columns])
X_test_scaled[numerical_columns] = scaler.transform(X_test[numerical_columns])


In [17]:
# Initialize the Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train_scaled, y_train)


In [18]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Print classification report and confusion matrix
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.91
Classification Report:
               precision    recall  f1-score   support

           0       0.91      1.00      0.95     41117
           1       0.00      0.00      0.00      4217

    accuracy                           0.91     45334
   macro avg       0.45      0.50      0.48     45334
weighted avg       0.82      0.91      0.86     45334

Confusion Matrix:
 [[41117     0]
 [ 4217     0]]


In [19]:
importances = rf_model.feature_importances_
feature_names = X.columns
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances}).sort_values(by='Importance', ascending=False)
print(importance_df)


            Feature  Importance
8  time_to_purchase    0.596430
2         device_id    0.090138
7        ip_address    0.089536
0           user_id    0.088680
1    purchase_value    0.056315
6               age    0.046673
4           browser    0.016865
5               sex    0.008365
3            source    0.006998


In [20]:
import os

# Create the models directory if it doesn't exist
os.makedirs("models", exist_ok=True)

# Save the Random Forest model
joblib.dump(rf_model, 'models/random_forest_model.pkl')

# Save the scaler
joblib.dump(scaler, 'models/scaler.pkl')



['models/scaler.pkl']