In [40]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

In [41]:
# Load data
df = pd.read_csv('fraud test.csv')
print("Sample of raw dataset:")
print(df.head())
df.info()

Sample of raw dataset:
   Unnamed: 0 trans_date_trans_time        cc_num  \
0           0      21/06/2020 12:14  2.291160e+15   
1           1      21/06/2020 12:14  3.573030e+15   
2           2      21/06/2020 12:14  3.598220e+15   
3           3      21/06/2020 12:15  3.591920e+15   
4           4      21/06/2020 12:15  3.526830e+15   

                               merchant        category    amt   first  \
0                 fraud_Kirlin and Sons   personal_care   2.86    Jeff   
1                  fraud_Sporer-Keebler   personal_care  29.84  Joanne   
2  fraud_Swaniawski, Nitzsche and Welch  health_fitness  41.28  Ashley   
3                     fraud_Haley Group        misc_pos  60.05   Brian   
4                 fraud_Johnston-Casper          travel   3.19  Nathan   

       last gender                       street  ...      lat      long  \
0   Elliott      M            351 Darlene Green  ...  33.9659  -80.9355   
1  Williams      F             3638 Marsh Union  ...  40.3207 -

DATA PREPROCESSING

In [42]:
# Using only 10,000 records for faster processing
df = df.sample(n=10000, random_state=42)

# Dropping columns
df.drop(['Unnamed: 0', 'first', 'last', 'street', 'city', 'zip', 'trans_num'], axis=1, inplace=True)

In [43]:
# Convert date columns to datetime format
df['dob'] = pd.to_datetime(df['dob'], dayfirst=True)
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'], format='%d/%m/%Y %H:%M')
# Calculate age
df['age'] = df['trans_date_trans_time'].dt.year - df['dob'].dt.year
df['age'] -= ((df['trans_date_trans_time'].dt.month < df['dob'].dt.month) |
              ((df['trans_date_trans_time'].dt.month == df['dob'].dt.month) &
               (df['trans_date_trans_time'].dt.day < df['dob'].dt.day))).astype(int)

print("\nSample of age column:")
print(df['age'].head())


Sample of age column:
119106    55
179292    38
540729    66
374360    38
314574    49
Name: age, dtype: int64


In [44]:
# Check for missing values
print("\nChecking for missing values:")
print(df.isnull().sum())

# One-hot encode categorical features
categorical_cols = ['merchant', 'category', 'gender', 'job', 'state']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Normalize numerical columns
scaler = StandardScaler()
numeric_cols = ['amt', 'lat', 'long', 'city_pop', 'age', 'merch_lat', 'merch_long']
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])


Checking for missing values:
trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
gender                   0
state                    0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
unix_time                0
merch_lat                0
merch_long               0
is_fraud                 0
age                      0
dtype: int64


In [45]:
# Feature selection and splitting
X_features = df.drop(columns=['is_fraud', 'dob', 'trans_date_trans_time'])
y_target = df['is_fraud']

X_train, X_test, y_train, y_test = train_test_split(X_features, y_target, test_size=0.2, random_state=42, stratify=y_target)

# Apply SMOTE to balance data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

MODEL TRAINING

In [46]:
from sklearn.ensemble import GradientBoostingClassifier

gb_classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
gb_classifier.fit(X_train_resampled, y_train_resampled)

MODEL EVALUATION

In [47]:
from sklearn.metrics import roc_auc_score

y_predictions = gb_classifier.predict(X_test)
y_prob_scores = gb_classifier.predict_proba(X_test)[:, 1]

print("\nReport:")
print(classification_report(y_test, y_predictions))

print("ROC AUC Score:", roc_auc_score(y_test, y_prob_scores))


Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1994
           1       0.31      0.67      0.42         6

    accuracy                           0.99      2000
   macro avg       0.65      0.83      0.71      2000
weighted avg       1.00      0.99      1.00      2000

ROC AUC Score: 0.761242059511869


TESTING INTERFACE

In [48]:

print("Fraud Prediction Tool")
print("Please provide the following transaction details:\n")

feature_list = X_features.columns.tolist()
manual_input_features = ['amt', 'lat', 'long', 'city_pop', 'age', 'merch_lat', 'merch_long']

user_input = {}
for item in manual_input_features:
    while True:
        try:
            value = float(input(f"Enter {item}: "))
            user_input[item] = value
            break
        except ValueError:
            print("Invalid input. Please enter a numeric value.")

# Create input DataFrame
input_data = {col: 0 for col in feature_list}
for key, val in user_input.items():
    input_data[key] = val

input_df = pd.DataFrame([input_data])

# Scale numeric input
input_df[numeric_cols] = scaler.transform(input_df[numeric_cols])

# Make prediction
prediction_result = gb_classifier.predict(input_df)[0]

print("\nPrediction Result:")
if prediction_result == 1:
    print("This transaction is likely FRAUDULENT.")
else:
    print("This transaction appears LEGITIMATE.")

Fraud Prediction Tool
Please provide the following transaction details:

Enter amt: 120.45
Enter lat: 44
Enter long: 46
Enter city_pop: 4552355
Enter age: 34
Enter merch_lat: 45.65
Enter merch_long: -14.56

Prediction Result:
This transaction appears LEGITIMATE.
