In [1]:
import numpy as np
import pandas as pd 
import joblib


from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
df=pd.read_csv("C:/Users/aryab/fraud-detection-project/ml-service/data/transactions.csv")
df["TX_DATETIME"]=pd.to_datetime(df["TX_DATETIME"])
df["hour_of_day"]=df["TX_DATETIME"].dt.hour
df["day_of_week"]=df["TX_DATETIME"].dt.dayofweek


def haversine_distance(lat1, lon1, lat2, lon2):
    R = 6371  # Radius of the Earth in kilometers

    # 1. Convert degrees to radians
    phi1, phi2 = np.radians(lat1), np.radians(lat2)
    dphi = np.radians(lat2 - lat1)
    dlambda = np.radians(lon2 - lon1)

    # 2. Apply the Haversine formula
    a = np.sin(dphi / 2)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(dlambda / 2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    # 3. Return distance in km
    return R * c

# Calculate distance between home location and transaction location
df['dist_from_home'] = haversine_distance(
    df['home_lat'], 
    df['home_lon'], 
    df['latitude'], 
    df['longitude']
)

# Check the results (especially the high distance ones!)

df=df.sort_values(["CUSTOMER_ID","TX_DATETIME"])

df["TIME_DIFF"]=df.groupby("CUSTOMER_ID")["TX_DATETIME"].diff()

df["TIME_SINCE_LAST_SEC"]=df["TIME_DIFF"].dt.total_seconds()
df["TIME_DIFF"]=df["TIME_DIFF"].fillna(0)
df["TIME_SINCE_LAST_SEC"]=df["TIME_SINCE_LAST_SEC"].fillna(0)
df["prev_lat"]=df.groupby("CUSTOMER_ID")["latitude"].shift(1)
df["prev_lon"]=df.groupby("CUSTOMER_ID")["longitude"].shift(1)
df["dist_from_last"]=haversine_distance(
    df['latitude'],
    df['longitude'],
    df['prev_lat'],
    df['prev_lon']
)
df["dist_from_last"]=df["dist_from_last"].fillna(0)
df["prev_lat"]=df["prev_lat"].fillna(0)
df["prev_lon"]=df["prev_lon"].fillna(0)

df["speed_kmh"]=df["dist_from_last"]/((df["TIME_SINCE_LAST_SEC"]+1)/3600)

X=df[["TX_AMOUNT","speed_kmh","hour_of_day","day_of_week","dist_from_home","TIME_SINCE_LAST_SEC"]]
y=df["TX_FRAUD"]

X_train,X_test,y_train,y_test=train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

smote=SMOTE(random_state=42)
X_resample,y_resample=smote.fit_resample(X_train,y_train)
model=XGBClassifier(random_state=42)
model.fit(X_resample,y_resample)
y_pred=model.predict(X_test)

cm=confusion_matrix(y_test,y_pred)
print(cm)
cr=classification_report(y_test,y_pred)
print(cr)
ra=roc_auc_score(y_test,y_pred)
print(ra)

joblib.dump(model,"../XGmodel.pkl")











[[301231   2406]
 [  1367  45827]]
              precision    recall  f1-score   support

           0       1.00      0.99      0.99    303637
           1       0.95      0.97      0.96     47194

    accuracy                           0.99    350831
   macro avg       0.97      0.98      0.98    350831
weighted avg       0.99      0.99      0.99    350831

0.9815552590217353


['../XGmodel.pkl']