<a href="https://colab.research.google.com/github/amactribouy99/smart_tax_zra/blob/main/Untitled9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
df = pd.read_csv('/content/synthetic_fraud_dataset.csv')
df = pd.read_csv('/content/synthetic_fraud_dataset.csv', parse_dates=['Timestamp'])

df.head()
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 21 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   Transaction_ID                50000 non-null  object        
 1   User_ID                       50000 non-null  object        
 2   Transaction_Amount            50000 non-null  float64       
 3   Transaction_Type              50000 non-null  object        
 4   Timestamp                     50000 non-null  datetime64[ns]
 5   Account_Balance               50000 non-null  float64       
 6   Device_Type                   50000 non-null  object        
 7   Location                      50000 non-null  object        
 8   Merchant_Category             50000 non-null  object        
 9   IP_Address_Flag               50000 non-null  int64         
 10  Previous_Fraudulent_Activity  50000 non-null  int64         
 11  Daily_Transaction_Count     

Unnamed: 0,Transaction_Amount,Timestamp,Account_Balance,IP_Address_Flag,Previous_Fraudulent_Activity,Daily_Transaction_Count,Avg_Transaction_Amount_7d,Failed_Transaction_Count_7d,Card_Age,Transaction_Distance,Risk_Score,Is_Weekend,Fraud_Label
count,50000.0,50000,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,99.411012,2023-07-02 12:47:11.063999744,50294.065981,0.0502,0.0984,7.48524,255.271924,2.00354,119.99994,2499.164155,0.501556,0.29964,0.32134
min,0.0,2023-01-01 00:01:00,500.48,0.0,0.0,1.0,10.0,0.0,1.0,0.25,0.0001,0.0,0.0
25%,28.6775,2023-04-02 03:35:30,25355.995,0.0,0.0,4.0,132.0875,1.0,60.0,1256.4975,0.254,0.0,0.0
50%,69.66,2023-07-02 14:00:00,50384.43,0.0,0.0,7.0,256.085,2.0,120.0,2490.785,0.50225,0.0,0.0
75%,138.8525,2023-10-01 07:13:00,75115.135,0.0,0.0,11.0,378.0325,3.0,180.0,3746.395,0.749525,1.0,1.0
max,1174.14,2023-12-31 23:50:00,99998.31,1.0,1.0,14.0,500.0,4.0,239.0,4999.93,1.0,1.0,1.0
std,98.687292,,28760.458557,0.21836,0.297858,4.039637,141.382279,1.414273,68.985817,1442.013834,0.287774,0.458105,0.466996


In [2]:
df.isnull().sum()

Unnamed: 0,0
Transaction_ID,0
User_ID,0
Transaction_Amount,0
Transaction_Type,0
Timestamp,0
Account_Balance,0
Device_Type,0
Location,0
Merchant_Category,0
IP_Address_Flag,0


In [3]:
df.duplicated().sum()

np.int64(0)

In [7]:
# Feature Engineering
# ------------------------------

# Extract time-based features
df['hour'] = df['Timestamp'].dt.hour
df['day_of_week'] = df['Timestamp'].dt.dayofweek
df['month'] = df['Timestamp'].dt.month

In [8]:
# Log-transform highly skewed numeric features
df['log_transaction_amount'] = np.log1p(df['Transaction_Amount'])
df['log_transaction_distance'] = np.log1p(df['Transaction_Distance'])
df['log_account_balance'] = np.log1p(df['Account_Balance'])

In [9]:
# Target variable
y = df['Fraud_Label']

In [10]:
# Features: drop identifiers & raw Timestamp
X = df.drop(columns=['Transaction_ID', 'User_ID', 'Timestamp', 'Fraud_Label'])

In [11]:
# Separate categorical and numeric

categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

In [13]:
# Preprocessing pipeline
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

In [44]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['Transaction_ID', 'User_ID', 'Fraud_Label']), df['Fraud_Label'], test_size=0.3, random_state=42, stratify=df['Fraud_Label']
)

# Separate categorical and numeric columns (including the newly added engineered features)
# Re-calculate numeric and categorical columns based on X_train after splitting
numeric_cols = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)]
).set_output(transform="pandas")


print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)
print("Categorical columns:", categorical_cols)
print("Numeric columns:", numeric_cols)

Train shape: (35000, 24)
Test shape: (15000, 24)
Categorical columns: ['Transaction_Type', 'Device_Type', 'Location', 'Merchant_Category', 'Card_Type', 'Authentication_Method']
Numeric columns: ['Transaction_Amount', 'Account_Balance', 'IP_Address_Flag', 'Previous_Fraudulent_Activity', 'Daily_Transaction_Count', 'Avg_Transaction_Amount_7d', 'Failed_Transaction_Count_7d', 'Card_Age', 'Transaction_Distance', 'Risk_Score', 'Is_Weekend', 'log_transaction_amount', 'log_transaction_distance', 'log_account_balance']


In [29]:
# Helper function for evaluation
# ------------------------------
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
import pandas as pd

def evaluate_model(name, model, X_train, y_train, X_test, y_test):
    pipeline = Pipeline(steps=[("preprocessor", preprocessor),
                               ("model", model)])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    y_prob = pipeline.predict_proba(X_test)[:,1] if hasattr(pipeline, "predict_proba") else None

    print(f"\n===== {name} =====")
    print("Accuracy :", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall   :", recall_score(y_test, y_pred))
    print("F1-score :", f1_score(y_test, y_pred))
    if y_prob is not None:
        print("ROC-AUC  :", roc_auc_score(y_test, y_prob))
    print("\nClassification Report:\n", classification_report(y_test, y_pred))

In [19]:
# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

log_reg = LogisticRegression(max_iter=500, class_weight="balanced", solver="liblinear")
rf = RandomForestClassifier(n_estimators=200, random_state=42, class_weight="balanced")
xgb_clf = xgb.XGBClassifier(
    n_estimators=300,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    scale_pos_weight=1  # adjust if fraud is imbalanced
)

In [21]:
# Train & Evaluate
evaluate_model("Logistic Regression", log_reg, X_train, y_train, X_test, y_test)
evaluate_model("Random Forest", rf, X_train, y_train, X_test, y_test)
evaluate_model("XGBoost", xgb_clf, X_train, y_train, X_test, y_test)


===== Logistic Regression =====
Accuracy : 0.7960666666666667
Precision: 0.6467255457423763
Recall   : 0.8051867219917013
F1-score : 0.7173089363275114
ROC-AUC  : 0.894196435122158

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.79      0.84     10180
           1       0.65      0.81      0.72      4820

    accuracy                           0.80     15000
   macro avg       0.77      0.80      0.78     15000
weighted avg       0.82      0.80      0.80     15000


===== Random Forest =====
Accuracy : 1.0
Precision: 1.0
Recall   : 1.0
F1-score : 1.0
ROC-AUC  : 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     10180
           1       1.00      1.00      1.00      4820

    accuracy                           1.00     15000
   macro avg       1.00      1.00      1.00     15000
weighted avg       1.00      1.00      1.00     15000


===== XGBoo

In [25]:
# Random Forest Hyperparameter Tuning
# ------------------------------
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV

rf = RandomForestClassifier(random_state=42, class_weight="balanced")

rf_param_dist = {
    "model__n_estimators": randint(100, 500),
    "model__max_depth": randint(3, 20),
    "model__min_samples_split": randint(2, 20),
    "model__min_samples_leaf": randint(1, 10),
    "model__max_features": ["sqrt", "log2", None]
}
rf_pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", rf)])

rf_search = RandomizedSearchCV(
    rf_pipeline,
    param_distributions=rf_param_dist,
    n_iter=20,
    cv=3,
    scoring="roc_auc",
    verbose=2,
    random_state=42,
    n_jobs=-1
)

rf_search.fit(X_train, y_train)
print("Best RF Params:", rf_search.best_params_)
rf_best = rf_search.best_estimator_

Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best RF Params: {'model__max_depth': 9, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 8, 'model__min_samples_split': 8, 'model__n_estimators': 221}


In [32]:
# XGBoost Hyperparameter Tuning
# ------------------------------
from scipy.stats import randint, uniform
from sklearn.model_selection import RandomizedSearchCV

xgb_clf = xgb.XGBClassifier(
    objective="binary:logistic",
    eval_metric="auc",
    random_state=42,
    use_label_encoder=False
)

xgb_param_dist = {
    "model__n_estimators": randint(200, 600),
    "model__max_depth": randint(3, 12),
    "model__learning_rate": uniform(0.01, 0.3),
    "model__subsample": uniform(0.6, 0.4),
    "model__colsample_bytree": uniform(0.6, 0.4),
    "model__gamma": uniform(0, 5),
    "model__min_child_weight": randint(1, 10)
}

xgb_pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", xgb_clf)])

xgb_search = RandomizedSearchCV(
    xgb_pipeline,
    param_distributions=xgb_param_dist,
    n_iter=25,
    cv=3,
    scoring="roc_auc",
    verbose=2,
    random_state=42,
    n_jobs=-1
)

xgb_search.fit(X_train, y_train)
print("Best XGB Params:", xgb_search.best_params_)
xgb_best = xgb_search.best_estimator_

Fitting 3 folds for each of 25 candidates, totalling 75 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best XGB Params: {'model__colsample_bytree': np.float64(0.7727780074568463), 'model__gamma': np.float64(1.4561457009902097), 'model__learning_rate': np.float64(0.19355586841671385), 'model__max_depth': 5, 'model__min_child_weight': 7, 'model__n_estimators': 443, 'model__subsample': np.float64(0.836965827544817)}
