In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

In [11]:
import os
from pathlib import Path

# Location of the modelling dataset. The original local path stays the
# default; set the MODEL_DATA_PATH environment variable to point elsewhere
# without editing the notebook.
DATA_PATH = Path(os.environ.get(
    'MODEL_DATA_PATH',
    'C:/Users/Abdilfatah/Desktop/Data/week6/model-data.csv',
))
if not DATA_PATH.is_file():
    raise FileNotFoundError(
        f"Dataset not found at {DATA_PATH}; set MODEL_DATA_PATH to the CSV location."
    )

df = pd.read_csv(DATA_PATH)

In [12]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CountryCode,ProviderId,ProductId,ChannelId,Amount,...,Recency,Frequency,Monetary,Stability,Recency_WoE,Frequency_WoE,Monetary_WoE,Stability_WoE,predicted_prob,predicted_label
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,256,ProviderId_6,ProductId_10,ChannelId_3,-0.046371,...,90,119,-5.591789,0.024673,-1.886379,13.48938,12.390767,12.390767,0.004493,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,256,ProviderId_4,ProductId_6,ChannelId_2,-0.054643,...,90,119,-5.591789,0.024673,-1.886379,13.48938,12.390767,12.390767,0.004493,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,256,ProviderId_6,ProductId_1,ChannelId_3,-0.050426,...,90,2,-0.100852,0.0,-1.886379,-1.608747,11.195163,-1.57498,0.000258,0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,256,ProviderId_1,ProductId_21,ChannelId_3,0.107717,...,90,38,-0.215326,0.139241,-1.886379,13.04099,11.24923,11.24923,0.00731,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,256,ProviderId_4,ProductId_6,ChannelId_2,-0.059704,...,90,38,-0.215326,0.139241,-1.886379,13.04099,11.24923,11.24923,0.00731,0


In [13]:
# Convert every column to a numeric dtype. Values that cannot be parsed
# (for example the string identifier columns) become NaN instead of raising.
df = df.apply(lambda column: pd.to_numeric(column, errors='coerce'))

In [18]:
from sklearn.model_selection import train_test_split

rfms_features = ['Recency', 'Frequency', 'Monetary', 'Stability']

# Features (X): the `<name>_WoE` column of each RFMS feature.
# Target (y): the `FraudResult` column.
X = df[[f'{feature}_WoE' for feature in rfms_features]]
y = df['FraudResult']

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# ratio the same in both splits; fraud labels are typically heavily
# imbalanced, and an unstratified split can leave the test set with too few
# positives for precision/recall/ROC-AUC to be meaningful.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [19]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

# Baseline classifiers with default hyperparameters. `fit` returns the
# estimator itself, so construction and training are chained.
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
logreg_model = LogisticRegression(random_state=42).fit(X_train, y_train)

# Leave the fitted logistic regression as the cell's displayed value.
logreg_model

In [None]:
from sklearn.model_selection import GridSearchCV
# RandomForestClassifier is used below and must be imported here; without it
# the Random Forest search raises NameError.
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier


def _tune(estimator, param_grid):
    """Grid-search `param_grid` for `estimator` on the training split.

    Uses 5-fold cross-validation scored by ROC-AUC and returns the fitted
    GridSearchCV object.
    """
    search = GridSearchCV(estimator, param_grid, cv=5, scoring='roc_auc')
    search.fit(X_train, y_train)
    return search


# Logistic Regression
logreg_params = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2']
}
# The default 'lbfgs' solver does not support the L1 penalty, so every 'l1'
# candidate would fail to fit and score NaN. 'liblinear' supports both
# penalties in this grid.
logreg_grid = _tune(LogisticRegression(solver='liblinear', random_state=42), logreg_params)

# Decision Tree
dt_params = {
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5, 10]
}
dt_grid = _tune(DecisionTreeClassifier(random_state=42), dt_params)

# Random Forest
rf_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5]
}
rf_grid = _tune(RandomForestClassifier(random_state=42), rf_params)

# Gradient Boosting Machine (GBM)
gbm_params = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.5],
    'max_depth': [3, 5, 7]
}
gbm_grid = _tune(GradientBoostingClassifier(random_state=42), gbm_params)

# Best parameters for each model
print("Logistic Regression - Best Parameters:", logreg_grid.best_params_)
print("Decision Tree - Best Parameters:", dt_grid.best_params_)
print("Random Forest - Best Parameters:", rf_grid.best_params_)
print("Gradient Boosting Machine - Best Parameters:", gbm_grid.best_params_)

# Use best models for prediction (each is refit on the full training split
# by GridSearchCV with its best parameters).
logreg_model = logreg_grid.best_estimator_
dt_model = dt_grid.best_estimator_
rf_model = rf_grid.best_estimator_
gbm_model = gbm_grid.best_estimator_

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

models = {
    'Logistic Regression': logreg_model,
    'Decision Tree': dt_model,
    'Random Forest': rf_model,
    'Gradient Boosting Machine': gbm_model
}

# Metrics computed from hard class predictions, paired with their printed label.
label_metrics = (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
)

# Score every model on the held-out test split.
for model_name, model in models.items():
    y_pred = model.predict(X_test)
    # Probability of the positive class (second column), needed for ROC-AUC.
    y_prob = model.predict_proba(X_test)[:, 1]

    print(f"Model: {model_name}")
    for caption, metric in label_metrics:
        print(caption, metric(y_test, y_pred))
    print("ROC-AUC Score:", roc_auc_score(y_test, y_prob))
    print("\n")