#### Machine Learning Model Building

In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from imblearn.metrics import geometric_mean_score 

# Read the final enriched dataset; the bare file name is resolved against the
# notebook's current working directory.
DATA_FILE = 'safaricom_ethiopia_dfs_enriched_data_V10_final.csv'
df = pd.read_csv(DATA_FILE)

In [17]:
# Column that holds the label the classifier is trained to predict
TARGET = 'Is_Fraud'

# Model inputs: assurance, regional, and engineered features
FEATURES = [
    # numeric amounts and the cross-region flag
    'Amount_ETB',
    'System_Fee_ETB',
    'Is_Cross_Region',
    # categorical columns (one-hot encoded in a later cell)
    'Sender_Region',
    'Receiver_Region',
    'Transaction_Type',
    'Billing_System_Status',
    # engineered count / flag columns
    'Txn_Count_Sender_1H',
    'Agent_Cust_Pair_Count_7D',
    'Is_Night_Time_Txn',
]

# The feature matrix is an explicit copy, so later transforms of X never write back into df
y = df[TARGET]
X = df.loc[:, FEATURES].copy()

In [18]:
# 2. One-Hot Encoding for Categorical Variables
# Encode from df[FEATURES] instead of from X itself. The previous form,
# `X = pd.get_dummies(X, ...)`, replaced X with a frame that no longer contains the
# raw categorical columns, so running this cell a second time raised a KeyError.
# Reading the source columns on every run keeps the cell idempotent and produces
# the same encoded frame on the first run.
CATEGORICAL_COLS = ['Sender_Region', 'Receiver_Region', 'Transaction_Type', 'Billing_System_Status']

# drop_first=True omits one level per categorical column; that level becomes the
# implicit baseline (all of that column's dummies equal to 0).
X = pd.get_dummies(df[FEATURES], columns=CATEGORICAL_COLS, drop_first=True)

In [19]:
# 3. Split Data into Training and Testing Sets
# 70/30 hold-out split with a fixed seed for reproducibility.
# stratify=y is essential: it keeps the fraud ratio the same in both partitions.
split_options = {
    'test_size': 0.3,
    'random_state': 42,
    'stratify': y,
}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [20]:
# 4. Scaling Numerical Features
# Scale only the continuous numerical features; flag and dummy columns are left as they are.
numerical_cols = ['Amount_ETB', 'System_Fee_ETB', 'Txn_Count_Sender_1H', 'Agent_Cust_Pair_Count_7D']
scaler = StandardScaler()

# Take the unscaled values from X (never modified by this cell) rather than from
# X_train / X_test. This cell overwrites those two frames in place, so fitting on
# them directly meant that a second run of the cell refit the scaler on
# already-standardised data (mean ~0, std ~1). The scaler persisted later would then
# leave new transactions effectively unscaled. Selecting the rows through the split
# indices makes the cell idempotent and gives identical results on the first run.
# NOTE: assumes X has a unique index (true for the default index assigned by read_csv).
train_raw = X.loc[X_train.index, numerical_cols]
test_raw = X.loc[X_test.index, numerical_cols]

# Fit on the training rows only, then apply the same transform to both sets,
# so no test-set statistics leak into the scaler.
X_train[numerical_cols] = scaler.fit_transform(train_raw)
X_test[numerical_cols] = scaler.transform(test_raw)

print("Data Preparation Complete ")
print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

Data Preparation Complete 
Training set shape: (70000, 17)
Test set shape: (30000, 17)


In [21]:
# 1. Initialize and Train Logistic Regression Model
from sklearn.linear_model import LogisticRegression
# Assuming X_train, y_train, X_test, y_test are in memory from the previous step
# from imblearn.metrics import geometric_mean_score 
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
import pandas as pd
import numpy as np
from imblearn.metrics import geometric_mean_score 

# Estimator settings collected in one place.
# class_weight='balanced' is the important one: it automatically gives the
# fraud class a higher weight during fitting.
lr_settings = {
    'solver': 'liblinear',
    'random_state': 42,
    'class_weight': 'balanced',
}
model = LogisticRegression(**lr_settings)

# Fit on the prepared training split, with progress messages around the call
print("\n Model Training Started")
model.fit(X_train, y_train)
print("Model Training Complete")


 Model Training Started
Model Training Complete


In [22]:
# 2. Predict and Evaluate
# Hard class predictions for the held-out set
y_pred = model.predict(X_test)

# Per-class probabilities; column index 1 is taken as the positive (Fraud) class
class_probabilities = model.predict_proba(X_test)
y_proba = class_probabilities[:, 1]

In [23]:
# 3. Performance Metrics (Focusing on the Minority Class)
print("Model Performance Metrics")

# Pin the label order explicitly instead of letting scikit-learn infer it from the
# data. The 4-way unpack below and the target_names order both rely on the matrix
# being exactly [[TN, FP], [FN, TP]] with row/column 0 = Not Fraud and 1 = Fraud.
# Inferred labels would give a different shape (and a ValueError on unpacking) if
# only one class appeared in y_test and y_pred.
# NOTE: assumes the target is encoded as 0/1, as the rest of this notebook does.
CLASS_LABELS = [0, 1]
CLASS_NAMES = ['Not Fraud', 'Fraud']

# A. Confusion Matrix (Directly shows True Positives/Negatives)
cm = confusion_matrix(y_test, y_pred, labels=CLASS_LABELS)
tn, fp, fn, tp = cm.ravel()

print(f"\nConfusion Matrix:\n{cm}")
print(f"True Positives (Fraud Correctly Caught, TP): {tp}")
print(f"False Positives (Good Txns Flagged as Fraud, FP): {fp}")
print(f"False Negatives (Fraud Missed, FN): {fn} <--- CRITICAL")
print(f"True Negatives (Good Txns Correctly Allowed, TN): {tn}")

# B. Classification Report (Precision, Recall, F1-Score)
print("\nClassification Report (Focus on 'Fraud' row):")
# labels and target_names are passed together so each name is tied to its class
print(classification_report(y_test, y_pred, labels=CLASS_LABELS, target_names=CLASS_NAMES))

# C. Summary Metrics
# AUC is computed from the fraud probabilities; G-Mean from the hard predictions
auc_roc = roc_auc_score(y_test, y_proba)
gmean = geometric_mean_score(y_test, y_pred) # G-Mean: Balance of performance across both classes

print(f"AUC-ROC Score (Overall Discriminative Power): {auc_roc:.4f}")
print(f"Geometric Mean (Balance of Class Accuracy): {gmean:.4f}")

Model Performance Metrics

Confusion Matrix:
[[27556  2203]
 [    3   238]]
True Positives (Fraud Correctly Caught, TP): 238
False Positives (Good Txns Flagged as Fraud, FP): 2203
False Negatives (Fraud Missed, FN): 3 <--- CRITICAL
True Negatives (Good Txns Correctly Allowed, TN): 27556

Classification Report (Focus on 'Fraud' row):
              precision    recall  f1-score   support

   Not Fraud       1.00      0.93      0.96     29759
       Fraud       0.10      0.99      0.18       241

    accuracy                           0.93     30000
   macro avg       0.55      0.96      0.57     30000
weighted avg       0.99      0.93      0.96     30000

AUC-ROC Score (Overall Discriminative Power): 0.9805
Geometric Mean (Balance of Class Accuracy): 0.9563


In [24]:
# Label the fitted logistic-regression coefficients with the training column names
coefficients = pd.Series(model.coef_[0], index=X_train.columns)

# Turn each coefficient into an odds ratio, exp(coefficient), largest first.
# An odds ratio of 2.0 means the odds of fraud double when the feature goes up by
# one unit (or, for a dummy/flag column, when it is present). The four columns that
# were standardised before fitting are in standard-deviation units, so for them
# "one unit" is one standard deviation of the training data.
exponentiated = np.exp(coefficients)
odds_ratios = exponentiated.sort_values(ascending=False)

# Report the 15 largest ratios, rounded to three decimals
print(" Actionable Model Interpretation (Odds Ratio)")
print("Feature Odds Ratio (Impact on Fraud Likelihood):\n(Higher ratio means a greater increase in the odds of being fraud)\n")
top_ratios = odds_ratios.head(15)
print(top_ratios.round(3))

 Actionable Model Interpretation (Odds Ratio)
Feature Odds Ratio (Impact on Fraud Likelihood):
(Higher ratio means a greater increase in the odds of being fraud)

Transaction_Type_P2P_Transfer        13249.662
Amount_ETB                               4.216
System_Fee_ETB                           4.149
Billing_System_Status_TIMEOUT            1.515
Transaction_Type_Agent_Cash_Out          1.427
Receiver_Region_Oromia                   1.175
Is_Cross_Region                          1.073
Txn_Count_Sender_1H                      1.056
Is_Night_Time_Txn                        0.973
Agent_Cust_Pair_Count_7D                 0.931
Receiver_Region_South Ethiopia           0.722
Sender_Region_South Ethiopia             0.025
Sender_Region_Oromia                     0.020
Transaction_Type_Airtime_Purchase        0.005
Transaction_Type_Bill_Payment            0.001
dtype: float64


#### Deployment and Monitoring

In [25]:
import joblib

# Define filenames
MODEL_FILENAME = 'logistic_regression_fraud_model.joblib'
SCALER_FILENAME = 'standard_scaler.joblib'  # fixed: extension was misspelled as '.jobjob'

# Probability above which a transaction is routed to manual review
REVIEW_THRESHOLD = 0.5

# Save the trained model
joblib.dump(model, MODEL_FILENAME)
print(f"Model saved as: {MODEL_FILENAME}")

# Save the fitted scaler (essential for processing new data)
joblib.dump(scaler, SCALER_FILENAME)
print(f"Scaler saved as: {SCALER_FILENAME}")

# --- Simulation of Loading and Scoring New Data ---
print("\nSimulation: Scoring a New Transaction")

# Reload both artifacts from disk and score with the reloaded objects, so the
# simulation actually exercises the persisted files rather than the in-memory ones.
# NOTE: joblib.load unpickles its input; only load artifact files you trust.
loaded_model = joblib.load(MODEL_FILENAME)
loaded_scaler = joblib.load(SCALER_FILENAME)

# Example of new data (a hypothetical high-risk P2P transaction in Addis Ababa)
new_data = pd.DataFrame({
    'Amount_ETB': [4800.0],
    'System_Fee_ETB': [96.0],
    'Is_Cross_Region': [0],
    'Sender_Region': ['Addis Ababa'],
    'Receiver_Region': ['Addis Ababa'],
    'Transaction_Type': ['P2P_Transfer'],
    'Billing_System_Status': ['SUCCESS'],
    'Txn_Count_Sender_1H': [5], # High velocity
    'Agent_Cust_Pair_Count_7D': [0],
    'Is_Night_Time_Txn': [1] # Night time
})

# 1. Apply One-Hot Encoding, then
# 2. Re-align columns to match the training data (critical step).
# reindex does both halves of the alignment in one call: training columns missing
# from this row are added as 0, and dummy columns the model was not trained on
# (a baseline level dropped by drop_first=True, or an unseen category) are discarded.
# The result has exactly the training columns, in the training order.
# NOTE: the column list still comes from the in-memory X_train; a real scoring
# service would need that list persisted alongside the model and scaler.
encoded = pd.get_dummies(new_data, columns=['Sender_Region', 'Receiver_Region', 'Transaction_Type', 'Billing_System_Status'])
new_X = encoded.reindex(columns=X_train.columns, fill_value=0)

# 3. Scale the numerical features (same columns the scaler was fitted on)
numerical_cols = ['Amount_ETB', 'System_Fee_ETB', 'Txn_Count_Sender_1H', 'Agent_Cust_Pair_Count_7D']
new_X[numerical_cols] = loaded_scaler.transform(new_X[numerical_cols])

# 4. Predict: probability of the positive (fraud) class for the single row
risk_score = loaded_model.predict_proba(new_X)[:, 1][0]

print("\nHypothetical Transaction Features:")
print(new_data)
print(f"\nPredicted Fraud Risk Score (Probability): {risk_score:.4f}")
if risk_score > REVIEW_THRESHOLD:
    print("Action: **ROUTE TO ANALYST REVIEW QUEUE**")
else:
    print("Action: ALLOW (Low Risk)")

Model saved as: logistic_regression_fraud_model.joblib
Scaler saved as: standard_scaler.jobjob

Simulation: Scoring a New Transaction

Hypothetical Transaction Features:
   Amount_ETB  System_Fee_ETB  Is_Cross_Region Sender_Region Receiver_Region  \
0      4800.0            96.0                0   Addis Ababa     Addis Ababa   

  Transaction_Type Billing_System_Status  Txn_Count_Sender_1H  \
0     P2P_Transfer               SUCCESS                    5   

   Agent_Cust_Pair_Count_7D  Is_Night_Time_Txn  
0                         0                  1  

Predicted Fraud Risk Score (Probability): 0.9992
Action: **ROUTE TO ANALYST REVIEW QUEUE**
