In [17]:
import os
import sys
# Add the scripts folder to the Python path
sys.path.append(os.path.abspath("../src"))

In [57]:
# Import functions from the model_training script
from model_training import preprocess_target_column, split_data, train_logistic_regression, train_random_forest, evaluate_model
from feature_selection import select_relevant_features
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix



In [19]:
#Load the processed data
processed_data_path = "D:/Kifya_training/Week 6/Bati-Bank-Credit-Scoring-Model/data/processed/processed_data.csv"
df = pd.read_csv(processed_data_path)

In [38]:
# Define the target column
target_column = "FraudResult"

In [39]:
# Select relevant features based on correlation
relevant_features = select_relevant_features(df, target_column=target_column, correlation_threshold=0.05)
print("Selected Relevant Features:", relevant_features)

Selected Relevant Features: ['Amount', 'Value', 'TotalTransactionAmount', 'AverageTransactionAmount']


In [40]:
# Filter the dataset to include only relevant features and the target column
df = df[relevant_features + [target_column]]

In [41]:
# Split the data
X_train, X_test, y_train, y_test = split_data(df, target_column)

In [42]:
# Train Logistic Regression
log_model = train_logistic_regression(X_train, y_train)
print("Logistic Regression Model Trained.")


Logistic Regression Model Trained.


In [43]:
# Train Random Forest
rf_model = train_random_forest(X_train, y_train)
print("Random Forest Model Trained.")

Random Forest Model Trained.


In [44]:
# Evaluate Logistic Regression
log_metrics = evaluate_model(log_model, X_test, y_test)
print("Logistic Regression Metrics:")
for metric, value in log_metrics.items():
    print(f"{metric}: {value:.4f}")


Logistic Regression Metrics:
accuracy: 1.0000
precision: 1.0000
recall: 1.0000
f1_score: 1.0000
roc_auc: 1.0000


In [45]:
# Evaluate Random Forest
rf_metrics = evaluate_model(rf_model, X_test, y_test)
print("\nRandom Forest Metrics:")
for metric, value in rf_metrics.items():
    print(f"{metric}: {value:.4f}")


Random Forest Metrics:
accuracy: 1.0000
precision: 1.0000
recall: 1.0000
f1_score: 1.0000
roc_auc: 1.0000


In [46]:
# Verify the correlation of selected features with the target
for feature in relevant_features:
    correlation = df[[feature, target_column]].corr().iloc[0, 1]
    print(f"Feature: {feature}, Correlation with Target: {correlation:.4f}")


Feature: Amount, Correlation with Target: 0.5574
Feature: Value, Correlation with Target: 0.5667
Feature: TotalTransactionAmount, Correlation with Target: 0.0635
Feature: AverageTransactionAmount, Correlation with Target: 0.3390


In [47]:
# Check for overlap in feature values
for feature in relevant_features:
    overlap = set(X_train[feature]).intersection(set(X_test[feature]))
    print(f"Feature: {feature}, Overlap: {len(overlap)} values")


Feature: Amount, Overlap: 475 values
Feature: Value, Overlap: 463 values
Feature: TotalTransactionAmount, Overlap: 1643 values
Feature: AverageTransactionAmount, Overlap: 1797 values


In [48]:

# Compute the confusion matrix for Logistic Regression
log_y_pred = log_model.predict(X_test)
print("Confusion Matrix for Logistic Regression:")
print(confusion_matrix(y_test, log_y_pred))

# Compute the confusion matrix for Random Forest
rf_y_pred = rf_model.predict(X_test)
print("\nConfusion Matrix for Random Forest:")
print(confusion_matrix(y_test, rf_y_pred))


Confusion Matrix for Logistic Regression:
[[19097     0]
 [    0    36]]

Confusion Matrix for Random Forest:
[[19097     0]
 [    0    36]]


In [49]:
# Remove highly correlated features
filtered_features = ["TotalTransactionAmount", "AverageTransactionAmount"]  # Retain less correlated features
X_train = X_train[filtered_features]
X_test = X_test[filtered_features]


In [50]:
# Re-train Logistic Regression
log_model = train_logistic_regression(X_train, y_train)
log_metrics = evaluate_model(log_model, X_test, y_test)
print("Logistic Regression Metrics after feature refinement:", log_metrics)

# Re-train Random Forest
rf_model = train_random_forest(X_train, y_train)
rf_metrics = evaluate_model(rf_model, X_test, y_test)
print("Random Forest Metrics after feature refinement:", rf_metrics)


Logistic Regression Metrics after feature refinement: {'accuracy': 0.9983274969947211, 'precision': 1.0, 'recall': 0.1111111111111111, 'f1_score': 0.2, 'roc_auc': np.float64(0.9938959289708098)}
Random Forest Metrics after feature refinement: {'accuracy': 0.9981706998379762, 'precision': 0.5333333333333333, 'recall': 0.2222222222222222, 'f1_score': 0.3137254901960784, 'roc_auc': np.float64(0.9684512692511331)}


In [None]:
# Apply SMOTE to balance the training dataset
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

print("After SMOTE - Class Distribution:")
print(pd.Series(y_train_balanced).value_counts())


After SMOTE - Class Distribution:
FraudResult
0    76372
1    76372
Name: count, dtype: int64


In [None]:


# Define parameter grid
param_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
}

# Perform grid search
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42, class_weight="balanced"),
    param_grid,
    scoring="f1",
    cv=3,
    verbose=2,
    n_jobs=-1,
)
grid_search.fit(X_train_balanced, y_train_balanced)

# Best parameters
print("Best Parameters:", grid_search.best_params_)

# Evaluate the best model
best_rf_model = grid_search.best_estimator_
rf_metrics = evaluate_model(best_rf_model, X_test, y_test)
print("Tuned Random Forest Metrics:", rf_metrics)


Fitting 3 folds for each of 81 candidates, totalling 243 fits
Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Tuned Random Forest Metrics: {'accuracy': 0.9881356818063033, 'precision': 0.12549019607843137, 'recall': 0.8888888888888888, 'f1_score': 0.21993127147766323, 'roc_auc': np.float64(0.9685530886177585)}
