In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import optuna  # used by the Optuna study cells below but previously never imported
import pandas as pd
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.ensemble import (
    AdaBoostClassifier,
    AdaBoostRegressor,
    BaggingClassifier,
    BaggingRegressor,
    RandomForestClassifier,
    RandomForestRegressor,
)
from sklearn.feature_selection import RFECV, mutual_info_classif, mutual_info_regression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    mean_absolute_error,
    mean_squared_error,
    precision_score,
    r2_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    StratifiedKFold,
    cross_val_score,
    train_test_split,
)
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from xgboost import XGBClassifier, XGBRegressor

In [None]:
# Dataset locations: the SMOTE-resampled folds are used for training,
# the transformed (non-SMOTE) data is used for evaluation.
smote_train_data_folder_path = '/Users/joanwong/Desktop/cs3244/finalised_datasets/smote_train_data'
transformed_data_folder_path = '/Users/joanwong/Desktop/cs3244/finalised_datasets/transformed_data'

# Five folds, numbered 1-5; position k in one list corresponds to position k in the other.
smote_files = [f'smote_fold_{k}.csv' for k in range(1, 6)]
non_smote_files = [f'fold_{k}.csv' for k in range(1, 6)]

# Held-out test set with the identifier and date columns removed.
testing_data = pd.read_csv(f'{transformed_data_folder_path}/test_set.csv').drop(
    columns=['client_id', 'creation_date']
)

In [None]:
# Custom cross-validation using RFECV for regression.
# For each outer fold i: train on the SMOTE-resampled versions of every other fold, keep the
# matching non-SMOTE fold aside as evaluation data, and run RFECV (with a decision-tree
# regressor scored by negative RMSE) to pick a feature subset for that fold.
RFECV_STEP = 2  # number of features removed per elimination round

feature_counts_rfecv = []               # number of features kept, one entry per fold
selected_features_rfecv_all_folds = []  # Index of kept feature names, one entry per fold
feature_rankings_rfecv = []             # RFECV ranking (1 = kept) per feature, one Series per fold

for i in range(len(smote_files)):
    # Define training and testing sets
    train_smote_files = [file for j, file in enumerate(smote_files) if j != i]  # All folds except the i-th for training
    test_non_smote_file = non_smote_files[i]  # The corresponding non-smoted fold for testing

    # Load and concatenate the SMOTE training folds
    train_dataframes = [
        pd.read_csv(os.path.join(smote_train_data_folder_path, file))
        for file in train_smote_files
    ]
    train_data = pd.concat(train_dataframes, ignore_index=True)

    # Load testing data. The non-SMOTE folds are read from the transformed-data folder
    # (previously they were looked up in the SMOTE folder).
    # NOTE(review): assumes fold_k.csv live under transformed_data, as the data-loading
    # comment states -- confirm against the dataset layout.
    test_file_path = os.path.join(transformed_data_folder_path, test_non_smote_file)
    test_data = pd.read_csv(test_file_path)

    # Split features and target
    features = train_data.columns.drop('fraud_status')
    X_train = train_data[features]
    y_train = train_data['fraud_status']
    X_test = test_data[features]
    y_test = test_data['fraud_status']

    # Step 1: Feature Selection using RFECV
    print(f"Fold {i+1}: Performing RFECV for feature selection...")
    model = DecisionTreeRegressor(random_state=42)
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    rfecv = RFECV(estimator=model, step=RFECV_STEP, cv=cv, scoring='neg_root_mean_squared_error', n_jobs=-1)
    rfecv.fit(X_train, y_train)

    # Select the optimal features using RFECV
    selected_features_rfecv = X_train.columns[rfecv.support_]
    print(f"Number of features selected by RFECV for Fold {i+1}: {len(selected_features_rfecv)}")
    print("Selected features using RFECV:", selected_features_rfecv)
    feature_counts_rfecv.append(len(selected_features_rfecv))
    selected_features_rfecv_all_folds.append(selected_features_rfecv)
    feature_rankings_rfecv.append(pd.Series(rfecv.ranking_, index=X_train.columns))

    # Plot the number of features vs cross-validation score.
    # `grid_scores_` no longer exists in scikit-learn >= 1.2; the per-step mean score lives in
    # `cv_results_['mean_test_score']`, ordered from the fewest to the most features.
    mean_cv_scores = rfecv.cv_results_['mean_test_score']
    n_features_tested = rfecv.cv_results_.get('n_features')
    if n_features_tested is None:
        # Older scikit-learn does not report the sizes: rebuild them. Elimination starts from
        # all features, drops RFECV_STEP per round and always finishes at 1 feature.
        n_features_tested = sorted(set(range(X_train.shape[1], 0, -RFECV_STEP)) | {1})

    plt.figure(figsize=(8, 5))
    plt.plot(n_features_tested, mean_cv_scores, marker='o')
    plt.xlabel('Number of Features')
    plt.ylabel('Cross-Validation Score (Neg RMSE)')
    plt.title(f'RFECV: Optimal Number of Features for Fold {i + 1}')
    plt.grid(True)
    plt.show()

In [None]:
# Aggregated Feature Importance for RFECV: average every feature's RFECV rank over the
# outer folds and sort ascending, so the most consistently kept features come first.
avg_rankings_rfecv = pd.concat(feature_rankings_rfecv, axis=1).mean(axis=1).sort_values()
print("\nAggregated Feature Importance (RFECV):")
print(avg_rankings_rfecv)

# Determine Optimal Features using Aggregated Feature Importance (RFECV)
# NOTE: X_train / y_train are the values left over from the most recent per-fold loop
# (its last iteration); this cell must run after that loop.
rmse_scores_rfecv = []
threshold = 0.001  # Set a threshold for what counts as "sufficiently close to 0"
last_rmse_score = None
optimal_num_features_rfecv = None

# Grow the feature set one feature at a time (best-ranked first) and stop at the first
# step whose RMSE differs from the previous step by no more than `threshold`.
for i in range(1, len(avg_rankings_rfecv) + 1):
    selected_features = avg_rankings_rfecv.head(i).index
    X_train_selected = X_train[selected_features]

    # Perform cross-validation with the selected features
    model = DecisionTreeRegressor(random_state=42)
    rmse = -cross_val_score(model, X_train_selected, y_train, cv=3, scoring='neg_root_mean_squared_error').mean()
    rmse_scores_rfecv.append(rmse)

    # Check if the difference is sufficiently small
    if last_rmse_score is not None:
        difference = abs(rmse - last_rmse_score)
        if difference <= threshold:
            optimal_num_features_rfecv = i
            break

    last_rmse_score = rmse

# Plot the RMSE scores vs number of features (only the sizes evaluated before the early stop)
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(rmse_scores_rfecv) + 1), rmse_scores_rfecv, marker='o')
plt.xlabel('Number of Features')
plt.ylabel('Cross-Validation RMSE')
plt.title('RMSE Scores vs Number of Features (RFECV)')
plt.grid()

# Mark the optimal number of features with a red dotted line.
# The legend is only drawn when that labelled line exists; calling plt.legend() with no
# labelled artist just emits a warning.
if optimal_num_features_rfecv is not None:
    plt.axvline(x=optimal_num_features_rfecv, color='r', linestyle='--', label=f'Optimal Features: {optimal_num_features_rfecv}')
    plt.legend()
plt.show()

# Print the optimal number of features and the selected features
if optimal_num_features_rfecv is not None:
    print(f"Optimal number of features where change is sufficiently close to 0 (RFECV): {optimal_num_features_rfecv}")
    selected_features_rfecv_final = avg_rankings_rfecv.head(optimal_num_features_rfecv).index
    print(f"Selected features based on optimal number (RFECV): {list(selected_features_rfecv_final)}")
else:
    print("No plateau detected within the set threshold for RFECV.")

In [None]:
# Custom cross-validation using Mutual Information for regression.
# For each outer fold i: train on the SMOTE-resampled versions of every other fold, rank the
# features by mutual information with the target, then pick the top-k prefix of that ranking
# that gives the best 3-fold cross-validated negative RMSE with a decision-tree regressor.
avg_mutual_info = []                 # sorted mutual-information Series, one per fold
feature_counts_mi = []               # optimal k, one entry per fold
selected_features_mi_all_folds = []  # Index of the top-k feature names, one entry per fold

for i in range(len(smote_files)):
    # Define training and testing sets
    train_smote_files = [file for j, file in enumerate(smote_files) if j != i]  # All folds except the i-th for training
    test_non_smote_file = non_smote_files[i]  # The corresponding non-smoted fold for testing

    # Load and concatenate the SMOTE training folds
    train_dataframes = [
        pd.read_csv(os.path.join(smote_train_data_folder_path, file))
        for file in train_smote_files
    ]
    train_data = pd.concat(train_dataframes, ignore_index=True)

    # Load testing data. The non-SMOTE folds are read from the transformed-data folder
    # (previously they were looked up in the SMOTE folder).
    # NOTE(review): assumes fold_k.csv live under transformed_data, as the data-loading
    # comment states -- confirm against the dataset layout.
    test_file_path = os.path.join(transformed_data_folder_path, test_non_smote_file)
    test_data = pd.read_csv(test_file_path)

    # Split features and target
    features = train_data.columns.drop('fraud_status')
    X_train = train_data[features]
    y_train = train_data['fraud_status']
    X_test = test_data[features]
    y_test = test_data['fraud_status']

    # Step 1: Feature Selection using Mutual Information
    print(f"Fold {i+1}: Performing Mutual Information for feature selection...")
    # The estimator is randomised; seed it so the ranking is the same on every run.
    mutual_info = mutual_info_regression(X_train, y_train, random_state=42)
    mutual_info_series = pd.Series(mutual_info, index=X_train.columns).sort_values(ascending=False)
    avg_mutual_info.append(mutual_info_series)
    scores = []

    # Determine optimal number of features using cross-validation
    best_score = -np.inf
    optimal_k = 0
    for k in range(1, len(mutual_info_series) + 1):
        selected_features = mutual_info_series.head(k).index
        X_train_selected = X_train[selected_features]
        score = cross_val_score(DecisionTreeRegressor(random_state=42), X_train_selected, y_train, cv=3, scoring='neg_root_mean_squared_error').mean()
        scores.append(score)

        # Strict '>' keeps the smallest k among equally good scores
        if score > best_score:
            best_score = score
            optimal_k = k

    print(f"Optimal number of features based on cross-validation for Fold {i + 1}: {optimal_k}")
    selected_features_mi_optimal = mutual_info_series.head(optimal_k).index
    print("Selected features using Mutual Information:", selected_features_mi_optimal)
    feature_counts_mi.append(len(selected_features_mi_optimal))
    selected_features_mi_all_folds.append(selected_features_mi_optimal)

    # Plot the cross-validation scores vs. number of features
    plt.figure(figsize=(8, 5))
    plt.plot(range(1, len(mutual_info_series) + 1), scores, marker='o')
    plt.xlabel('Number of Features')
    plt.ylabel('Cross-Validation Score (Neg RMSE)')
    plt.title(f'Mutual Information: Optimal Number of Features for Fold {i + 1}')
    plt.grid(True)
    plt.show()


In [None]:
# Aggregated Feature Importance for Mutual Information: average every feature's
# mutual-information score over the outer folds and sort descending (most informative first).
avg_mutual_info_df = pd.concat(avg_mutual_info, axis=1).mean(axis=1).sort_values(ascending=False)
print("\nAggregated Feature Importance (Mutual Information):")
print(avg_mutual_info_df)

# Determine Optimal Features using Aggregated Feature Importance (Mutual Information)
# NOTE: X_train / y_train are the values left over from the most recent per-fold loop
# (its last iteration); this cell must run after that loop.
rmse_scores = []
threshold = 0.001  # Set a threshold for what counts as "sufficiently close to 0"
last_rmse_score = None
optimal_num_features = None

# Grow the feature set one feature at a time (most informative first) and stop at the first
# step whose RMSE differs from the previous step by no more than `threshold`.
for i in range(1, len(avg_mutual_info_df) + 1):
    selected_features = avg_mutual_info_df.head(i).index
    X_train_selected = X_train[selected_features]

    # Perform cross-validation with the selected features
    model = DecisionTreeRegressor(random_state=42)
    rmse = -cross_val_score(model, X_train_selected, y_train, cv=3, scoring='neg_root_mean_squared_error').mean()
    rmse_scores.append(rmse)

    # Check if the difference is sufficiently small
    if last_rmse_score is not None:
        difference = abs(rmse - last_rmse_score)
        if difference <= threshold:
            optimal_num_features = i
            break

    last_rmse_score = rmse

# Plot the RMSE scores vs number of features (only the sizes evaluated before the early stop)
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(rmse_scores) + 1), rmse_scores, marker='o')
plt.xlabel('Number of Features')
plt.ylabel('Cross-Validation RMSE')
plt.title('RMSE Scores vs Number of Features')
plt.grid()

# Mark the optimal number of features with a red dotted line.
# The legend is only drawn when that labelled line exists; calling plt.legend() with no
# labelled artist just emits a warning.
if optimal_num_features is not None:
    plt.axvline(x=optimal_num_features, color='r', linestyle='--', label=f'Optimal Features: {optimal_num_features}')
    plt.legend()
plt.show()

# Print the optimal number of features and the selected features
if optimal_num_features is not None:
    print(f"Optimal number of features where change is sufficiently close to 0: {optimal_num_features}")
    selected_features_final = avg_mutual_info_df.head(optimal_num_features).index
    print(f"Selected features based on optimal number: {list(selected_features_final)}")
else:
    print("No plateau detected within the set threshold.")


In [None]:
# Assemble the complete training set (all SMOTE folds stacked) and read the held-out test set.
smote_fold_frames = [
    pd.read_csv(os.path.join(smote_train_data_folder_path, file))
    for file in smote_files
]
train_data = pd.concat(smote_fold_frames, ignore_index=True)
test_data = pd.read_csv(os.path.join(transformed_data_folder_path, 'test_set.csv'))

# Every training column except the label is a feature; the test set is restricted
# to exactly those columns.
features = train_data.columns.drop('fraud_status')
X_train_full, y_train_full = train_data[features], train_data['fraud_status']
X_test_full, y_test_full = test_data[features], test_data['fraud_status']


# Stratified subsample of the training set: `sample_fraction` of the rows go to
# X_sample / y_sample, the remainder to X_sample_test / y_sample_test.
sample_fraction = 0.1
X_sample, X_sample_test, y_sample, y_sample_test = train_test_split(
    X_train_full,
    y_train_full,
    test_size=(1 - sample_fraction),
    stratify=y_train_full,
    random_state=42,
)

In [None]:
# Step 7.1: Use Optuna to choose the best feature selection method (RFECV or Mutual Information)
def feature_selection_objective(trial):
    """Optuna objective scoring one feature-selection method.

    The trial picks 'RFECV' or 'Mutual Information'; the feature subset that method
    produced for the last outer fold is then scored with a 3-fold cross-validated
    DecisionTreeRegressor on the notebook-level ``X_train`` / ``y_train``.

    Returns the mean cross-validated RMSE (lower is better).
    """
    method = trial.suggest_categorical('feature_method', ['RFECV', 'Mutual Information'])

    # Only the chosen method's per-fold list is touched; its last entry is used.
    per_fold_selections = (
        selected_features_rfecv_all_folds if method == 'RFECV' else selected_features_mi_all_folds
    )
    chosen_columns = per_fold_selections[-1]

    # Same estimator as the feature-selection cells, for consistency.
    neg_rmse_per_split = cross_val_score(
        DecisionTreeRegressor(random_state=42),
        X_train[chosen_columns],
        y_train,
        cv=3,
        scoring='neg_root_mean_squared_error',
    )

    # The scorer reports negated RMSE; flip the sign to return a plain RMSE.
    return -neg_rmse_per_split.mean()

# Run Optuna to find the best feature selection method.
# The search space is just two categorical options, so a grid sampler is used: each method
# is evaluated exactly once, which makes the outcome reproducible and guarantees that
# neither option is skipped (a random sampler gives no such guarantee).
feature_method_grid = {'feature_method': ['RFECV', 'Mutual Information']}
feature_study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.GridSampler(feature_method_grid),
)
feature_study.optimize(
    feature_selection_objective,
    n_trials=len(feature_method_grid['feature_method']),
)

best_feature_method = feature_study.best_trial.params['feature_method']
print("Best feature selection method:", best_feature_method)

In [None]:
# Step 7.2: Use Optuna to choose the best ensemble model (Bagging/Random Forest/XGBoost/AdaBoost/LightGBM)
def model_selection_objective(trial):
    """Optuna objective scoring one ensemble regressor configuration.

    The trial picks a model family and that family's hyperparameters. The candidate is
    scored with 3-fold cross-validation on the notebook-level ``X_train`` / ``y_train``,
    restricted to the last-fold feature subset of the method stored in
    ``best_feature_method``.

    Returns the mean cross-validated RMSE (lower is better).
    """
    # Feature subset of the winning selection method from Step 7.1 (last outer fold).
    per_fold_selections = (
        selected_features_rfecv_all_folds if best_feature_method == 'RFECV' else selected_features_mi_all_folds
    )
    chosen_columns = per_fold_selections[-1]
    X_subset = X_train[chosen_columns]
    y_subset = y_train

    # One builder per model family; each draws its own hyperparameters from the trial.
    def make_bagging():
        n_trees = trial.suggest_int('n_estimators', 10, 100, step=10)
        sample_share = trial.suggest_float('max_samples', 0.5, 1.0, step=0.1)
        return BaggingRegressor(estimator=DecisionTreeRegressor(), n_estimators=n_trees, max_samples=sample_share, random_state=42)

    def make_random_forest():
        n_trees = trial.suggest_int('n_estimators', 50, 150, step=50)
        depth = trial.suggest_int('max_depth', 10, 30, step=10)
        return RandomForestRegressor(n_estimators=n_trees, max_depth=depth, random_state=42)

    def make_adaboost():
        n_rounds = trial.suggest_int('n_estimators', 50, 150, step=50)
        rate = trial.suggest_float('learning_rate', 0.01, 1.0, log=True)
        return AdaBoostRegressor(estimator=DecisionTreeRegressor(), n_estimators=n_rounds, learning_rate=rate, random_state=42)

    def make_xgboost():
        n_rounds = trial.suggest_int('n_estimators', 50, 150, step=50)
        rate = trial.suggest_float('learning_rate', 0.01, 0.2, log=True)
        depth = trial.suggest_int('max_depth', 3, 9, step=3)
        return XGBRegressor(n_estimators=n_rounds, learning_rate=rate, max_depth=depth, random_state=42)

    def make_lightgbm():
        n_rounds = trial.suggest_int('n_estimators', 50, 150, step=50)
        rate = trial.suggest_float('learning_rate', 0.01, 0.2, log=True)
        leaves = trial.suggest_int('num_leaves', 31, 100, step=10)
        return LGBMRegressor(n_estimators=n_rounds, learning_rate=rate, num_leaves=leaves, random_state=42)

    builders = {
        'Bagging': make_bagging,
        'RandomForest': make_random_forest,
        'AdaBoost': make_adaboost,
        'XGBoost': make_xgboost,
        'LightGBM': make_lightgbm,
    }

    # Pick the family first, then let its builder draw the hyperparameters.
    family = trial.suggest_categorical('model_name', ['Bagging', 'RandomForest', 'AdaBoost', 'XGBoost', 'LightGBM'])
    candidate = builders[family]()

    # The scorer reports negated RMSE; flip the sign to return a plain RMSE.
    neg_rmse_per_split = cross_val_score(candidate, X_subset, y_subset, cv=3, scoring='neg_root_mean_squared_error')
    return -neg_rmse_per_split.mean()

# Run Optuna to find the best ensemble model.
# The sampler is seeded so that the sequence of suggested configurations (and therefore
# the reported best trial) is the same on every run of the notebook.
model_study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
model_study.optimize(model_selection_objective, n_trials=20)

print("Best model selection trial:")
print(model_study.best_trial)

In [None]:
# Final model: rebuild the best configuration found in Step 7.2, fit it on the complete
# SMOTE training set restricted to the selected features, and evaluate on the held-out test set.

# Hyperparameters of the winning trial (includes the 'model_name' key).
best_params = model_study.best_trial.params
model_name = best_params['model_name']

# Feature subset used during tuning: last-fold selection of the winning method from Step 7.1.
final_features = (
    selected_features_rfecv_all_folds[-1] if best_feature_method == 'RFECV' else selected_features_mi_all_folds[-1]
)

# Train and test matrices must contain exactly the same columns.
# NOTE(review): the final fit uses the full training set (X_train_full) rather than the
# last fold's X_train -- confirm this is the intended final-training data.
X_train_final_selected = X_train_full[final_features]
y_train_final = y_train_full
X_test_final_selected = X_test_full[final_features]
y_test_final = y_test_full

# Use the best parameters to initialize the model
if model_name == 'Bagging':
    best_model = BaggingRegressor(
        estimator=DecisionTreeRegressor(),
        n_estimators=best_params['n_estimators'],
        max_samples=best_params['max_samples'],
        random_state=42
    )
elif model_name == 'RandomForest':
    best_model = RandomForestRegressor(
        n_estimators=best_params['n_estimators'],
        max_depth=best_params['max_depth'],
        random_state=42
    )
elif model_name == 'AdaBoost':
    best_model = AdaBoostRegressor(
        estimator=DecisionTreeRegressor(),
        n_estimators=best_params['n_estimators'],
        learning_rate=best_params['learning_rate'],
        random_state=42
    )
elif model_name == 'XGBoost':
    best_model = XGBRegressor(
        n_estimators=best_params['n_estimators'],
        learning_rate=best_params['learning_rate'],
        max_depth=best_params['max_depth'],
        random_state=42,
        eval_metric='rmse'
    )
elif model_name == 'LightGBM':
    best_model = LGBMRegressor(
        n_estimators=best_params['n_estimators'],
        learning_rate=best_params['learning_rate'],
        num_leaves=best_params['num_leaves'],
        random_state=42
    )
else:
    # Fail loudly instead of hitting an undefined `best_model` further down.
    raise ValueError(f"Unsupported model_name in best trial: {model_name!r}")

# Train the model on the selected features only
best_model.fit(X_train_final_selected, y_train_final)

# Make predictions on the test set
y_pred = best_model.predict(X_test_final_selected)

# Calculate evaluation metrics
rmse = np.sqrt(mean_squared_error(y_test_final, y_pred))
mae = mean_absolute_error(y_test_final, y_pred)
r2 = r2_score(y_test_final, y_pred)

print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"R-squared (R2): {r2:.4f}")

# If evaluating as a classification report for threshold-based results, use classification metrics
# You can use a threshold to transform regression outputs to a binary classification (e.g., y_pred_class = y_pred > 0.5)
threshold = 0.5
y_pred_class = (y_pred > threshold).astype(int)

print("\nClassification Report (using threshold for binary output):")
print(classification_report(y_test_final, y_pred_class))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test_final, y_pred_class))