In [12]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTENC
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

import warnings 
warnings.filterwarnings('ignore')

In [3]:
# Read the pre-split training and test sets from the shared data directory
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/train_data.csv", "../data/test_data.csv")
)

In [4]:
# Separate the `is_fraud` target (y) from the feature columns (X).
# `drop` returns new frames, so the scaling below never writes into train_df / test_df.
X_train = train_df.drop(columns='is_fraud')
y_train = train_df['is_fraud']

X_test = test_df.drop(columns='is_fraud')
y_test = test_df['is_fraud']

# Standardize numerical columns.
# The scaler is fitted on the TRAINING data only and then reused to transform the
# test data. Fitting a second scaler on the test set would standardize it with
# different means/variances than the ones the model was trained on, and would let
# test-set statistics leak into preprocessing.
scaler_train = StandardScaler()
numeric_cols_train = X_train.select_dtypes(include=['number'])
numeric_feature_names = numeric_cols_train.columns
X_train[numeric_feature_names] = scaler_train.fit_transform(numeric_cols_train)

# Same columns, same order as in training; a missing column raises a KeyError here
# instead of silently producing a differently-shaped test matrix.
numeric_cols_test = X_test[numeric_feature_names]
X_test[numeric_feature_names] = scaler_train.transform(numeric_cols_test)

# Kept only so that any cell still referring to `scaler_test` keeps working;
# it is intentionally the very same (train-fitted) scaler.
scaler_test = scaler_train

### Pipeline - Random Forest with Lasso 

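We use the one-standard-error rule to choose the Lasso `alpha`: instead of taking the alpha with the single best cross-validated score, we pick the largest (most regularized) alpha whose score is still within one standard error of the best. This favours a simpler model that is less likely to overfit.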

In [13]:
# Categorical features (object / category dtype): their labels and their
# positional indices within X_train
categorical_columns = X_train.select_dtypes(include=['object', 'category']).columns
categorical_columns_idx = list(map(X_train.columns.get_loc, categorical_columns))

# 1. Assemble the pipeline steps in the order they are executed
pipeline_steps = [
    # Re-balance the classes: under-sample first, then SMOTENC
    ('undersample', RandomUnderSampler(random_state=123, sampling_strategy=0.1)),
    ('smote', SMOTENC(
        categorical_features=categorical_columns_idx,
        sampling_strategy=0.5,
        random_state=123,
    )),

    # One-hot encode the categorical columns, pass everything else through unchanged
    ('onehot', ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(sparse_output=False, drop='first'), categorical_columns_idx),
        ],
        remainder='passthrough',
    )),

    # Lasso-based feature selection (alpha is tuned by the grid search below)
    ('feature_selection', SelectFromModel(estimator=Lasso(random_state=123, alpha=0.01))),

    # Final estimator
    ('classifier', RandomForestClassifier(random_state=123)),
]
pipeline = ImbPipeline(steps=pipeline_steps)

# 2. Hyperparameter grid: Random Forest settings crossed with the Lasso alpha
param_grid = {
    'classifier__n_estimators': [100, 150, 200],
    'classifier__max_features': ['sqrt', 'log2', 20],
    'classifier__max_depth': [10, 15],
    'classifier__criterion': ['gini', 'entropy'],
    # Wide alpha range so the one-standard-error rule has candidates to choose from
    'feature_selection__estimator__alpha': [0.001, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2],
}

# 3. Stratified 5-fold grid search scored on F1
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
search_options = dict(cv=cv, scoring='f1', n_jobs=-1, verbose=2)
grid_search = GridSearchCV(pipeline, param_grid, **search_options)



In [None]:
# 4. Run the (long) exhaustive search over the whole parameter grid
print("Running grid search...")
grid_search.fit(X_train, y_train)

# Summary of the plain "highest mean CV score" winner, before any
# one-standard-error adjustment
print(
    f'Best hyperparameters from standard approach: {grid_search.best_params_}',
    f'Best score from standard approach: {grid_search.best_score_:.4f}',
    sep='\n',
)



Running grid search...
Fitting 5 folds for each of 252 candidates, totalling 1260 fits
[CV] END classifier__criterion=gini, classifier__max_depth=10, classifier__max_features=sqrt, classifier__n_estimators=100, feature_selection__estimator__alpha=0.005; total time= 1.9min
[CV] END classifier__criterion=gini, classifier__max_depth=10, classifier__max_features=sqrt, classifier__n_estimators=100, feature_selection__estimator__alpha=0.005; total time= 5.3min
[CV] END classifier__criterion=gini, classifier__max_depth=10, classifier__max_features=sqrt, classifier__n_estimators=100, feature_selection__estimator__alpha=0.005; total time= 5.4min
[CV] END classifier__criterion=gini, classifier__max_depth=10, classifier__max_features=sqrt, classifier__n_estimators=100, feature_selection__estimator__alpha=0.001; total time= 8.1min
[CV] END classifier__criterion=gini, classifier__max_depth=10, classifier__max_features=sqrt, classifier__n_estimators=100, feature_selection__estimator__alpha=0.001; to

In [None]:
# 5. AFTER fitting, apply the one-standard-error rule
#
# Rule: take the best candidate's mean CV score, subtract the standard error of
# that score (fold-to-fold std / sqrt(number of folds)), and choose the largest
# alpha whose own best candidate still reaches that threshold.
#
# The standard error must come from the variation ACROSS CV FOLDS of the best
# candidate (`std_test_score`), not from the spread of mean scores across
# different Random Forest settings, which measures hyperparameter sensitivity
# rather than the uncertainty of the CV estimate.
print("\nApplying one-standard-error rule for alpha selection:")
cv_results = pd.DataFrame(grid_search.cv_results_)

ALPHA_PARAM = 'feature_selection__estimator__alpha'
n_folds = grid_search.n_splits_

# Tag every candidate with its alpha; candidates without a mean score (NaN) are
# dropped because they cannot be ranked.
scored = (
    cv_results
    .assign(alpha=cv_results['params'].map(lambda params: params[ALPHA_PARAM]))
    .dropna(subset=['mean_test_score'])
)
if scored.empty:
    raise ValueError("No grid-search candidate produced a valid score; cannot select alpha.")

# Best Random Forest configuration for each alpha (first one wins on ties)
scores_by_alpha = scored.groupby('alpha')['mean_test_score']
best_rows = scored.loc[scores_by_alpha.idxmax()].set_index('alpha').sort_index()

# Per-alpha summary. 'mean' / 'max' / 'std' describe the mean CV scores over the
# Random Forest settings tried for that alpha; 'se' is the standard error across
# CV folds of that alpha's best configuration.
alpha_summary = pd.DataFrame({
    'mean': scores_by_alpha.mean(),
    'max': scores_by_alpha.max(),
    'std': scores_by_alpha.std(ddof=0),
    'se': best_rows['std_test_score'] / np.sqrt(n_folds),
})
alpha_stats = alpha_summary.to_dict(orient='index')

# Find the alpha with the best performance
best_alpha_value = alpha_summary['max'].idxmax()
best_alpha_max_score = alpha_summary.loc[best_alpha_value, 'max']
best_alpha_se = alpha_summary.loc[best_alpha_value, 'se']

print(f"Alpha with best performance: {best_alpha_value}")
print(f"Best score: {best_alpha_max_score:.4f}")
print(f"Standard error across folds: {best_alpha_se:.4f}")

# Every alpha whose best score is within one standard error of the overall best
one_se_threshold = best_alpha_max_score - best_alpha_se
print(f"One standard error threshold: {one_se_threshold:.4f}")

within_one_se = alpha_summary.loc[alpha_summary['max'] >= one_se_threshold, 'max']

# Sort by alpha (descending) so the largest qualifying alpha comes first.
# The list is never empty: the best alpha always satisfies its own threshold.
valid_alphas = list(within_one_se.sort_index(ascending=False).items())
for alpha, score in valid_alphas:
    print(f"Alpha: {alpha}, Max Score: {score:.4f}")

# Select the largest alpha within one standard error of the best
selected_alpha = valid_alphas[0][0]
print(f"\nSelected alpha using one-standard-error rule: {selected_alpha}")

# Best parameter set (and its score) for the selected alpha
selected_row = best_rows.loc[selected_alpha]
selected_params = selected_row['params']
best_score_for_alpha = selected_row['mean_test_score']

print(f"Best parameters for selected alpha: {selected_params}")
print(f"Score with selected parameters: {best_score_for_alpha:.4f}")



In [None]:
# Now retrain the model with the selected parameters
print("\nRetraining model with selected parameters...")
pipeline.set_params(**selected_params)
pipeline.fit(X_train, y_train)

# Fitted steps needed to map importances back to feature names
onehot = pipeline.named_steps['onehot']
feature_selector = pipeline.named_steps['feature_selection']
classifier = pipeline.named_steps['classifier']

# Column groups in the order the ColumnTransformer emits them:
# encoded categorical columns first, then the passthrough remainder.
categorical_cols = categorical_columns.tolist()
categorical_lookup = set(categorical_cols)
numeric_cols = [col for col in X_train.columns if col not in categorical_lookup]

# Generate encoded feature names from the FITTED encoder.
# OneHotEncoder sorts the categories and, with drop='first', drops the first
# sorted one, so names built from the order in which values appear in the data
# would label the encoded columns incorrectly.
if categorical_cols:
    cat_encoder = onehot.named_transformers_['cat']
    encoded_feature_names = cat_encoder.get_feature_names_out(categorical_cols).tolist()
else:
    # With no categorical columns the 'cat' transformer is never fitted
    encoded_feature_names = []
encoded_feature_names.extend(numeric_cols)  # Add the passthrough columns

# Get the selected features
support = feature_selector.get_support()
if len(encoded_feature_names) == len(support):
    selected_features = [name for name, kept in zip(encoded_feature_names, support) if kept]
else:
    # Fallback if feature names don't match
    selected_features = [f"feature_{i}" for i, kept in enumerate(support) if kept]

# Feature importances of the classifier, one per selected feature
importances = classifier.feature_importances_
feature_importance = pd.DataFrame({
    'Feature': selected_features,
    'Importance': importances
})

print('\nTop 10 most important features:')
print(feature_importance.sort_values(by='Importance', ascending=False).head(10))



In [None]:
# Evaluate with cross-validation using the selected parameters
sk_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)

# Collect out-of-fold class probabilities once and derive both the hard labels
# and the ranking score from them. ROC AUC needs a continuous score; feeding it
# hard 0/1 predictions evaluates a single operating point only.
# NOTE: assumes a binary target (column 1 = positive class), as the 2-label
# confusion-matrix ticks below also do.
y_proba = cross_val_predict(pipeline, X_train, y_train, cv=sk_folds, method='predict_proba')
class_labels = np.unique(y_train)  # probability columns follow the sorted class labels
y_pred = class_labels[y_proba.argmax(axis=1)]  # most probable class per sample
y_score = y_proba[:, 1]

# Calculate performance metrics
accuracy = accuracy_score(y_train, y_pred)
precision = precision_score(y_train, y_pred, average='macro', zero_division=0.0)
recall = recall_score(y_train, y_pred, average='macro', zero_division=0.0)
f1 = f1_score(y_train, y_pred, average='macro', zero_division=0.0)
auc = roc_auc_score(y_train, y_score)

print("\nCross-Validation Results:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"AUC: {auc:.4f}")

# Extract fold scores for the selected parameters.
# These come from the grid search (scoring='f1'), so they are F1 scores of the
# positive class, not the macro-averaged F1 printed above.
matches_selected = cv_results['params'].apply(
    lambda params: all(params[key] == value for key, value in selected_params.items())
)
selected_idx = cv_results[matches_selected].index[0]
fold_scores = []
for i in range(sk_folds.n_splits):
    fold_score = cv_results.loc[selected_idx, f'split{i}_test_score']
    fold_scores.append(fold_score)
    print(f"Fold {i+1}: F1 = {fold_score:.4f}")

print(f"\nMean F1 across folds: {np.mean(fold_scores):.4f}")
print(f"Standard deviation of F1 across folds: {np.std(fold_scores):.4f}")

# Plot confusion matrix
cm = confusion_matrix(y_train, y_pred)

# Format text in boxes: raw count on the first line, share of all samples below
group_counts = ["{0:g}".format(value) for value in cm.flatten()]
group_percentages = ["{0:.3f}".format(value) for value in cm.flatten() / np.sum(cm)]
values = [f"{v1}\n{v2}" for v1, v2 in zip(group_counts, group_percentages)]
values = np.asarray(values).reshape(cm.shape)

# Create plot on an explicit Axes so the heatmap and the labels share one target
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=values, fmt='', ax=ax)

ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix (Cross-Validation)')
ax.xaxis.set_ticklabels(['Not fraud', 'Fraud'])
ax.yaxis.set_ticklabels(['Not fraud', 'Fraud'], rotation=0)
fig.tight_layout()
plt.show()