In [2]:
import os
import numpy as np
import matplotlib.pyplot as plt
import optuna
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [3]:
# Render every column when a DataFrame is displayed (no column truncation)
pd.set_option('display.max_columns', None)

# Read the encoded CSV directly from its raw GitHub URL
url = 'https://raw.githubusercontent.com/alvarofavale/week7_ml/refs/heads/main/data/encoded/encoded_data.csv'
df = pd.read_csv(filepath_or_buffer=url)

# Preview the first rows as the cell output
df.head()

Unnamed: 0,id,customer_id,month,name,age,ssn,occupation,annual_income,monthly_inhand_salary,num_bank_accounts,num_credit_card,interest_rate,num_of_loan,type_of_loan,delay_from_due_date,num_of_delayed_payment,changed_credit_limit,num_credit_inquiries,credit_mix,outstanding_debt,credit_utilization_ratio,credit_history_age,payment_of_min_amount,total_emi_per_month,amount_invested_monthly,payment_behaviour,monthly_balance,credit_score
0,5634,3392,1,Aaron Maashoh,23.0,821000265.0,1,19114.12,1824.843333,3.0,4.0,3.0,4.0,760.0,3.0,7.0,11.27,4.0,1,809.98,26.82262,265.0,1,49.574949,21.46538,1,312.494089,0
1,5635,3392,2,Aaron Maashoh,23.0,821000265.0,1,19114.12,1824.843333,3.0,4.0,3.0,4.0,760.0,3.0,4.0,11.27,4.0,1,809.98,31.94496,266.0,1,49.574949,21.46538,2,284.629162,0
2,5636,3392,3,Aaron Maashoh,23.0,821000265.0,1,19114.12,1824.843333,3.0,4.0,3.0,4.0,760.0,3.0,7.0,11.27,4.0,1,809.98,28.609352,267.0,1,49.574949,21.46538,3,331.209863,0
3,5637,3392,4,Aaron Maashoh,23.0,821000265.0,1,19114.12,1824.843333,3.0,4.0,3.0,4.0,760.0,5.0,4.0,6.27,4.0,1,809.98,31.377862,268.0,1,49.574949,21.46538,4,223.45131,0
4,5638,3392,5,Aaron Maashoh,23.0,821000265.0,1,19114.12,1824.843333,3.0,4.0,3.0,4.0,760.0,6.0,4.0,11.27,4.0,1,809.98,24.797347,269.0,1,49.574949,21.46538,5,341.489231,0


In [4]:
# Define features (X) and target (y) for classification.
#
# Columns that identify a row or a customer instead of describing credit
# behaviour. The preview of `df` shows the same customer_id / ssn repeated
# across several monthly rows, so a distance-based model such as KNN could
# match rows of the same customer through these columns rather than through
# the actual credit features. 'Unnamed: 0' is the row index that was written
# into the CSV, and 'name' is free text.
ID_COLUMNS = ['Unnamed: 0', 'id', 'customer_id', 'name', 'ssn']

# errors="ignore" keeps this cell working if an identifier column has already
# been removed upstream; a missing target still fails loudly on the next line.
X = df.drop(columns=["credit_score", *ID_COLUMNS], errors="ignore")  # Features
y = df["credit_score"]  # Target: 'credit_score' column

In [5]:
# Min-max scaler used to rescale the feature columns
normalizer = MinMaxScaler()

# Learn the per-column minimum and maximum from every row of X (no train/test
# split has happened at this point), then apply the scaling to those same rows
X_scaled = normalizer.fit(X).transform(X)

In [None]:
# Score an already-fitted model on the held-out test data
def evaluate_knn_model(X_train, X_test, y_train, y_test, model):
    """Compute test-set metrics for a fitted classifier.

    Args:
        X_train: Accepted but not used by this function.
        X_test: Feature matrix the model predicts on.
        y_train: Accepted but not used by this function.
        y_test: True labels matching X_test.
        model: Fitted estimator exposing ``predict``.

    Returns:
        Tuple ``(accuracy, classification_report, confusion_matrix)`` computed
        from ``y_test`` and the model's predictions on ``X_test``.
    """
    predictions = model.predict(X_test)
    return (
        accuracy_score(y_test, predictions),
        classification_report(y_test, predictions),
        confusion_matrix(y_test, predictions),
    )

In [None]:
# Exhaustive KNN hyperparameter search using GridSearchCV
def grid_search_knn(X_train, X_test, y_train, y_test, param_grid):
    """Tune a KNeighborsClassifier over ``param_grid`` and score the winner.

    Runs a 5-fold cross-validated grid search on the training data, prints the
    best parameters and best cross-validation score, then evaluates the best
    estimator on the test data via ``evaluate_knn_model``.

    Returns:
        Tuple ``(best_model, accuracy, class_report, conf_matrix)``.
    """
    print("\nPerforming Grid Search...")
    searcher = GridSearchCV(
        estimator=KNeighborsClassifier(),
        param_grid=param_grid,
        cv=5,
        verbose=1,
    )
    searcher.fit(X_train, y_train)

    print(f"Best parameters found: {searcher.best_params_}")
    print(f"Best cross-validation score: {searcher.best_score_:.4f}")

    # The search exposes the best estimator it found as best_estimator_
    tuned_knn = searcher.best_estimator_
    test_accuracy, test_report, test_matrix = evaluate_knn_model(
        X_train, X_test, y_train, y_test, tuned_knn
    )

    return tuned_knn, test_accuracy, test_report, test_matrix

In [None]:
# Hyperparameter search with RandomizedSearchCV
def randomized_search_knn(X_train, X_test, y_train, y_test, param_dist, n_iter=100, random_state=42):
    """Tune a KNeighborsClassifier by random sampling from ``param_dist``.

    Runs a 5-fold cross-validated randomized search on the training data,
    prints the best parameters and best cross-validation score, then evaluates
    the best estimator on the test data via ``evaluate_knn_model``.

    Args:
        X_train, y_train: Training features and labels used for the search.
        X_test, y_test: Held-out features and labels used for the final score.
        param_dist: Mapping of parameter name to a list of values or a
            distribution to sample from.
        n_iter: Number of parameter settings to sample.
        random_state: Seed for the parameter sampling, so repeated runs draw
            the same candidates. Defaults to 42, the same seed the notebook
            uses for its train/test split.

    Returns:
        Tuple ``(best_model, accuracy, class_report, conf_matrix)``.
    """
    print("\nPerforming Randomized Search...")
    randomized_search = RandomizedSearchCV(
        KNeighborsClassifier(),
        param_dist,
        n_iter=n_iter,
        cv=5,
        verbose=1,
        random_state=random_state,  # without a seed the sampled candidates differ on every run
    )
    randomized_search.fit(X_train, y_train)

    print(f"Best parameters found: {randomized_search.best_params_}")
    print(f"Best cross-validation score: {randomized_search.best_score_:.4f}")

    # Evaluate on the test set
    best_model = randomized_search.best_estimator_
    accuracy, class_report, conf_matrix = evaluate_knn_model(X_train, X_test, y_train, y_test, best_model)

    return best_model, accuracy, class_report, conf_matrix

In [None]:
# Hyperparameter search with Optuna (Bayesian optimization)
def optuna_search_knn(X_train, X_test, y_train, y_test, n_trials=100, seed=42):
    """Tune a KNeighborsClassifier with an Optuna study.

    Each trial proposes ``n_neighbors`` (1-15), ``weights`` ('uniform' or
    'distance') and ``p`` (1 or 2), and is scored by mean 5-fold
    cross-validated accuracy on the training data. The best parameters are
    then used to fit a fresh model on the full training data, which is
    evaluated on the test data via ``evaluate_knn_model``.

    Args:
        X_train, y_train: Training features and labels used for the search
            and for the final fit.
        X_test, y_test: Held-out features and labels used for the final score.
        n_trials: Number of Optuna trials to run.
        seed: Seed for the TPE sampler so the sequence of trials is
            reproducible. Defaults to 42, the same seed the notebook uses for
            its train/test split.

    Returns:
        Tuple ``(best_model, accuracy, class_report, conf_matrix)``.
    """
    print("\nPerforming Bayesian Optimization with Optuna...")

    def objective(trial):
        n_neighbors = trial.suggest_int('n_neighbors', 1, 15)
        weights = trial.suggest_categorical('weights', ['uniform', 'distance'])
        p = trial.suggest_int('p', 1, 2)

        model = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, p=p)
        fold_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
        return np.mean(fold_scores)

    # Explicitly seeded sampler: an unseeded study proposes different trials on
    # every run, so the reported best parameters would not be reproducible.
    sampler = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(direction='maximize', sampler=sampler)
    study.optimize(objective, n_trials=n_trials)

    print(f"Best parameters found: {study.best_params}")
    print(f"Best cross-validation score: {study.best_value:.4f}")

    # Train and evaluate on the test set. The study only scored
    # cross-validation folds, so the winning configuration is refit here.
    best_params = study.best_params
    best_model = KNeighborsClassifier(**best_params)
    best_model.fit(X_train, y_train)

    accuracy, class_report, conf_matrix = evaluate_knn_model(X_train, X_test, y_train, y_test, best_model)

    return best_model, accuracy, class_report, conf_matrix

In [None]:
# Helper function to plot confusion matrix
def plot_confusion_matrix(conf_matrix, split_percentage, test_type, folder_path='figures_correlations', class_labels=None):
    """Draw a confusion-matrix heatmap, save it as a PNG and display it.

    Args:
        conf_matrix: Square matrix of integer counts (rows = true class,
            columns = predicted class), as labelled on the axes.
        split_percentage: Training share of the split in percent; used in the
            title and in the output file name.
        test_type: Name of the search method; used in the title and in the
            output file name.
        folder_path: Directory the PNG is written to (created if missing).
        class_labels: Optional tick labels, one per class. Defaults to
            'Class 0', 'Class 1', ... with one label per row of the matrix.

    Raises:
        ValueError: If ``class_labels`` is given but its length does not match
            the number of rows in ``conf_matrix``.
    """
    n_classes = len(conf_matrix)
    if class_labels is None:
        # Derive labels from the matrix size instead of assuming two classes,
        # so every row and column of a multi-class matrix gets a tick label.
        class_labels = [f'Class {i}' for i in range(n_classes)]
    elif len(class_labels) != n_classes:
        raise ValueError(
            f"class_labels has {len(class_labels)} entries but the confusion matrix has {n_classes} classes"
        )

    fig, ax = plt.subplots(figsize=(6, 6))
    sns.heatmap(
        conf_matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_labels,
        yticklabels=class_labels,
        ax=ax,
    )
    ax.set_title(f'Confusion Matrix for {split_percentage}%/{100 - split_percentage}% Split ({test_type})')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')

    # Ensure the folder exists
    os.makedirs(folder_path, exist_ok=True)

    # Save plot to file (before showing it)
    plot_filename = f'confusion_matrix_{test_type}_{split_percentage}.png'
    plot_path = os.path.join(folder_path, plot_filename)
    fig.savefig(plot_path)
    plt.show()

    # Release the figure; this helper is called once per search/split combination
    plt.close(fig)

In [None]:
# Function to perform KNN with the specified split percentage
def run_knn_with_split(X, y, split_percentage, param_grid=None, param_dist=None, optuna_trials=100, test_type='GridSearch'):
    """Split, scale, tune and evaluate a KNN classifier, then report the results.

    The data is split with a fixed seed, a MinMaxScaler is fitted on the
    training portion only and applied to both portions, and one search method
    is run: grid search if ``param_grid`` is given, otherwise randomized
    search if ``param_dist`` is given, otherwise Optuna. Test accuracy, the
    classification report and the confusion matrix are printed, and the
    confusion matrix is plotted and saved.

    Args:
        X: Feature matrix.
        y: Target labels.
        split_percentage: Training share in percent, strictly between 0 and 100.
        param_grid: Parameter grid for GridSearchCV (selects grid search).
        param_dist: Parameter distributions for RandomizedSearchCV.
        optuna_trials: Number of trials when the Optuna search is used.
        test_type: Label used in printed output, plot title and file name; it
            does not select the search method.

    Raises:
        ValueError: If ``split_percentage`` is not strictly between 0 and 100.
    """
    if not 0 < split_percentage < 100:
        raise ValueError(f"split_percentage must be between 0 and 100 (exclusive), got {split_percentage}")

    test_percentage = 100 - split_percentage
    print(f"\nRunning KNN Classifier with {split_percentage}%/{test_percentage}% split and {test_type}:\n")

    # Train-test split. The fraction is computed as (100 - pct) / 100 rather
    # than 1 - pct / 100: the latter gives 0.30000000000000004 for pct=70, and
    # because the test size is rounded up that can add one extra test row.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_percentage / 100, random_state=42
    )

    # Normalize the data: fit on the training rows only, then reuse that
    # fitted scaler for the test rows
    normalizer = MinMaxScaler()
    X_train_scaled = normalizer.fit_transform(X_train)
    X_test_scaled = normalizer.transform(X_test)

    # Run hyperparameter optimization using the specified method
    if param_grid:
        best_model, accuracy, class_report, conf_matrix = grid_search_knn(X_train_scaled, X_test_scaled, y_train, y_test, param_grid)
    elif param_dist:
        best_model, accuracy, class_report, conf_matrix = randomized_search_knn(X_train_scaled, X_test_scaled, y_train, y_test, param_dist)
    else:
        best_model, accuracy, class_report, conf_matrix = optuna_search_knn(X_train_scaled, X_test_scaled, y_train, y_test, n_trials=optuna_trials)

    # Print results
    print(f"Test Accuracy: {accuracy:.3f}")
    print(f"Classification Report:\n{class_report}")
    print(f"Confusion Matrix:\n{conf_matrix}")

    # Plot confusion matrix and save
    plot_confusion_matrix(conf_matrix, split_percentage, test_type)

# Define the parameter grid for GridSearchCV
param_grid = {
    'n_neighbors': [1, 3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'p': [1, 2]
}

# Define the parameter distribution for RandomizedSearchCV.
# n_neighbors covers 1..15 inclusive, the same range the Optuna search
# explores, so the two searches are compared over the same space
# (the previous upper bound was exclusive and stopped at 14).
# Plain Python ints are used so the printed best parameters stay readable.
param_dist = {
    'n_neighbors': list(range(1, 16)),
    'weights': ['uniform', 'distance'],
    'p': [1, 2]
}

In [None]:
# Run all three search strategies for the 70/30 split, then for the 80/20 split.
# The unscaled features X are passed in: run_knn_with_split fits its own
# MinMaxScaler on the training portion of each split, so scaling happens in
# exactly one place and only ever on training rows.
for split_percentage in (70, 80):
    # GridSearchCV
    run_knn_with_split(X, y, split_percentage, param_grid=param_grid, test_type='GridSearch')

    # RandomizedSearchCV
    run_knn_with_split(X, y, split_percentage, param_dist=param_dist, test_type='RandomizedSearch')

    # Optuna (Bayesian optimization)
    run_knn_with_split(X, y, split_percentage, optuna_trials=100, test_type='Optuna')