In [None]:
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

In [None]:
from helper_data_preprocessing import all_permutations, build_poly, random_undersampling
from helper_inference import test_evaluation
from helper_metrics import compute_metrics, compute_confusion_matrix, threshold_plot
from helpers_submission import load_data, append_row, create_file
from helper_training import best_parameters_selection
from experiments import ensemble_logistic_regression_experiment, reg_logistic_regression_experiment

## Grid Search

In [None]:
# Notebook-wide configuration
np.random.seed(42)  # global NumPy seed
DATA_PATH = r"../ML_course/projects/project1/data/new_dataset/"
CV_FOLDS = 3
DATAS = [
    "train_raw.npy",
    "train_data_full.npy",
    "train_lasso.npy",
    "train_data_subset.npy",
    "train_data_curated.npy",
]
FILENAME = "SECOND_TUNING.csv"


def _search_grid(ridge_lambdas, lasso_lambdas, degrees, sampling_pairs, log_transform):
    """Build one hyper-parameter grid for the large logistic regression class.

    Every entry of the returned list holds the candidate values of one
    positional parameter. The order matters: ``f_score_evaluation`` in this
    notebook reads the trailing entries by position (log transform, degree,
    sampling pair, classification threshold), so nothing may be inserted
    between or after them.

    Each call returns fresh lists, so the grids never share mutable state.
    """
    return [
        list(ridge_lambdas),   # lambda1 for ridge
        list(lasso_lambdas),   # lambda2 for lasso
        [1000],                # max_iters
        [0.05],                # gamma
        [1_028],               # batchsize
        [False],               # plotting
        [log_transform],       # add a log transform
        list(degrees),         # degree - must stay third from last
        list(sampling_pairs),  # (majority_to_minority_ratio, minority multiplier),
                               # ratio 0 = no sampling - must stay second from last
        [0.5],                 # classification threshold - must stay last
    ]


# NOTE(review): the original comments warned that the log transform "must be
# False as we have -1 still in the data", yet PARAMS2 and FINER_SEARCH_2
# enable it - confirm the data really supports log(x + 1).

# FIRST ROUND: coarse grid (a ridge lambda of 100 was found to be too high)
_coarse = dict(
    ridge_lambdas=[1.0, 0.1, 0.01, 0.0],
    lasso_lambdas=[5.0, 1.0, 0.1, 0.0],
    degrees=[1, 3, 5],
    sampling_pairs=[(4, 2), (3, 3), (1, 2), (2, 2), (3, 2), (2, 3)],
)
PARAMS1 = _search_grid(log_transform=False, **_coarse)
PARAMS2 = _search_grid(log_transform=True, **_coarse)

# SECOND ROUND: finer grid around the best first-round settings
_fine = dict(
    ridge_lambdas=[0.5, 0.1, 0.05, 0.01],
    lasso_lambdas=[0.5, 0.1, 0.05, 0.01],
    degrees=[2, 3, 4, 5, 6],
    sampling_pairs=[(3, 2), (3, 3)],
)
FINER_SEARCH_1 = _search_grid(log_transform=False, **_fine)
FINER_SEARCH_2 = _search_grid(log_transform=True, **_fine)

In [None]:
def main():
    """Run the second-round grid search on every dataset and log the results.

    For each file in ``DATAS`` the data is loaded from ``DATA_PATH`` and each
    experiment is tuned via ``best_parameters_selection`` (passing
    ``CV_FOLDS`` and ``seed=1``). The returned metric rows are paired by
    position with the parameter combinations; every pair is appended to
    ``FILENAME`` and printed together with the matching ``metrics_std`` row.

    A failing experiment is reported (message plus traceback) and skipped so
    the remaining experiments and datasets still run.
    """
    import traceback  # local import: only needed for the error report below

    # Labels for the values of one metrics row, in print order.
    # Assumed to match the column order returned by best_parameters_selection
    # - TODO confirm against helper_training.
    METRICS = ["Test Accuracy", "train Accuracy", "Test Loss", "Train Loss", "Test F1", "Train F1"]

    ####### Define Experiments ######
    experiments = [
        ("Logistic Regression with both regularization", reg_logistic_regression_experiment, all_permutations(FINER_SEARCH_1)),
        ("Logistic Regression with both regularization", reg_logistic_regression_experiment, all_permutations(FINER_SEARCH_2)),
    ]
    create_file(FILENAME)

    for path in DATAS:

        X_train, X_test, y_train, y_test = load_data(DATA_PATH, path)
        print("Dataset has shape: {}".format(X_train.shape))

        # Conduct Experiments
        for name, model, param_grid in experiments:
            try:
                metrics, metrics_std = best_parameters_selection(model, X_train, y_train, CV_FOLDS, param_grid, seed = 1)
                print(metrics)
                # `combo` is deliberately a separate name: the original loop
                # rebound the grid variable while iterating over it.
                for row, row_std, combo in zip(metrics, metrics_std, param_grid):

                    # append experiment results to csv file
                    append_row(FILENAME, name, combo, row, data=path)

                    # also print the results here
                    for name_m, val, std in zip(METRICS, row, row_std):
                        print("{:>40}: {} ({})".format(name_m, round(val, 5), round(std, 3)))
            except Exception as e:
                # Best effort: keep going with the next experiment, but leave
                # enough information behind to debug the failure.
                print("An Error occured for {}".format(name))
                print(e)
                traceback.print_exc()


In [None]:
# Launch the full grid search defined in main(); runs only when this code is
# the entry module (which is the case for a notebook kernel).
if __name__=="__main__":
    main()

## Evaluate Current Best Models after First Round

In [None]:
# First-round candidate evaluated on the held-out split of the full dataset.
# params is positional, in the same order as the search grids:
#   lambda1 (ridge), lambda2 (lasso), max_iters, gamma, batchsize, plotting,
#   log transform, degree, (majority_to_minority_ratio, minority multiplier),
#   classification threshold
path = "train_data_full.npy"
params = [1.0, 1.0, 1000, 0.05, 1028, True, False, 4, (3, 2), 0.5] # 3,2 is best (presumably the sampling pair)
X_train, X_test, y_train, y_test = load_data(DATA_PATH, path) # 43.127 to beat

test_evaluation(X_train, y_train, X_test, y_test, reg_logistic_regression_experiment, params)

In [None]:
# First-round candidate evaluated on the held-out split of the curated dataset.
# params is positional, in the same order as the search grids:
#   lambda1 (ridge), lambda2 (lasso), max_iters, gamma, batchsize, plotting,
#   log transform, degree, (majority_to_minority_ratio, minority multiplier),
#   classification threshold
path = "train_data_curated.npy"
# NOTE(review): the trailing comment says "3,2 is best" but this cell uses the
# sampling pair (3, 3) - looks copied from the previous cell; confirm.
params = [0.1, 0.1, 1000, 0.05, 1028, True, False, 4, (3, 3), 0.5] # 3,2 is best
X_train, X_test, y_train, y_test = load_data(DATA_PATH, path) # 43.188 to beat

test_evaluation(X_train, y_train, X_test, y_test, reg_logistic_regression_experiment, params)

## Evaluate Best Models after Second Round

In [None]:
# best model of the second round, evaluated on the held-out split of the full dataset
# params is positional, in the same order as the search grids:
#   lambda1 (ridge), lambda2 (lasso), max_iters, gamma, batchsize, plotting,
#   log transform, degree, (majority_to_minority_ratio, minority multiplier),
#   classification threshold
path = "train_data_full.npy"
params = [0.05, 0.5, 1000, 0.05, 1028, True, True, 2, (3, 2), 0.5] # 3,2 is best (presumably the sampling pair)
X_train, X_test, y_train, y_test = load_data(DATA_PATH, path) # 43.429 to beat

test_evaluation(X_train, y_train, X_test, y_test, reg_logistic_regression_experiment, params)

In [None]:
# second best model

### Final Ensemble Model (0.4369)

In [None]:
## Final ensemble: the best parameter set ten times, each member fitted on a
## bootstrapped subsample of the data.
# Score to beat: 43.55 (bootstrapping with row_subsample=0.8); 0.4 was slightly
# better (0.4369) and is the best value found so far.
path = "train_data_full.npy"
threshold = 0.5
# Same positional layout as the search grids, minus the trailing threshold.
params = [0.05, 0.5, 1000, 0.05, 1028, False, True, 2, (3, 2)]  # 3,2 is best
X_train, X_test, y_train, y_test = load_data(DATA_PATH, path)
ensemble_params = [params for _member in range(10)]  # 10 times the best model

# Tested with row_subsample=0.4; the median ensemble is evaluated below as well.
(
    avg_pred_te,
    ensemble_opt_w,
    avg_loss_tr,
    avg_loss_te,
    _,
    ensemble_pred_te,
    median_votes,
) = ensemble_logistic_regression_experiment(
    y_train,
    y_test,
    X_train,
    X_test,
    ensemble_params,
    row_subsample=0.4,
    col_subsample=1,
)

# Placeholder passed in the train-prediction slot of compute_metrics; only the
# test metrics printed below are meaningful.
random_y = np.zeros(y_train.shape)
for vote_label, votes in (("Averaging", ensemble_pred_te), ("Median Voting", median_votes)):
    print(vote_label)
    acc_te, acc_tr, f1_te, f1_tr = compute_metrics(y_test, votes, y_train, random_y, threshold)
    print("F1_Test: {}".format(f1_te))

compute_confusion_matrix(y_test, ensemble_pred_te, threshold)
threshold_plot(ensemble_pred_te, y_test)

In [None]:
## Ensemble of the five best parameter sets, every set used twice.
# Result: 43.33 - worse than just taking the best one.
path = "train_data_full.npy"
X_train, X_test, y_train, y_test = load_data(DATA_PATH, path)
threshold = 0.5

# Top 5 parameters (same positional layout as the search grids, minus the
# trailing threshold).
ensemble_params = [
    [0.05, 0.5, 1000, 0.05, 1028, False, True, 2, (3, 2)],
    [0.01, 0.5, 1000, 0.05, 1028, False, True, 2, (3, 2)],
    [0.5, 0.5, 1000, 0.05, 1028, False, True, 2, (3, 3)],
    [0.1, 0.1, 1000, 0.05, 1028, False, False, 2, (3, 3)],
    [0.01, 0.1, 1000, 0.05, 1028, False, False, 2, (3, 3)],
]
ensemble_params = [*ensemble_params, *ensemble_params]  # every model twice

(
    avg_pred_te,
    ensemble_opt_w,
    avg_loss_tr,
    avg_loss_te,
    _,
    ensemble_pred_te,
    median_votes,
) = ensemble_logistic_regression_experiment(
    y_train,
    y_test,
    X_train,
    X_test,
    ensemble_params,
    row_subsample=0.8,
    col_subsample=1,
)

# Placeholder passed in the train-prediction slot of compute_metrics; only the
# test metrics printed below are meaningful.
random_y = np.zeros(y_train.shape)
for vote_label, votes in (("Averaging", ensemble_pred_te), ("Median Voting", median_votes)):
    print(vote_label)
    acc_te, acc_tr, f1_te, f1_tr = compute_metrics(y_test, votes, y_train, random_y, threshold)
    print("F1_Test: {}".format(f1_te))
compute_confusion_matrix(y_test, ensemble_pred_te, threshold)
threshold_plot(ensemble_pred_te, y_test)

## View Classification Threshold

In [None]:
# Inputs for the threshold sweep below: the best second-round parameter set on
# the full dataset. params is positional, in the same order as the search grids
# (..., plotting, log transform, degree, sampling pair, classification threshold).
path = "train_data_full.npy"
params = [0.05, 0.5, 1000, 0.05, 1028, True, True, 2, (3, 2), 0.5]
X_train, X_test, y_train, y_test = load_data(DATA_PATH, path)

In [None]:
def f_score_evaluation(X_train, y_train, X_test, y_test, model, model_parameters):
    """Train ``model`` once and plot the test F1 score against the classification threshold.

    Parameters
    ----------
    X_train, y_train, X_test, y_test
        Train/test features and labels. The training set is resampled with
        ``random_undersampling`` unless the sampling ratio is 0.
    model : callable
        Called as ``model(y_train, y_test, X_train, X_test, *parameters)``.
        Must return five values; the last two are the train and test
        predictions, indexed here as ``pred[:, 0]``.
    model_parameters : sequence
        Positional parameters, read from the end:
        ``[..., log_transform, degree, (majority_to_minority_ratio,
        minority_multiplier), classification_threshold]``.
        The trailing threshold is ignored (it is swept instead) and everything
        before ``log_transform`` is forwarded to ``model``.
        The sequence is NOT modified, so the call can be repeated with the
        same list (e.g. when the notebook cell is re-run).

    Returns
    -------
    list
        Test F1 score for each of 100 evenly spaced thresholds in [0, 1].
    """
    # Work on a copy without the trailing threshold. The original popped it
    # from the caller's list, which shifted every position on a second call.
    settings = list(model_parameters)[:-1]

    # extract data modification parameters
    majority_to_minority_ratio, minority_multiplier = settings[-1]
    degree = settings[-2]
    log_transform = settings[-3]

    # everything in front of the data modification parameters goes to the model
    parameters = settings[:-3]

    # undersample the training set if requested (ratio 0 = no sampling)
    if majority_to_minority_ratio != 0:
        X_train, y_train = random_undersampling(X_train, y_train, majority_to_minority_ratio, minority_multiplier)

    # Log features come from the un-expanded inputs, and only when requested:
    # computing them unconditionally raises warnings / yields NaN or -inf on
    # data that still contains values <= -1.
    log_x_tr = log_x_te = None
    if log_transform:
        log_x_tr = np.log(X_train + 1)
        log_x_te = np.log(X_test + 1)

    # build polynomial feature matrix
    X_train = build_poly(X_train, degree)
    X_test = build_poly(X_test, degree)

    # append the log-transformed columns to the polynomial features
    if log_transform:
        X_train = np.column_stack((X_train, log_x_tr))
        X_test = np.column_stack((X_test, log_x_te))

    # train the model
    _, _, _, pred_tr, pred_te = model(y_train, y_test, X_train, X_test, *parameters)

    # keep the first prediction column only
    pred_tr = pred_tr[:, 0]
    pred_te = pred_te[:, 0]

    f_score = []
    classification_thresholds = np.linspace(0, 1, 100)
    for classification_threshold in classification_thresholds:
        metrics = compute_metrics(y_test, pred_te, y_train, pred_tr, classification_threshold)
        f_score.append(metrics[2])  # metrics[2] -> test f-score

    # Create a line plot to visualize how F1 score changes with different thresholds
    plt.figure(figsize=(10, 6))
    plt.plot(classification_thresholds, f_score, marker='o')
    plt.xlabel('Classification Threshold')
    plt.ylabel('F1 Score')
    plt.title('F1 Score vs. Classification Threshold')
    plt.grid(True)
    plt.show()

    return f_score

In [None]:
# Sweep the classification threshold for the parameter set loaded above and
# plot the resulting F1 curve; the returned list of scores is shown as the
# cell output.
f_score_evaluation(X_train, y_train, X_test, y_test, reg_logistic_regression_experiment, params)