## Experiments

*Elżbieta Jowik* <br>
*Agata Makarewicz*

In [1]:
# imports
# NotebookFinder (from the project's utils module) is registered on sys.meta_path
# before the project modules below are imported -- presumably so they can be
# loaded from .ipynb files; confirm against utils.
import sys
from utils import NotebookFinder
sys.meta_path.append(NotebookFinder())

import os
from LogisticRegression import LogisticRegression
from ClassificationEvaluator import ClassificationEvaluator

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.neighbors import KNeighborsClassifier

from IPython.display import display, Markdown

import warnings
# NOTE: this silences every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Directory whose files are read by the data-loading cell.
data_directory='../datasets/preprocessed'

### Data loading

In [2]:
# Load every file of the data directory into a NumPy array, keyed by the file
# name up to its first dot.
# os.listdir() returns entries in arbitrary order, so the listing is sorted:
# the grouping below relies on the four files of one dataset being adjacent
# and always appearing in the same order.
# NOTE(review): confirm that sorted order yields (X_test, y_test, X_train, y_train)
# per dataset, which is how the experiment cells unpack each group.
frames = {}
for filename in sorted(os.listdir(data_directory)):
    data = pd.read_csv(f"{data_directory}/{filename}")
    frames[filename.split('.')[0]] = np.array(data)

keys_lst = list(frames.keys())
# Consecutive groups of four keys, one group per dataset.
subsets_lst = [tuple(keys_lst[i:i + 4]) for i in range(0, len(keys_lst), 4)]

### Experiments setup

In [3]:
# Default hyper-parameters used by the experiment cells.
# b1, b2 and eps are passed to fit() as "b1", "b2" and "epsilon" in the ADAM settings.
b1, b2, eps = 0.9, 0.999, 1e-8
# Passed as "sample_size" in the SGD settings.
sample_size = 1
# Passed as "alpha" in the GD, SGD and ADAM settings.
learning_rate = 1e-4

### 1. Convergence analysis: how does the value of the log-likelihood function depend on the number of iterations?

In [5]:
# Convergence run: patience and min_delta are handed to fit() as None
# (they only receive concrete values in the early-stopping section).
patience = min_delta = None
iterations = 1000

results = {}
logreg = LogisticRegression()

In [6]:
# Record the cost history returned by fit() for every optimiser on every dataset
# and write all histories to one CSV file (one column per optimiser/dataset pair).
#
# The histories are gathered in a dict created inside this cell, so re-running
# the cell starts from scratch instead of writing into whatever `results` was
# bound to by a previous run.
cost_histories = {}

for item in subsets_lst:

    # Only the training split is needed for the convergence curves.
    _, _, X_train_key, y_train_key = item
    X_train, y_train = frames[X_train_key], frames[y_train_key]
    # Training matrix with a leading column of ones; used for IRLS, GD and SGD.
    X_train_b = np.c_[np.ones((X_train.shape[0], 1)), X_train]

    dataset_name = X_train_key.split("_")[0]

    irls_kwds = {"iterations": iterations, "min_delta": min_delta, "patience": patience}
    gd_kwds = {"iterations": iterations, "alpha": learning_rate, "min_delta": min_delta, "patience": patience}
    sgd_kwds = {"iterations": iterations, "alpha": learning_rate, "sample_size": sample_size,
                "min_delta": min_delta, "patience": patience}
    adam_kwds = {"iterations": iterations, "b1": b1, "b2": b2, "alpha": learning_rate, "epsilon": eps,
                 "min_delta": min_delta, "patience": patience}

    # IRLS is not run on the 'adult' dataset.
    if dataset_name != 'adult':
        _, _, _, irls_cost_history = logreg.fit(X_train_b, y_train, **irls_kwds)
        cost_histories[f"irls_{dataset_name}"] = irls_cost_history

    _, _, _, gd_cost_history = logreg.fit(X_train_b, y_train, **gd_kwds)
    _, _, _, sgd_cost_history = logreg.fit(X_train_b, y_train, **sgd_kwds)
    # ADAM is fitted on the matrix without the column of ones.
    _, _, _, adam_cost_history = logreg.fit(X_train, y_train, **adam_kwds)

    cost_histories[f"gd_{dataset_name}"] = gd_cost_history
    cost_histories[f"sgd_{dataset_name}"] = sgd_cost_history
    cost_histories[f"adam_{dataset_name}"] = adam_cost_history

results = pd.DataFrame(cost_histories)
results.index.name = 'n_iter'
results.to_csv('./../results/convergence_analysis_results.csv')

### 2.1 Learning rate evaluation

In [7]:
# Settings for the learning-rate sweep.
b1, b2, eps = 0.9, 0.999, 1e-8
sample_size = 1
# A list in this experiment: the sweep cell fits once per value.
learning_rate = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2]

patience = min_delta = None
iterations = 1000

logreg = LogisticRegression()

In [8]:
# Learning-rate sweep: for every dataset, fit GD, SGD and ADAM once per learning
# rate, score each fit on the test split and write three CSV files per dataset
# (accuracy, F1-score and the cost histories returned by fit()).
algorithms = ['gd', 'sgd', 'adam']

for item in subsets_lst:

    # Per-dataset accumulators; each list receives one entry per learning rate.
    accuracy = {alg: [] for alg in algorithms}
    f1 = {alg: [] for alg in algorithms}
    history = {alg: [] for alg in algorithms}

    X_test_key, y_test_key, X_train_key, y_train_key = item
    X_test, y_test, X_train, y_train = frames[X_test_key], frames[y_test_key], frames[X_train_key], frames[y_train_key]
    # Matrices with a leading column of ones; used for GD and SGD.
    X_train_b = np.c_[np.ones((X_train.shape[0], 1)), X_train]
    X_test_b = np.c_[np.ones((X_test.shape[0], 1)), X_test]
    # Flatten the rows of the loaded label array into one flat list.
    y_test = [y for row in y_test for y in row]

    dataset_name = X_train_key.split("_")[0]

    for lr in learning_rate:
        gd_kwds = {"iterations": iterations, "alpha": lr, "min_delta": min_delta, "patience": patience}
        sgd_kwds = {"iterations": iterations, "alpha": lr, "sample_size": sample_size,
                    "min_delta": min_delta, "patience": patience}
        adam_kwds = {"iterations": iterations, "b1": b1, "b2": b2, "alpha": lr, "epsilon": eps,
                     "min_delta": min_delta, "patience": patience}

        # The fits always run in the order GD, SGD, ADAM.
        gd_params, _, _, gd_cost_history = logreg.fit(X_train_b, y_train, **gd_kwds)
        _, gd_pred = logreg.predict(X_test_b, gd_params)

        sgd_params, _, _, sgd_cost_history = logreg.fit(X_train_b, y_train, **sgd_kwds)
        _, sgd_pred = logreg.predict(X_test_b, sgd_params)

        # ADAM is fitted on the matrices without the column of ones; its
        # parameters come back as a pair whose items are passed separately to predict().
        adam_params, _, _, adam_cost_history = logreg.fit(X_train, y_train, **adam_kwds)
        _, adam_pred = logreg.predict(X_test, adam_params[0], adam_params[1])

        fitted = (
            ('gd', gd_pred, gd_cost_history),
            ('sgd', sgd_pred, sgd_cost_history),
            ('adam', adam_pred, adam_cost_history),
        )
        for alg, pred, cost_history in fitted:
            accuracy[alg].append(ClassificationEvaluator().accuracy(y_test, pred))
            f1[alg].append(ClassificationEvaluator().f1_score(y_test, pred))
            history[alg].append(cost_history)

    lr_index = pd.Index(learning_rate, name="Learning rate")

    # Accuracy: one row per learning rate, one column per algorithm, saved in long format.
    results_acc = pd.DataFrame(accuracy, columns=algorithms, index=lr_index)
    results_acc_melted = pd.melt(results_acc.reset_index(), id_vars=['Learning rate'], value_vars=algorithms)
    results_acc_melted.columns = ['Learning rate', 'Algorithm', 'Accuracy']
    results_acc_melted.to_csv(f"./../results/learning_rate_acc_{dataset_name}.csv")

    # F1-score: same layout as the accuracy table.
    results_f1 = pd.DataFrame(f1, columns=algorithms, index=lr_index)
    results_f1_melted = pd.melt(results_f1.reset_index(), id_vars=['Learning rate'], value_vars=algorithms)
    results_f1_melted.columns = ['Learning rate', 'Algorithm', 'F1-Score']
    results_f1_melted.to_csv(f"./../results/learning_rate_f1_{dataset_name}.csv")

    # Cost histories: one column per (algorithm, learning rate) pair, one row per history entry.
    cost_results = pd.concat([pd.DataFrame(history[alg]).T for alg in algorithms], axis=1)
    cost_results.columns = [f"{alg}_{rate:.0e}" for alg in algorithms for rate in learning_rate]
    cost_results.index.name = 'n_iter'
    cost_results.to_csv(f'./../results/learning_rate_cost_{dataset_name}.csv')

### 2.2 ADAM b1/b2 evaluation

In [11]:
# Settings for the ADAM b1/b2 grid.
b1s = [0.5, 0.6, 0.7, 0.8, 0.9, 0.999]
b2s = list(b1s)  # b2 is swept over the same candidate values as b1
eps = 1e-8
sample_size = 1
learning_rate = 1e-4

patience = min_delta = None
iterations = 1000

results = pd.DataFrame()
logreg = LogisticRegression()

In [12]:
# Grid over ADAM's b1/b2: for every dataset, fit ADAM once per (b1, b2) pair and
# store the test accuracy in a table with b1 as rows and b2 as columns.
for item in subsets_lst:

    adam_evaluation = []

    X_test_key, y_test_key, X_train_key, y_train_key = item
    X_test, y_test, X_train, y_train = frames[X_test_key], frames[y_test_key], frames[X_train_key], frames[y_train_key]
    # Flatten the rows of the loaded label array into one flat list.
    y_test = [y for row in y_test for y in row]

    dataset_name = X_train_key.split("_")[0]

    for b1 in b1s:
        adam_b1_evaluation = []
        for b2 in b2s:
            # The step size is the `learning_rate` configured for this experiment,
            # not a loop variable left behind by another cell.
            adam_kwds = {"iterations": iterations, "b1": b1, "b2": b2, "alpha": learning_rate, "epsilon": eps,
                         "min_delta": min_delta, "patience": patience}

            # ADAM returns its parameters as a pair; both items are passed to predict().
            adam_params, _, _, _ = logreg.fit(X_train, y_train, **adam_kwds)
            _, adam_pred = logreg.predict(X_test, adam_params[0], adam_params[1])
            adam_metrics = ClassificationEvaluator().accuracy(y_test, adam_pred)

            adam_b1_evaluation.append(adam_metrics)
        adam_evaluation.append(adam_b1_evaluation)

    results = pd.DataFrame(adam_evaluation)
    results.columns = b2s
    results.index = b1s
    results.index.name = "b1"

    results.to_csv(f"./../results/adam_b1_b2_{dataset_name}.csv")

### 3. Benchmark with LDA, QDA & KNN

In [13]:
# Settings for the benchmark against LDA, QDA and KNN.
b1, b2, eps = 0.9, 0.999, 1e-8
sample_size = 1
learning_rate = 1e-4

patience = min_delta = None
iterations = 1000

logreg = LogisticRegression()

In [14]:
# Reference classifiers from scikit-learn: LDA and QDA with default arguments,
# KNN with 20 neighbours.
lda, qda = LDA(), QDA()
knn = KNeighborsClassifier(n_neighbors=20)

In [15]:
# Benchmark: on every dataset, score the logistic-regression optimisers and the
# scikit-learn reference models on the test split and write one table per
# dataset (metrics as rows, models as columns).
for item in subsets_lst:

    X_test_key, y_test_key, X_train_key, y_train_key = item
    X_test, y_test, X_train, y_train = frames[X_test_key], frames[y_test_key], frames[X_train_key], frames[y_train_key]
    # Matrices with a leading column of ones; used for IRLS, GD and SGD.
    X_train_b = np.c_[np.ones((X_train.shape[0], 1)), X_train]
    X_test_b = np.c_[np.ones((X_test.shape[0], 1)), X_test]
    # Flatten the rows of the loaded label array into one flat list.
    y_test = [y for row in y_test for y in row]
    # The loaded training labels are a 2-D array (np.array of a DataFrame);
    # the scikit-learn estimators are given an explicit 1-D label vector instead
    # of relying on an implicit conversion whose warning is suppressed.
    y_train_1d = np.ravel(y_train)

    dataset_name = X_train_key.split("_")[0]

    irls_kwds = {"iterations": iterations, "min_delta": min_delta, "patience": patience}
    gd_kwds = {"iterations": iterations, "alpha": learning_rate, "min_delta": min_delta, "patience": patience}
    sgd_kwds = {"iterations": iterations, "alpha": learning_rate, "sample_size": sample_size,
                "min_delta": min_delta, "patience": patience}
    adam_kwds = {"iterations": iterations, "b1": b1, "b2": b2, "alpha": learning_rate, "epsilon": eps,
                 "min_delta": min_delta, "patience": patience}

    # Metrics keyed by row label, in the order the models appear in the table.
    metrics = {}

    # IRLS is not run on the 'adult' dataset, so its entry is simply absent there.
    if dataset_name != 'adult':
        irls_params, _, _, _ = logreg.fit(X_train_b, y_train, **irls_kwds)
        _, irls_pred = logreg.predict(X_test_b, irls_params)
        metrics['LR - IRLS'] = ClassificationEvaluator().calculate_all(y_test, irls_pred)

    gd_params, _, _, _ = logreg.fit(X_train_b, y_train, **gd_kwds)
    _, gd_pred = logreg.predict(X_test_b, gd_params)
    metrics['LR - GD'] = ClassificationEvaluator().calculate_all(y_test, gd_pred)

    sgd_params, _, _, _ = logreg.fit(X_train_b, y_train, **sgd_kwds)
    _, sgd_pred = logreg.predict(X_test_b, sgd_params)
    metrics['LR - SGD'] = ClassificationEvaluator().calculate_all(y_test, sgd_pred)

    # ADAM is fitted on the matrices without the column of ones; its parameters
    # come back as a pair whose items are passed separately to predict().
    adam_params, _, _, _ = logreg.fit(X_train, y_train, **adam_kwds)
    _, adam_pred = logreg.predict(X_test, adam_params[0], adam_params[1])
    metrics['LR - ADAM'] = ClassificationEvaluator().calculate_all(y_test, adam_pred)

    for label, model in (('LDA', lda), ('QDA', qda), ('KNN', knn)):
        model.fit(X_train, y_train_1d)
        metrics[label] = ClassificationEvaluator().calculate_all(y_test, model.predict(X_test))

    results = pd.DataFrame(list(metrics.values()))
    results.index = list(metrics.keys())
    results.columns = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
    results = results.transpose()
    results.index.name = "Metric"

    results.to_csv(f"./../results/comparison_with_popular_{dataset_name}.csv")

### 4. Early stopping

In [16]:
# Settings for the early-stopping run: unlike the earlier sections, patience and
# min_delta are given concrete values here.
b1, b2, eps = 0.9, 0.999, 1e-8
sample_size = 1
learning_rate = 1e-4

patience, min_delta = 50, 1e-4
iterations = 1000

logreg = LogisticRegression()

In [17]:
# Fresh reference classifiers from scikit-learn: LDA and QDA with default
# arguments, KNN with 20 neighbours.
lda, qda = LDA(), QDA()
knn = KNeighborsClassifier(n_neighbors=20)

In [18]:
# Early-stopping benchmark: on every dataset, score the logistic-regression
# optimisers (fitted with the current patience / min_delta) and the scikit-learn
# reference models on the test split, display the table and write it to CSV
# (metrics as rows, models as columns).
for item in subsets_lst:

    X_test_key, y_test_key, X_train_key, y_train_key = item
    X_test, y_test, X_train, y_train = frames[X_test_key], frames[y_test_key], frames[X_train_key], frames[y_train_key]
    # Matrices with a leading column of ones; used for IRLS, GD and SGD.
    X_train_b = np.c_[np.ones((X_train.shape[0], 1)), X_train]
    X_test_b = np.c_[np.ones((X_test.shape[0], 1)), X_test]
    # Flatten the rows of the loaded label array into one flat list.
    y_test = [y for row in y_test for y in row]
    # The loaded training labels are a 2-D array (np.array of a DataFrame);
    # the scikit-learn estimators are given an explicit 1-D label vector instead
    # of relying on an implicit conversion whose warning is suppressed.
    y_train_1d = np.ravel(y_train)

    dataset_name = X_train_key.split("_")[0]

    irls_kwds = {"iterations": iterations, "min_delta": min_delta, "patience": patience}
    gd_kwds = {"iterations": iterations, "alpha": learning_rate, "min_delta": min_delta, "patience": patience}
    sgd_kwds = {"iterations": iterations, "alpha": learning_rate, "sample_size": sample_size,
                "min_delta": min_delta, "patience": patience}
    adam_kwds = {"iterations": iterations, "b1": b1, "b2": b2, "alpha": learning_rate, "epsilon": eps,
                 "min_delta": min_delta, "patience": patience}

    # Metrics keyed by row label, in the order the models appear in the table.
    metrics = {}

    # IRLS is not run on the 'adult' dataset, so its entry is simply absent there.
    if dataset_name != 'adult':
        irls_params, _, _, _ = logreg.fit(X_train_b, y_train, **irls_kwds)
        _, irls_pred = logreg.predict(X_test_b, irls_params)
        metrics['LR - IRLS'] = ClassificationEvaluator().calculate_all(y_test, irls_pred)

    gd_params, _, _, _ = logreg.fit(X_train_b, y_train, **gd_kwds)
    _, gd_pred = logreg.predict(X_test_b, gd_params)
    metrics['LR - GD'] = ClassificationEvaluator().calculate_all(y_test, gd_pred)

    sgd_params, _, _, _ = logreg.fit(X_train_b, y_train, **sgd_kwds)
    _, sgd_pred = logreg.predict(X_test_b, sgd_params)
    metrics['LR - SGD'] = ClassificationEvaluator().calculate_all(y_test, sgd_pred)

    # ADAM is fitted on the matrices without the column of ones; its parameters
    # come back as a pair whose items are passed separately to predict().
    adam_params, _, _, _ = logreg.fit(X_train, y_train, **adam_kwds)
    _, adam_pred = logreg.predict(X_test, adam_params[0], adam_params[1])
    metrics['LR - ADAM'] = ClassificationEvaluator().calculate_all(y_test, adam_pred)

    for label, model in (('LDA', lda), ('QDA', qda), ('KNN', knn)):
        model.fit(X_train, y_train_1d)
        metrics[label] = ClassificationEvaluator().calculate_all(y_test, model.predict(X_test))

    results = pd.DataFrame(list(metrics.values()))
    results.index = list(metrics.keys())
    results.columns = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
    results = results.transpose()
    results.index.name = "Metric"
    display(results)

    results.to_csv(f"./../results/comparison_with_popular_early_stop_{dataset_name}.csv")