### Classifiers and Regressors

#### Classifiers and Regressors helper cells

In [1]:
import pickle
from sklearn.utils import shuffle
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, GradientBoostingRegressor, RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import KFold
from sklearn.inspection import permutation_importance
import numpy as np

In [2]:
# datasets: list of datasets or single dataset, depending on if multiple_datasets is set to True
# make_categorical_data: transform numeric -1, 0, 1 data to categorical data, for each dimension 2 new ones, 
#       one with either SIMP/NOT_SIMP and one for ABSTAIN
# use1249LFs: use the fully unpruned dataset
def load_data(datasets, make_categorical_data, use1249LFs=False, multiple_datasets=False):
    KAT = ''
    if make_categorical_data:
        KAT = '_KAT'

    data_merged = []
    labels = []

    if not multiple_datasets:
        datasets = [datasets]

    for d_s in datasets:    
        if use1249LFs:
            simp_path = f"/workspace/datasets/__all_LFs/{d_s}-1249_simp_labels.pkl"        
            src_path = f"/workspace/datasets/__all_LFs/{d_s}-1249_src_labels.pkl"  
        else:
            simp_path = f"/workspace/datasets/ds_labels/{d_s}_simp_labels.pkl"        
            src_path = f"/workspace/datasets/ds_labels/{d_s}_src_labels.pkl"  

        simp_labels = pickle.load(open(simp_path, "rb"))
        src_labels = pickle.load(open(src_path, "rb")) 

        for entry in simp_labels:
            if make_categorical_data:
                new_ent = []
                for e in entry:
                    if e == -1:
                        new_ent.append(0)
                        new_ent.append(0)

                    if e == 0:
                        new_ent.append(1)
                        new_ent.append(0)

                    if e == 1:
                        new_ent.append(0)
                        new_ent.append(1)
                    
                data_merged.append(new_ent)
            else:
                #data_merged.append()
                data_merged.append([abs(number) for number in entry.tolist()])

        for entry in src_labels:
            if make_categorical_data:
                new_ent = []
                for e in entry:
                    if e == -1:
                        new_ent.append(0)
                        new_ent.append(0)

                    if e == 0:
                        new_ent.append(1)
                        new_ent.append(0)

                    if e == 1:
                        new_ent.append(0)
                        new_ent.append(1)

                data_merged.append(new_ent)
            else:
                #data_merged.append(entry.tolist())
                data_merged.append([abs(number) for number in entry.tolist()])


        curr_lab = [0] * len(simp_labels) + [1] * len(simp_labels)
        labels = labels + curr_lab

    X, y = shuffle(data_merged, labels, random_state=42)
    return X, y, KAT

In [3]:
clf_gb = GradientBoostingClassifier(random_state=42)
clf_rf = RandomForestClassifier(random_state=42)

clfs = {'gb': clf_gb, 'rf': clf_rf}

reg_gb = GradientBoostingRegressor(random_state=42)
reg_rf = RandomForestRegressor(random_state=42)
reg_mlp = MLPRegressor(random_state=42)

#regs = {'reg_gb': reg_gb, 'reg_rf': reg_rf, 'reg_mlp': reg_mlp}
regs = {'reg_rf': reg_rf}

In [4]:
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error


In [41]:
def reg_scoring(y_true, y_pred):
    thres_y_pred = np.floor(y_pred + 0.5)

    # Calculate the mean of the true values
    #mean_y_true = np.mean(y_true)
 
    # Calculate the sum of squares of residuals and total sum of squares
    #ss_res = np.sum((y_true - thres_y_pred) ** 2)
    #ss_tot = np.sum((y_true - mean_y_true) ** 2)
 
    # Calculate R²
    #r2 = 1 - (ss_res / ss_tot)
 
    rms = mean_squared_error(y_true, thres_y_pred, squared=False)

    return rms

def run_reg(run, X, y, run_type, n=10):
    if run:
        reg_score = make_scorer(reg_scoring)

        output = ''
        kfold = KFold(n_splits=n, shuffle=True, random_state=42)
        for clf in regs:
            cv_scores = cross_val_score(regs[clf], X, y, cv=kfold, scoring='neg_mean_absolute_error')

            #cv_scores = cross_val_score(regs[clf], X, y, cv=kfold, scoring=reg_score)

            output += clf + ' (' + run_type + ') : '
            output += str(sum(cv_scores)/len(cv_scores)) + '\n'
            output += str(cv_scores) + '\n___\n'
        return output, sum(cv_scores)/len(cv_scores)
    return '', 0

#### Classifiers and Regressors run

In [43]:
def run():
    datasets = 'MTurkSF' #'britannica', 'ASSET', 'MTurkSF', 'Wiki-Manual','eval'

    n = 10

    X, y, KAT = load_data(datasets, make_categorical_data=False, use1249LFs=True)
    print(run_reg(True, X, y, 'vanilla' + KAT, n)[0])

run()

reg_rf (vanilla) : -0.2964152559192787
[-0.29351328 -0.30814178 -0.26473652 -0.24387825 -0.28358206 -0.31884504
 -0.25424662 -0.32070644 -0.35622814 -0.32027442]
___

