### Classifiers and Regressors

#### Classifiers and Regressors help cells

In [1]:
import pickle
from sklearn.utils import shuffle
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, GradientBoostingRegressor, RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import KFold
from sklearn.inspection import permutation_importance
import numpy as np

In [2]:
def _load_pickle(path):
    """Read one pickle file and close the handle afterwards."""
    # NOTE: pickle can execute arbitrary code while loading; only read trusted files.
    with open(path, "rb") as handle:
        return pickle.load(handle)


def _encode_categorical(entry):
    """Expand every -1/0/1 value of `entry` into two indicator columns.

    -1 -> (0, 0), 0 -> (1, 0), 1 -> (0, 1). Any other value contributes no
    columns, exactly as in the original inline loops.
    """
    encoded = []
    for value in entry:
        if value == -1:
            encoded.extend((0, 0))
        elif value == 0:
            encoded.extend((1, 0))
        elif value == 1:
            encoded.extend((0, 1))
    return encoded


def load_data(datasets, make_categorical_data, use1249LFs=False, multiple_datasets=False):
    """Load the pickled label matrices of one or more datasets as a shuffled X, y pair.

    For every dataset a ``*_simp_labels.pkl`` and a ``*_src_labels.pkl`` file are
    read. Rows from the simp file get class label 0, rows from the src file get
    class label 1.

    Args:
        datasets: a single dataset name, or a list of names when
            `multiple_datasets` is True.
        make_categorical_data: if True, every numeric -1/0/1 value is replaced by
            two indicator columns (see `_encode_categorical`); otherwise each row
            is converted with ``entry.tolist()``.
        use1249LFs: if True, read the fully unpruned label files from
            ``__all_LFs`` (``<name>-1249_*``) instead of ``ds_labels``.
        multiple_datasets: set to True when `datasets` is a list of names.

    Returns:
        (X, y, KAT): the rows and class labels shuffled together with
        ``random_state=42``, and the suffix '_KAT' (categorical) or '' that
        callers append to their run labels.
    """
    KAT = '_KAT' if make_categorical_data else ''

    if not multiple_datasets:
        datasets = [datasets]

    data_merged = []
    labels = []

    for d_s in datasets:
        if use1249LFs:
            simp_path = f"/workspace/datasets/__all_LFs/{d_s}-1249_simp_labels.pkl"
            src_path = f"/workspace/datasets/__all_LFs/{d_s}-1249_src_labels.pkl"
        else:
            simp_path = f"/workspace/datasets/ds_labels/{d_s}_simp_labels.pkl"
            src_path = f"/workspace/datasets/ds_labels/{d_s}_src_labels.pkl"

        simp_labels = _load_pickle(simp_path)
        src_labels = _load_pickle(src_path)

        # simp rows first, then src rows, so the class labels below line up.
        for label_rows in (simp_labels, src_labels):
            for entry in label_rows:
                if make_categorical_data:
                    data_merged.append(_encode_categorical(entry))
                else:
                    data_merged.append(entry.tolist())

        # One label per row actually appended; the src count was previously taken
        # from simp_labels, which misaligns X and y when the two files differ in length.
        labels += [0] * len(simp_labels) + [1] * len(src_labels)

    X, y = shuffle(data_merged, labels, random_state=42)
    return X, y, KAT

In [3]:
# Estimators shared by the helper functions below; every model is seeded with 42.
clfs = {
    'gb': GradientBoostingClassifier(random_state=42),
    'rf': RandomForestClassifier(random_state=42),
}
# Module-level aliases for the individual classifiers.
clf_gb, clf_rf = clfs['gb'], clfs['rf']

regs = {
    'reg_gb': GradientBoostingRegressor(random_state=42),
    'reg_rf': RandomForestRegressor(random_state=42),
    'reg_mlp': MLPRegressor(random_state=42),
}
# Module-level aliases for the individual regressors.
reg_gb, reg_rf, reg_mlp = regs['reg_gb'], regs['reg_rf'], regs['reg_mlp']

In [4]:
def run_classifiers(run, X, y, run_type, n=10):
    """Cross-validate every classifier in `clfs` and build a text report.

    Args:
        run: when falsy nothing is evaluated and ``('', 0)`` is returned.
        X: feature rows.
        y: class labels, one per row of X.
        run_type: label inserted into each report line.
        n: number of KFold splits. Defaults to 10 so that callers which omit
            it (as `run_feature_selection` does) no longer raise a TypeError.

    Returns:
        (report, score): `report` lists, per classifier, the mean
        cross-validation score followed by the individual fold scores.
        `score` is the mean score of the LAST classifier in `clfs` only
        (0 if `clfs` is empty).
    """
    if not run:
        return '', 0

    kfold = KFold(n_splits=n, shuffle=True, random_state=42)
    report_parts = []
    last_mean = 0
    for name, clf in clfs.items():
        cv_scores = cross_val_score(clf, X, y, cv=kfold)
        last_mean = sum(cv_scores) / len(cv_scores)
        report_parts.append(name + ' (' + run_type + ') : ')
        report_parts.append(str(last_mean) + '\n')
        report_parts.append(str(cv_scores) + '\n___\n')
    return ''.join(report_parts), last_mean

def run_single_classifier(X_train, y_train, clf_type, n, X_test=None, y_test=None):
    """Fit one classifier from `clfs` and rank features by permutation importance.

    Args:
        X_train, y_train: data the classifier is fitted on.
        clf_type: key into `clfs` ('gb' or 'rf').
        n: number of top feature indices to return; a negative value returns
            the full vector of mean importances instead.
        X_test, y_test: optional evaluation data. When either is missing or
            empty, the importances are computed on the training data and the
            returned score stays -1.

    Returns:
        (top_LFs, score): indices of the `n` most important features in
        descending order of importance (or all mean importances when n < 0),
        and the classifier's score on the test data (-1 without test data).
    """
    score = -1
    # Note: this fits the shared estimator object stored in `clfs` in place.
    curr_clf = clfs[clf_type].fit(X_train, y_train)

    # Explicit None/length checks: `if X_test and y_test` raises for numpy arrays.
    has_test_data = (
        X_test is not None and y_test is not None
        and len(X_test) > 0 and len(y_test) > 0
    )
    if has_test_data:
        score = curr_clf.score(X_test, y_test)
    else:
        X_test = X_train
        y_test = y_train

    # find dimensions of most important LFs; seeded like every other random step here
    perm_importance = permutation_importance(
        curr_clf, X_test, y_test, random_state=42
    ).importances_mean
    if n < 0:
        return perm_importance, score
    top_LFs = np.argsort(perm_importance)[::-1][:n]

    return top_LFs, score

def run_regressors(run, X, y, n):
    """Cross-validate every regressor in `regs` and build a text report.

    Returns ``('', 0)`` when `run` is falsy. Otherwise returns the report
    (mean score and fold scores per regressor) together with the mean
    cross-validation score of the last regressor in `regs`.
    """
    if not run:
        return '', 0

    folds = KFold(n_splits=n, shuffle=True, random_state=42)
    lines = []
    for name, model in regs.items():
        cv_scores = cross_val_score(model, X, y, cv=folds)
        lines.append(name + ': ')
        lines.append(str(sum(cv_scores) / len(cv_scores)) + '\n')
        lines.append(str(cv_scores) + '\n___\n')

    # cv_scores still holds the folds of the last regressor evaluated above.
    return ''.join(lines), sum(cv_scores) / len(cv_scores)

#### Classifiers and Regressors run

In [5]:
def run():
    """Load the configured datasets and print the classifier report.

    The regressors are currently switched off (their `run` flag is False), so
    the second print emits an empty string.
    """
    datasets = ['Wiki-Manual'] #'britannica', 'ASSET', 'MTurkSF', 'Wiki-Manual','eval'

    n = 10

    # `datasets` is a list, so multiple_datasets must be True; otherwise load_data
    # wraps the list again and formats the whole list into the file path.
    X, y, KAT = load_data(datasets, make_categorical_data=True, multiple_datasets=True)
    print(run_classifiers(True, X, y, 'vanilla' + KAT, n)[0])
    print(run_regressors(False, X, y, n)[0])

# run()

### Feature Selection

#### Feature Selection help cells

In [6]:
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.inspection import permutation_importance
import numpy as np
import pandas as pd
from tqdm import tqdm

In [7]:
def run_naive_feature_selection(X, y, k, naive_categorical_tuples):
    """Keep the k dimensions whose per-class sums differ the most.

    For every dimension the values of the rows with label 0 and of all other
    rows are summed separately; the absolute difference of the two sums is the
    dimension's distance. Up to k dimensions with the largest distances are kept.

    Args:
        X: rows of numeric feature values.
        y: class label per row of X.
        k: number of dimensions to keep.
        naive_categorical_tuples: if True, a kept dimension pulls in both
            columns of its pair (2 * base and 2 * base + 1, where
            base = int(dim / 2)), each pair being emitted once.

    Returns:
        A list of rows reduced to the chosen columns.
    """
    kept = []  # (dimension, distance) pairs, never more than k entries

    for dim in range(len(X[0])):
        simp_total = 0
        src_total = 0
        for row_idx in range(len(X)):
            value = X[row_idx][dim]
            if y[row_idx] == 0:
                simp_total += value
            else:
                src_total += value
        gap = abs(simp_total - src_total)

        if len(kept) < k:
            kept.append((dim, gap))
            continue

        # Look for the kept entry with the smallest distance that is still
        # strictly below the current one; -1 marks "nothing found yet".
        weakest_pos = -1
        weakest_gap = -1
        for pos, (_, kept_gap) in enumerate(kept):
            if kept_gap < gap and (weakest_gap == -1 or weakest_gap > kept_gap):
                weakest_pos = pos
                weakest_gap = kept_gap

        if weakest_pos > -1:
            kept[weakest_pos] = (dim, gap)

    X_naive = []
    for row_idx in range(len(X)):
        row = X[row_idx]
        reduced = []
        seen_bases = []
        for dim, _ in kept:
            if not naive_categorical_tuples:
                reduced.append(row[dim])
                continue
            base = int(dim / 2)
            if base in seen_bases:
                # the pair of this dimension has already been added
                continue
            seen_bases.append(base)
            first_col = 2 * base
            reduced.append(row[first_col])
            reduced.append(row[first_col + 1])
        X_naive.append(reduced)
    return X_naive

def run_chi2(X, y, k):
    """Reduce X to the k features that SelectKBest ranks highest with chi2."""
    # NOTE(review): sklearn's chi2 expects non-negative feature values - confirm
    # that callers only pass the categorical (0/1) encoding.
    selector = SelectKBest(chi2, k=k)
    return selector.fit_transform(X, y)

def run_mean_importance(X, y, k, n):
    """Keep the k features with the highest mean permutation importance.

    `clf_gb` is fitted on the training part of each of `n` KFold splits, and
    the permutation importance is measured on the held-out part. The
    importances are averaged over all folds and the k best columns are kept.

    Args:
        X: feature rows.
        y: class label per row of X.
        k: number of features to keep.
        n: number of KFold splits.

    Returns:
        A list of rows containing only the selected columns, most important first.
    """
    kfold = KFold(n_splits=n, shuffle=True, random_state=42)

    # Convert once instead of on every fold.
    X_arr = np.asarray(X)
    y_arr = np.asarray(y)

    feature_importances = []
    for train_index, test_index in kfold.split(X_arr):
        # Note: this refits the shared module-level classifier in place.
        clf_gb.fit(X_arr[train_index], y_arr[train_index])

        # Seeded so that the selected columns are reproducible between runs.
        perm_importance = permutation_importance(
            clf_gb, X_arr[test_index], y_arr[test_index], random_state=42
        )
        feature_importances.append(perm_importance.importances_mean)

    mean_importance = np.mean(feature_importances, axis=0)
    top_k_indices = np.argsort(mean_importance)[::-1][:k]

    return [[dp[dim] for dim in top_k_indices] for dp in X]

def run_random(X, y, k):
    """Keep k columns of X chosen by a seeded shuffle (baseline selection).

    Args:
        X: feature rows.
        y: unused; kept so the signature matches the other selection helpers.
        k: number of columns to keep.

    Returns:
        A list of rows containing only the chosen columns.
    """
    # Shuffle ALL column indices. The previous range(0, len(X[0]) - 1) left out
    # the last column, so it could never be selected.
    all_dims = list(range(len(X[0])))
    dims_to_include = shuffle(all_dims, random_state=42)[:k]

    return [[row[dim] for dim in dims_to_include] for row in X]

#### Feature Selection run

In [8]:
def run_feature_selection(X, y, lower=10, upper=-1, n=10, KAT=''):
    """Score four feature-selection strategies for a growing number of features.

    For every feature count in ``range(lower, upper, 10)`` the data is reduced
    with the naive, chi2, mean-importance and random selection, and each
    reduced data set is scored with `run_classifiers`.

    Args:
        X: feature rows.
        y: class label per row of X.
        lower: smallest number of features to try.
        upper: exclusive upper bound; -1 means ``len(X[0]) - 1``.
        n: number of cross-validation folds. Previously read from an undefined
            global, and not passed on to `run_classifiers` at all.
        KAT: suffix for the run labels (as returned by `load_data`). Previously
            read from a global.

    Returns:
        (all_res, lower, upper): one ``[naive, chi2, mean importance, random]``
        score list per feature count, plus the bounds that were actually used.

    Raises:
        ValueError: if a requested feature count exceeds the number of columns.
    """
    if upper == -1:
        upper = len(X[0]) - 1
    all_res = []

    for num_cat in tqdm(range(lower, upper, 10)):
        if len(X[0]) < num_cat:
            # Raise instead of exit(): exit() would shut down the notebook kernel.
            raise ValueError(
                'Number of requested features (' + str(num_cat) + ') exceeds the '
                'number of dimensions of the input data (' + str(len(X[0])) + ')!'
            )

        X_naive = run_naive_feature_selection(X, y, num_cat, False)
        X_chi2 = run_chi2(X, y, num_cat)
        X_mean_importance = run_mean_importance(X, y, num_cat, n)
        X_random = run_random(X, y, num_cat)

        suffix = KAT + '_' + str(num_cat)
        naive_score = run_classifiers(True, X_naive, y, 'naive' + suffix, n)[1]
        chi_score = run_classifiers(True, X_chi2, y, 'chi2' + suffix, n)[1]
        meanImp_score = run_classifiers(True, X_mean_importance, y, 'mean importance' + suffix, n)[1]
        rand_score = run_classifiers(True, X_random, y, 'random' + suffix, n)[1]

        all_res.append([naive_score, chi_score, meanImp_score, rand_score])
    return all_res, lower, upper

#### Feature Selection Visualisation

In [9]:
!pip install seaborn

[0m

In [10]:
import pickle as pkl
import seaborn as sns
from scipy import stats

In [11]:
def safe_visualisation(datasets, newly_generated=False, all_res=None,
                       base_dir='/workspace/datasets/performance_of_features/'):
    """Save or load the feature-selection results of the first dataset.

    Args:
        datasets: list of dataset names; only ``datasets[0]`` is used for the
            file name.
        newly_generated: if True, `all_res` is written to disk; otherwise the
            stored results are read back.
        all_res: results to store. Required when `newly_generated` is True;
            the function previously referenced an unbound local here.
        base_dir: directory that holds the result files.

    Returns:
        The results that were saved or loaded. The loaded value was previously
        discarded.

    Raises:
        ValueError: if `newly_generated` is True but no `all_res` is given.
    """
    import os  # local import: os is not imported elsewhere in this notebook

    path = os.path.join(base_dir, datasets[0] + '_performance_of_feature_selection.pkl')

    if newly_generated:
        if all_res is None:
            raise ValueError('all_res must be passed when newly_generated is True')
        with open(path, 'wb') as f:
            pkl.dump(all_res, f)
    else:
        # NOTE: pickle can execute arbitrary code while loading; only read trusted files.
        with open(path, 'rb') as f:
            all_res = pkl.load(f)
    return all_res

def run_visualisation(all_res, datasets, X, lower=10, upper=-1):
    """Draw one line per selection strategy over the number of kept dimensions.

    `all_res` holds one ``[naive, chi2, mean importance, random]`` row per
    feature count in ``range(lower, upper, 10)``. An `upper` of -1 stands for
    ``len(X[0]) - 1``. The plot title is ``datasets[0]``.
    """
    stop = len(X[0]) - 1 if upper == -1 else upper
    strategy_names = ['naive approach', 'chi2', 'mean importance', 'random']
    scores = pd.DataFrame(all_res, index=range(lower, stop, 10), columns=strategy_names)

    axes = sns.lineplot(data=scores)
    axes.set(title=datasets[0], xlabel='Number of Dimensions', ylabel='Mean Accuracy')

### Find most important LF per Dataset

In [12]:
# Collect the permutation importances of each dataset in `datasets`.
# `datasets` is currently empty, so the loop body does not execute.
datasets = [] #['MTurkSF']#, 'simpa']
top_dims_single = []

for ds in datasets:
    # A single dataset name is passed, so multiple_datasets keeps its default (False).
    X, y, KAT = load_data(ds, make_categorical_data=False, use1249LFs=True)
    # n = -1: the full vector of mean importances is returned, not top-n indices.
    # No test data is given, so the importances are computed on the training data.
    topLFs, score = run_single_classifier(X, y, 'gb', -1)
    top_dims_single.append(topLFs)

#### Rank correlation

In [13]:
# Compute the permutation importances of every dataset, keyed by dataset name.
datasets = ['BenchLS', 'britannica', 'HutSSF', 'MTurkSF', 'NNSeval', 'SemEval_2007', 'simpa', 'Wiki-Manual']

top_LFs_of_all_ds = {}

for a in datasets:
    # The unpruned label files are loaded without the categorical encoding.
    X, y, KAT = load_data(a, make_categorical_data=False, use1249LFs=True)
    # n = -1: despite its name, topLFs holds the mean importance of every
    # dimension, computed on the training data (no test data is passed).
    topLFs, score = run_single_classifier(X, y, 'gb', -1)
    top_LFs_of_all_ds[a] = topLFs

In [29]:
# Print the Spearman rank correlation of the importance vectors for every
# unordered pair of datasets (each pair once, a < b).
datasets = ['BenchLS', 'britannica', 'HutSSF', 'MTurkSF', 'NNSeval', 'SemEval_2007', 'simpa', 'Wiki-Manual']

for a in range(len(datasets)):
    for b in range(a + 1, len(datasets)):
        # Relies on top_LFs_of_all_ds filled by the previous cell.
        res = stats.spearmanr(top_LFs_of_all_ds[datasets[a]], top_LFs_of_all_ds[datasets[b]])
        print(datasets[a] + ' - ' + datasets[b] + ': ' +  str(res.statistic))

BenchLS - britannica: 0.09527422297408623
BenchLS - HutSSF: 0.08275683422753065
BenchLS - MTurkSF: 0.0506039115514327
BenchLS - NNSeval: 0.13781133334526752
BenchLS - SemEval_2007: 0.08283694607813029
BenchLS - simpa: 0.05598994453154044
BenchLS - Wiki-Manual: 0.15029814476386447
britannica - HutSSF: 0.04346991253273278
britannica - MTurkSF: 0.02312911623051797
britannica - NNSeval: 0.1025417136612242
britannica - SemEval_2007: 0.1398111574441977
britannica - simpa: -0.00031042505497715805
britannica - Wiki-Manual: 0.09305242859664895
HutSSF - MTurkSF: 0.028638641619123252
HutSSF - NNSeval: 0.10813306354848201
HutSSF - SemEval_2007: 0.031059593953880114
HutSSF - simpa: 0.10674640091424398
HutSSF - Wiki-Manual: 0.0814626333624733
MTurkSF - NNSeval: 0.10306168697322096
MTurkSF - SemEval_2007: 0.09918796111681293
MTurkSF - simpa: 0.03798865624695228
MTurkSF - Wiki-Manual: 0.041986485782330614
NNSeval - SemEval_2007: 0.07179203900812234
NNSeval - simpa: 0.13675940876637363
NNSeval - Wiki-M

In [15]:
# Cross-dataset evaluation: fit on the first dataset of each pair and score /
# compute importances on the second.
# `datasets` is currently empty, so the loop body does not execute.
datasets = [] #[['MTurkSF', 'simpa'], ['simpa', 'MTurkSF']]
top_dims_merge = []
scores = []

for ds in datasets:
    # ds[0] provides the training data, ds[1] the test data.
    X, y, KAT = load_data(ds[0], make_categorical_data=False, use1249LFs=True)
    X2, y2, KAT = load_data(ds[1], make_categorical_data=False, use1249LFs=True)
    # n = -1: the full vector of mean importances is returned.
    topLFs, score = run_single_classifier(X, y, 'gb', -1, X2, y2)
    top_dims_merge.append(topLFs)
    scores.append(score)

In [16]:
# Display the importances collected for the dataset pairs.
top_dims_merge

[]

In [17]:
# Display the test scores collected for the dataset pairs.
scores

[]

In [18]:
# TODO: link dimensions to LFs (Verknüpfung von Dimensionen zu LFs)