In [1]:
import pickle
from sklearn.utils import shuffle
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, GradientBoostingRegressor, RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import KFold

In [48]:
def load_data(only_few_dims, categorical, categorical_triple):
    data_merged = []
    labels = []

    for d_s in ['MTurkSF', 'Wiki-Manual','eval']: #,  'britannica', 'ASSET',  'MTurkSF', 'Wiki-Manual',
        simp_path = f"/workspace/datasets/ds_labels/{d_s}_simp_labels.pkl"        
        src_path = f"/workspace/datasets/ds_labels/{d_s}_src_labels.pkl"  

        simp_labels = pickle.load(open(simp_path, "rb"))
        src_labels = pickle.load(open(src_path, "rb")) 

        for entry in simp_labels:
            if categorical:
                new_ent = []
                for e in entry:
                    if e == -1:
                        new_ent.append(0)
                        new_ent.append(0)
                        new_ent.append(1)

                    if e == 0:
                        new_ent.append(1)
                        new_ent.append(0)
                        new_ent.append(0)

                    if e == 1:
                        new_ent.append(0)
                        new_ent.append(1)
                        new_ent.append(0)
                    
                data_merged.append(new_ent)
            else:
                data_merged.append(entry.tolist())

        for entry in src_labels:
            if categorical:
                new_ent = []
                for e in entry:
                    if e == -1:
                        new_ent.append(0)
                        new_ent.append(0)
                        new_ent.append(1)

                    if e == 0:
                        new_ent.append(1)
                        new_ent.append(0)
                        new_ent.append(0)

                    if e == 1:
                        new_ent.append(0)
                        new_ent.append(1)
                        new_ent.append(0)

                data_merged.append(new_ent)
            else:
                data_merged.append(entry.tolist())

        curr_lab = [0] * len(simp_labels) + [1] * len(simp_labels)
        labels = labels + curr_lab

    if only_few_dims > 0:
        dims_to_include = []
        
        # only include dimensions where values are the most different 
        for lf_dim in range(len(simp_labels[0])):
            sum_i_src = 0
            sum_i_simp = 0
            for d_p in range(len(data_merged)):
                if labels[d_p] == 0:
                    sum_i_simp += data_merged[d_p][lf_dim]
                else:
                    sum_i_src += data_merged[d_p][lf_dim]

            dist = abs(sum_i_simp - sum_i_src)

            if len(dims_to_include) < only_few_dims:
                dims_to_include.append((lf_dim, dist))
            else:
                replace_cand_dist = -1
                replace_cand_dim = -1
                # find dim with lowest dist
                for inc_dim in range(len(dims_to_include)):
                    if dims_to_include[inc_dim][1] < dist and (replace_cand_dist == -1 or replace_cand_dist > dims_to_include[inc_dim][1]):
                        replace_cand_dim = inc_dim
                        replace_cand_dist = dims_to_include[inc_dim][1]
                            
                # replace it
                if replace_cand_dim > -1:
                    dims_to_include[replace_cand_dim] = (lf_dim, dist)

        new_data = []
        for d_p in range(len(data_merged)):
            new_dp = []
            all_bases = []
            for inc_dims in dims_to_include:
                if categorical_triple:
                    # check if dim has already been included:
                    base = int(inc_dims[0] / 3)
                    if base not in all_bases:
                        all_bases.append(base)
                        base_dim = 3 * base

                        new_dp.append(data_merged[d_p][base_dim])
                        new_dp.append(data_merged[d_p][base_dim + 1])
                        new_dp.append(data_merged[d_p][base_dim + 2])
                else:
                    new_dp.append(data_merged[d_p][inc_dims[0]])
            new_data.append(new_dp)

        data_merged = new_data                    

    X, y = shuffle(data_merged, labels, random_state=42)
    return X, y

In [55]:
X, y = load_data(0, False, False)

In [50]:
len(X[0])

75

In [51]:
clf_gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.01, random_state=42)
clf_rf = RandomForestClassifier(random_state=42)

clfs = {'gb': clf_gb, 'rf': clf_rf}

In [52]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
for clf in clfs:
    cv_scores = cross_val_score(clfs[clf], X, y, cv=kfold)
    print(clf)
    print(sum(cv_scores)/len(cv_scores))
    print(cv_scores)
    print('___')

gb
0.5245070422535212
[0.51267606 0.52394366 0.52676056 0.52676056 0.48450704 0.53521127
 0.51830986 0.53802817 0.57746479 0.50140845]
___
rf
0.4828169014084508
[0.48169014 0.47042254 0.47605634 0.48732394 0.45915493 0.50140845
 0.47323944 0.48732394 0.52957746 0.46197183]
___


In [6]:
reg_gb = GradientBoostingRegressor(random_state=42)
reg_rf = RandomForestRegressor(random_state=42)
reg_mlp = MLPRegressor(random_state=42)

regs = {'reg_gb': reg_gb, 'reg_rf': reg_rf, 'reg_mlp': reg_mlp}

In [7]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
for reg in regs:
    cv_scores = cross_val_score(regs[reg], X, y, cv=kfold)
    print(reg)
    print(sum(cv_scores)/len(cv_scores))
    print(cv_scores)
    print('___')

reg_gb
0.020509568937202773
[ 0.0119761   0.08064076  0.01812018  0.0270464   0.00789198  0.00457563
  0.01243877  0.0351943  -0.00147565  0.00868721]
___
reg_rf
-0.2880772460199624
[-0.35880896 -0.22891003 -0.28602776 -0.2027018  -0.27265979 -0.28661845
 -0.33914744 -0.30247664 -0.28430462 -0.31911697]
___




reg_mlp
-0.5952573100621439
[-0.70978089 -0.31477472 -0.64717096 -0.58432874 -0.65126289 -0.54711554
 -0.29663461 -0.76220449 -0.81070742 -0.62859284]
___


### Feature Selection

In [14]:
from sklearn.feature_selection import SelectKBest, chi2

In [15]:
X_new = SelectKBest(chi2, k=50).fit_transform(X, y)

In [16]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
for clf in clfs:
    cv_scores = cross_val_score(clfs[clf], X_new, y, cv=kfold)
    print(clf)
    print(sum(cv_scores)/len(cv_scores))
    print(cv_scores)
    print('___')

gb
0.608
[0.59 0.59 0.64 0.58 0.66 0.62 0.62 0.55 0.67 0.56]
___
rf
0.6010000000000001
[0.6  0.58 0.61 0.57 0.65 0.6  0.63 0.55 0.65 0.57]
___


In [59]:
from sklearn.inspection import permutation_importance
import numpy as np

In [63]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

feature_importances = []

for train_index, test_index in kfold.split(X):
    X_train, X_test = np.take(X, train_index, axis=0), np.take(X, test_index, axis=0)
    y_train, y_test = np.take(y, train_index), np.take(y, test_index)

    clf_gb.fit(X_train, y_train)

    perm_importance = permutation_importance(clf_gb, X_test, y_test)
    feature_importances.append(perm_importance.importances_mean)

mean_importance = np.mean(feature_importances, axis=0)

In [68]:
sorted_indices = np.argsort(mean_importance)[::-1]
top_50_indices = sorted_indices[:50]

In [69]:
reduced_X = []

for dp in X:
    new_dp = []
    for dim in top_50_indices:
        new_dp.append(dp[dim])
    reduced_X.append(new_dp)

3550
50


In [None]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
for clf in clfs:
    cv_scores = cross_val_score(clfs[clf], X_reduced, y, cv=kfold)
    print(clf)
    print(sum(cv_scores)/len(cv_scores))
    print(cv_scores)
    print('___')