In [1]:
import pickle
from sklearn.utils import shuffle
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, GradientBoostingRegressor, RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import KFold

In [2]:
def _most_separating_dims(rows, labels, k):
    """Pick up to `k` feature indices whose per-class totals differ the most.

    For every feature index the values of all rows labelled 0 and of all
    other rows are summed separately; the score of the feature is the
    absolute difference of the two totals.

    The returned order is "slot order", not sorted: the first `k` features
    fill the slots, and every later feature overwrites the currently
    lowest-scoring slot (the first one on ties) if, and only if, it scores
    strictly higher.

    NOTE(review): the score uses raw sums, not means, so it is sensitive to
    class imbalance — confirm this is intended.
    """
    n_dims = len(rows[0])
    slots = []  # (feature index, score) pairs
    for dim in range(n_dims):
        # Plain accumulation loops on purpose: the summation order is part of
        # the result for floats, and built-in sum() may round differently.
        simp_total = 0
        src_total = 0
        for row, label in zip(rows, labels):
            if label == 0:
                simp_total += row[dim]
            else:
                src_total += row[dim]
        dist = abs(simp_total - src_total)

        if len(slots) < k:
            slots.append((dim, dist))
            continue

        # min() returns the first minimal element, so ties go to the
        # earliest slot.
        weakest = min(range(len(slots)), key=lambda i: slots[i][1])
        if slots[weakest][1] < dist:
            slots[weakest] = (dim, dist)

    return [dim for dim, _ in slots]


def load_data(only_few_dims, datasets=('MTurkSF', 'Wiki-Manual'),
              label_dir="/workspace/datasets/ds_labels"):
    """Load the pickled label vectors and build a shuffled two-class data set.

    For every name in `datasets` two pickle files are read from `label_dir`:
    `<name>_simp_labels.pkl` (rows get class label 0) and
    `<name>_src_labels.pkl` (rows get class label 1). Each pickled entry must
    provide `.tolist()`.

    Parameters
    ----------
    only_few_dims : int
        If greater than 0, keep only (at most) this many feature dimensions,
        chosen as those whose per-class sums differ the most. If 0 or
        negative, keep every dimension.
    datasets : iterable of str, optional
        Data set name prefixes to load. Defaults to the two sets the notebook
        used; 'britannica', 'ASSET' and 'eval' were listed but disabled in
        the original cell.
    label_dir : str, optional
        Directory that contains the pickle files.

    Returns
    -------
    (X, y)
        Feature rows and their 0/1 labels, shuffled together with
        `sklearn.utils.shuffle(..., random_state=42)`.

    Notes
    -----
    The dimension selection looks at the labels of *all* rows, i.e. it runs
    before any train/test split. NOTE(review): scores from a later
    cross-validation on the reduced data are therefore optimistically biased.
    """
    data_merged = []
    labels = []

    for d_s in datasets:
        simp_path = f"{label_dir}/{d_s}_simp_labels.pkl"
        src_path = f"{label_dir}/{d_s}_src_labels.pkl"

        # SECURITY: pickle.load can execute arbitrary code — only point this
        # at files you trust.
        with open(simp_path, "rb") as simp_file:
            simp_labels = pickle.load(simp_file)
        with open(src_path, "rb") as src_file:
            src_labels = pickle.load(src_file)

        # Feature-vector length of each file, as a quick sanity check.
        print(len(simp_labels[0]))
        print(len(src_labels[0]))

        data_merged.extend(entry.tolist() for entry in simp_labels)
        data_merged.extend(entry.tolist() for entry in src_labels)

        # One label per appended row; the source rows must be counted with
        # len(src_labels), otherwise rows and labels drift apart whenever the
        # two files have different lengths.
        labels.extend([0] * len(simp_labels) + [1] * len(src_labels))

    if only_few_dims > 0 and data_merged:
        kept_dims = _most_separating_dims(data_merged, labels, only_few_dims)
        data_merged = [[row[dim] for dim in kept_dims] for row in data_merged]

    print('----')
    print(len(data_merged))
    print(len(labels))
    X, y = shuffle(data_merged, labels, random_state=42)
    return X, y

In [3]:
# Build the shuffled data set, restricted to the 10 feature dimensions that
# load_data selects as differing most between the two classes.
X, y = load_data(only_few_dims=10)

328
328
328
328
----
2550
2550


In [4]:
# Classifiers to compare, keyed by the short name printed in the CV report.
# Every estimator is seeded so that repeated runs give the same scores.
clfs = {
    'gb': GradientBoostingClassifier(n_estimators=100, learning_rate=0.01, random_state=42),
    'rf': RandomForestClassifier(random_state=42),
}

# Keep direct handles to the individual estimators as well.
clf_gb, clf_rf = clfs['gb'], clfs['rf']

In [5]:
# 10-fold cross-validation with shuffled, seeded splits. No `scoring` is
# passed, so each estimator's default scorer is used (mean accuracy for
# scikit-learn classifiers, per the scikit-learn documentation).
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

for clf, estimator in clfs.items():
    cv_scores = cross_val_score(estimator, X, y, cv=kfold)
    # Report: name, mean over folds, per-fold scores, separator.
    print(clf, sum(cv_scores) / len(cv_scores), cv_scores, '___', sep='\n')

gb
0.5305882352941176
[0.5372549  0.51764706 0.55686275 0.54509804 0.54509804 0.55686275
 0.49411765 0.50588235 0.5372549  0.50980392]
___
rf
0.512156862745098
[0.50980392 0.51372549 0.51372549 0.51372549 0.53333333 0.52941176
 0.49019608 0.49803922 0.51372549 0.50588235]
___


In [6]:
# Regressors to compare, keyed by the short name printed in the CV report.
# Every estimator is seeded so that repeated runs give the same scores.
regs = {
    'reg_gb': GradientBoostingRegressor(random_state=42),
    'reg_rf': RandomForestRegressor(random_state=42),
    'reg_mlp': MLPRegressor(random_state=42),
}

# Keep direct handles to the individual estimators as well.
reg_gb, reg_rf, reg_mlp = regs['reg_gb'], regs['reg_rf'], regs['reg_mlp']

In [7]:
# Same 10-fold, shuffled, seeded split as for the classifiers. No `scoring`
# is passed, so each estimator's default scorer is used (R^2 for scikit-learn
# regressors, per the scikit-learn documentation).
# NOTE(review): y holds the 0/1 class labels, so these values are R^2 on a
# binary target and are not directly comparable to classifier accuracy.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

for reg, estimator in regs.items():
    cv_scores = cross_val_score(estimator, X, y, cv=kfold)
    # Report: name, mean over folds, per-fold scores, separator.
    print(reg, sum(cv_scores) / len(cv_scores), cv_scores, '___', sep='\n')

reg_gb
-0.010864870567175755
[-0.01472845 -0.01104293  0.01193676 -0.00582593 -0.02000232 -0.00553896
 -0.03742129 -0.00481826  0.00069379 -0.02190112]
___
reg_rf
-0.04595905274739347
[-0.03338488 -0.03315407 -0.04928691 -0.04124212 -0.0269516  -0.0342978
 -0.0618418  -0.04096703 -0.06016164 -0.07830269]
___
reg_mlp
-0.019197059935635453
[-0.01190631 -0.05047223 -0.00388514 -0.00312347 -0.01972183 -0.00899656
 -0.03781366 -0.01562139 -0.0057205  -0.0347095 ]
___
