### RQ 3
#### Read in ARTS datasets

In [None]:
import pickle
import pandas as pd
import numpy as np
import math
from scipy import stats
from matcher import get_mapping_of_LF_and_dims
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, max_error

In [None]:
def load_ARTS_data(dims):
    """Load one pickled ARTS vector file and return its first three entries.

    Args:
        dims: Identifier interpolated into the file name
            ``/workspace/vectors/ARTS_<dims>_vectors.pkl``.

    Returns:
        A 3-tuple with elements 0, 1 and 2 of the unpickled object. Callers in
        this notebook bind them as (X, scores, embeddings).

    Raises:
        OSError: If the file cannot be opened.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    # The context manager closes the handle even if unpickling fails.
    with open(f'/workspace/vectors/ARTS_{dims}_vectors.pkl', "rb") as fh:
        data = pickle.load(fh)

    return data[0], data[1], data[2]

In [None]:
# Load the three pickled ARTS files (94, 300 and 3000 select the file name)
# and unpack each into its X rows, scores and embeddings.
arts94_X, arts94_scores, arts94_embeddings = load_ARTS_data(94)
arts300_X, arts300_scores, arts300_embeddings = load_ARTS_data(300)
arts3000_X, arts3000_scores, arts3000_embeddings = load_ARTS_data(3000)

#### Prepare structures describing how parametrizations, interpretations, features and categories relate to each other - only the dimension-to-names and name-to-feature mappings are required here

In [None]:
# Unpack the five mappings returned by the matcher helper. Only dims_to_names
# and name_to_feature are used below; the *_UNUSED names just absorb the
# remaining return values.
dims_to_names, name_to_dims_UNUSED, name_to_interpretation_UNUSED, name_to_category_UNUSED, name_to_feature = get_mapping_of_LF_and_dims()

In [None]:
# Map the first name of every dimension to 'simp' or 'complex'.
names_to_comp_simp = {}

df_cs = pd.DataFrame(arts3000_X)

# Dimensions that can be classified neither from the values seen in ARTS3000
# nor from their name; these are assigned by hand.
manual_simp_dims = {228, 229, 230, *range(555, 575), 990}
manual_complex_dims = {985, 991, 992, 993, 994, 995, 1001, 1002, 1004, 1005}

for dim in range(len(arts3000_X[0])):
    # value_counts() is indexed by the distinct values in the column, so
    # `0 in cnts` asks whether the value 0 occurs at least once.
    cnts = df_cs[dim].value_counts()

    name = dims_to_names[dim][0]

    # Order matters: observed values win over name hints, and 'NOT_SIMPLE'
    # must be tested before 'SIMPLE' because it contains it as a substring.
    if 0 in cnts:
        names_to_comp_simp[name] = 'simp'
    elif 1 in cnts:
        names_to_comp_simp[name] = 'complex'
    elif 'NOT_SIMPLE' in name:
        names_to_comp_simp[name] = 'complex'
    elif 'SIMPLE' in name:
        names_to_comp_simp[name] = 'simp'
    elif 'label=1' in name:
        names_to_comp_simp[name] = 'complex'
    elif 'label=0' in name:
        names_to_comp_simp[name] = 'simp'
    elif dim in manual_simp_dims:
        names_to_comp_simp[name] = 'simp'
    elif dim in manual_complex_dims:
        names_to_comp_simp[name] = 'complex'
    else:
        # Unclassified dimension: report it so it can be added to one of the
        # manual sets above. No entry is written for it.
        print(dim)
        print(name)

# Declared here; nothing in this cell fills them.
cnt_feat, cnt_cat = {}, {}

# name -> feature, split by the simp/complex label of the name.
comp_name_to_feature, simp_name_to_feature = {}, {}

for name, feat in name_to_feature.items():
    # Everything not labelled 'simp' goes to the complex side.
    bucket = simp_name_to_feature if names_to_comp_simp[name] == 'simp' else comp_name_to_feature
    bucket[name] = feat


#### Remap values in X so 1 indicates a characteristic has been encountered and 0 means abstain

In [None]:
def remap(ds):
    """Collapse labels so that 1 means "fired" and 0 means "abstained".

    Input labels are 0 (simple), 1 (complex) and -1 (abstain). Both 0 and 1
    become 1; -1 becomes 0.

    Args:
        ds: Iterable of rows, each an iterable of labels in {0, 1, -1}.

    Returns:
        A list of lists with the same shape as ``ds`` holding only 0 and 1.

    Raises:
        ValueError: If a label is not 0, 1 or -1. Skipping such a value would
            shorten the row and shift every later column to the wrong
            dimension, so it is reported instead.
    """
    arts_X_remapped = []

    for row_idx, entry in enumerate(ds):
        vals = []

        for col_idx, n in enumerate(entry):
            if n == 0 or n == 1:  # simple or complex: characteristic seen
                vals.append(1)
            elif n == -1:  # abstain
                vals.append(0)
            else:
                raise ValueError(
                    f"unexpected label {n!r} at row {row_idx}, column {col_idx}; "
                    "expected 0, 1 or -1"
                )

        arts_X_remapped.append(vals)

    return arts_X_remapped

In [None]:
# Apply remap() to the X rows of all three ARTS datasets.
arts3000_X_remapped = remap(arts3000_X)
arts300_X_remapped = remap(arts300_X)
arts94_X_remapped = remap(arts94_X)

#### Create structures containing summed up values for dimensions per representation

In [None]:
# Column index -> name lookups for the 'simp' and 'complex' frames.
# Both start empty and are filled as a side effect of split_ds_by_sc().
dim_to_name_simp = {}
dim_to_name_comp = {}

In [None]:
def split_ds_by_sc(ds):
    """Split a remapped dataset into a 'simp' frame and a 'complex' frame.

    Every input dimension is resolved to its first name in ``dims_to_names``
    and routed, according to ``names_to_comp_simp``, to one column of either
    output frame. Dimensions that share a name share a column.

    Side effect: fills the module-level ``dim_to_name_simp`` and
    ``dim_to_name_comp`` dicts with ``column index -> name``.

    Args:
        ds: Sequence of rows; the dimensions are taken from the length of the
            first row. An empty ``ds`` yields two empty frames.

    Returns:
        Tuple ``(all_para_s, all_para_c)`` of DataFrames with one row per
        entry of ``ds``. Their widths are ``len(simp_name_to_feature)`` and
        ``len(comp_name_to_feature)``.
    """
    n_dims = len(ds[0]) if len(ds) else 0

    para_to_dim_simp = {}
    para_to_dim_comp = {}
    # Per input dimension: (goes to simp frame?, target column). Resolved once
    # here so the fill loop below does no name lookups per cell.
    targets = []

    for i in range(n_dims):
        raw_name = dims_to_names[i][0]
        # names_to_comp_simp is keyed by the raw name, while columns are keyed
        # by the stripped name; both loops must use the same key for each.
        sc = names_to_comp_simp[raw_name]
        name = raw_name.strip()

        if sc == 'simp':
            if name not in para_to_dim_simp:
                dim_to_name_simp[len(para_to_dim_simp)] = name
                para_to_dim_simp[name] = len(para_to_dim_simp)
            targets.append((True, para_to_dim_simp[name]))
        else:
            if name not in para_to_dim_comp:
                dim_to_name_comp[len(para_to_dim_comp)] = name
                para_to_dim_comp[name] = len(para_to_dim_comp)
            targets.append((False, para_to_dim_comp[name]))

    n_simp = len(simp_name_to_feature)
    n_comp = len(comp_name_to_feature)
    all_para_s = [[0] * n_simp for _ in range(len(ds))]
    all_para_c = [[0] * n_comp for _ in range(len(ds))]

    # fill structures
    # NOTE(review): when several dimensions share a name, the last one wins
    # (plain assignment). The notebook heading speaks of "summed up values" -
    # confirm whether accumulation was intended.
    for row_s, row_c, entry in zip(all_para_s, all_para_c, ds):
        for j, val in enumerate(entry):
            is_simp, col = targets[j]
            if is_simp:
                row_s[col] = val
            else:
                row_c[col] = val

    all_para_s = pd.DataFrame(all_para_s)
    all_para_c = pd.DataFrame(all_para_c)

    return all_para_s, all_para_c

In [None]:
# Split each remapped dataset into its simp (s) and complex (c) DataFrame.
all_para_s_94, all_para_c_94 = split_ds_by_sc(arts94_X_remapped)
all_para_s_300, all_para_c_300 = split_ds_by_sc(arts300_X_remapped)
all_para_s_3000, all_para_c_3000 = split_ds_by_sc(arts3000_X_remapped)

#### Identify dimensions to be kept (based on "high" correlation of ARTS score and different dimensions from vector representation)

In [None]:
def calc_cors(arts_ds_s, arts_ds_c, scores_ds):
    """Correlate every column of both frames with the scores.

    Args:
        arts_ds_s: DataFrame of simp columns.
        arts_ds_c: DataFrame of complex columns.
        scores_ds: Scores, one per row of the frames.

    Returns:
        Tuple ``(final_cor_s, final_cor_c)``. Each is a list of
        ``(pearson_r, column_label)`` pairs in column order; a NaN
        coefficient is stored as 0.
    """
    def column_correlations(frame):
        pairs = []
        for col in frame.columns:
            r_value = stats.pearsonr(frame[col], scores_ds)[0]
            # pearsonr gives NaN when it cannot compute a coefficient
            # (e.g. a constant column); treat that as "no correlation".
            pairs.append((0, col) if math.isnan(r_value) else (r_value, col))
        return pairs

    return column_correlations(arts_ds_s), column_correlations(arts_ds_c)

In [None]:
# Correlations of every simp / complex column of ARTS3000 with its scores.
final_cor_s_reduce_3000, final_cor_c_reduce_3000 = calc_cors(all_para_s_3000, all_para_c_3000, arts3000_scores)

# Keep simp columns with a correlation of at most -0.25 and complex columns
# with a correlation of at least 0.25. The pairs are unpacked instead of being
# bound to a loop variable called `tuple`, which would shadow the builtin for
# every later cell.
kept_dim_s = [dim for cor, dim in final_cor_s_reduce_3000 if cor <= -0.25]
kept_names_s = [dim_to_name_simp[dim] for dim in kept_dim_s]

kept_dim_c = [dim for cor, dim in final_cor_c_reduce_3000 if cor >= 0.25]
kept_names_c = [dim_to_name_comp[dim] for dim in kept_dim_c]

# All kept names: simp ones first, then complex ones.
kept_names = kept_names_s + kept_names_c

#### Generate BATS vectors of ARTS datasets (to only contain the kept dimensions) 

In [None]:
def reduce_to_kept_dims(cur_s, cur_c):
    """Build one vector per row from the kept simp and complex columns.

    Reads the module-level ``kept_dim_s`` and ``kept_dim_c`` lists.

    Args:
        cur_s: DataFrame of simp columns.
        cur_c: DataFrame of complex columns; must contain every index label
            of ``cur_s``.

    Returns:
        A list with one list per row of ``cur_s``: the values of the kept simp
        columns followed by the values of the kept complex columns, each in
        the column order of its frame.
    """
    # Resolve the kept columns once instead of testing membership per cell.
    wanted_s = set(kept_dim_s)
    wanted_c = set(kept_dim_c)
    s_cols = [col for col in cur_s.columns if col in wanted_s]
    c_cols = [col for col in cur_c.columns if col in wanted_c]

    s_vals = cur_s.loc[:, s_cols].to_numpy()
    # Rows of cur_c are looked up by the index labels of cur_s.
    c_vals = cur_c.loc[cur_s.index, c_cols].to_numpy()

    return [list(s_row) + list(c_row) for s_row, c_row in zip(s_vals, c_vals)]

In [None]:
# Reduce all three datasets to the columns selected in kept_dim_s / kept_dim_c.
kept_94 = reduce_to_kept_dims(all_para_s_94, all_para_c_94)
kept_300 = reduce_to_kept_dims(all_para_s_300, all_para_c_300)
kept_3000 = reduce_to_kept_dims(all_para_s_3000, all_para_c_3000)

#### Regression

In [None]:
# Two regressors with a fixed seed (random_state=42).
reg_gb = GradientBoostingRegressor(random_state=42)
reg_rf = RandomForestRegressor(random_state=42)

# Label -> regressor instance.
regs = {'reg_gb': reg_gb, 'reg_rf': reg_rf}

In [None]:
# settings
training_dim = 3000  # dataset to train on: 94, 300 or 3000
pred_dim = 300  # dataset to predict on: 94, 300 or 3000
vector = 'GPT'  # 'BATS' or 'GPT'
regressor = reg_gb  # reg_gb OR reg_rf

# (vector kind, dataset) -> (features, scores)
train_sets = {
    ('BATS', 94): (kept_94, arts94_scores),
    ('BATS', 300): (kept_300, arts300_scores),
    ('BATS', 3000): (kept_3000, arts3000_scores),
    ('GPT', 94): (arts94_embeddings, arts94_scores),
    ('GPT', 300): (arts300_embeddings, arts300_scores),
    ('GPT', 3000): (arts3000_embeddings, arts3000_scores),
}

# Fail loudly on an unsupported setting. Otherwise `r` would stay undefined,
# or keep the model fitted by an earlier run of this cell.
if (vector, training_dim) not in train_sets:
    raise ValueError(f"unsupported vector/training_dim combination: {vector!r}, {training_dim!r}")

# train regressor
train_X, train_y = train_sets[(vector, training_dim)]
r = regressor.fit(train_X, train_y)

In [None]:
# predict scores
# (vector kind, dataset) -> (features, reference scores)
eval_sets = {
    ('BATS', 94): (kept_94, arts94_scores),
    ('BATS', 300): (kept_300, arts300_scores),
    ('BATS', 3000): (kept_3000, arts3000_scores),
    ('GPT', 94): (arts94_embeddings, arts94_scores),
    ('GPT', 300): (arts300_embeddings, arts300_scores),
    ('GPT', 3000): (arts3000_embeddings, arts3000_scores),
}

# Fail loudly on an unsupported setting. Otherwise `pred` / `comp` left over
# from an earlier run of this cell would be evaluated.
if (vector, pred_dim) not in eval_sets:
    raise ValueError(f"unsupported vector/pred_dim combination: {vector!r}, {pred_dim!r}")

pred_X, comp = eval_sets[(vector, pred_dim)]
pred = r.predict(pred_X)

# evaluation measures: MSE and R2
print(round(mean_squared_error(comp, pred), 3))
print(round(r2_score(comp, pred), 3))