### Classifiers and Regressors

#### Classifiers and Regressors help cells

In [1]:
import pickle
from sklearn.utils import shuffle
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, GradientBoostingRegressor, RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import KFold
from sklearn.inspection import permutation_importance
import numpy as np
import pandas as pd

In [2]:
# datasets: list of datasets or single dataset, depending on if multiple_datasets is set to True
# make_categorical_data: transform numeric -1, 0, 1 data to categorical data, for each dimension 2 new ones, 
#       one with either SIMP/NOT_SIMP and one for ABSTAIN
# use1249LFs: use the fully unpruned dataset
def load_data(datasets, make_categorical_data, use1249LFs=False, multiple_datasets=False):
    data_merged = []
    labels = []

    if not multiple_datasets:
        datasets = [datasets]

    for d_s in datasets:    
        if use1249LFs:
            simp_path = f"/workspace/datasets/ds_labels/{d_s}-1249_simp_labels.pkl"        
            src_path = f"/workspace/datasets/ds_labels/{d_s}-1249_src_labels.pkl"  
        else:
            simp_path = f"/workspace/datasets/ds_labels/{d_s}_simp_labels.pkl"        
            src_path = f"/workspace/datasets/ds_labels/{d_s}_src_labels.pkl"  

        simp_labels = pickle.load(open(simp_path, "rb"))
        src_labels = pickle.load(open(src_path, "rb")) 

        for entry in simp_labels:
            data_merged.append([number for number in entry.tolist()])

        for entry in src_labels:
            data_merged.append([number for number in entry.tolist()])

        curr_lab = [0] * len(simp_labels) + [1] * len(simp_labels)
        labels = labels + curr_lab

    # load textual data
    textual_dataset = pd.read_pickle('/workspace/datasets/' + d_s + '/' + d_s + '.pkl')
    
    dups_simp = textual_dataset['simp'].duplicated()
    dups_src = textual_dataset['src'].duplicated()

    num_left_simp = len(dups_simp) - sum(dups_simp)
    num_left_src= len(dups_src) - sum(dups_src)
    
    num_left = min(num_left_simp, num_left_src)


    data_merged_simp_dedup = []
    data_merged_src_dedup = []

    labels_dedup = []

    for index, val in dups_simp.items():
        if val == False:
            data_merged_simp_dedup.append(data_merged[index])

    data_merged_simp_dedup = shuffle(data_merged_simp_dedup, random_state=42)[:num_left]

    for index, val in dups_src.items():
        if val == False:
            data_merged_src_dedup.append(data_merged[len(dups_src) + index])

    data_merged_src_dedup = shuffle(data_merged_src_dedup, random_state=42)[:num_left]

    labels_dedup = [0] * num_left + [1] * num_left

    X, y = shuffle(data_merged_simp_dedup + data_merged_src_dedup, labels_dedup, random_state=42)

    return X, y

In [3]:
def load_ARTS_data(dims):
    X = []
    y = []

    if dims == 94:
        path_X = f"/workspace/datasets/ds_labels/arts94-1249_labels.pkl"        
        path_y = f"/workspace/datasets/ds_labels/Gold_Scores.pkl"        
    if dims == 300:
        path_X = f"/workspace/datasets/ds_labels/arts300-1249_labels.pkl"        
        path_y = f"/workspace/datasets/ds_labels/gpt300_Scores.pkl"        

    X_labels = pickle.load(open(path_X, "rb"))
    y_labels = pickle.load(open(path_y, "rb"))

    for entry in X_labels:
        X.append([number for number in entry.tolist()])
    
    for i in range(len(X)):
        y.append(y_labels[i][0])

    return X, y

In [4]:
import random

In [9]:
# datasets: list of datasets or single dataset, depending on if multiple_datasets is set to True
# make_categorical_data: transform numeric -1, 0, 1 data to categorical data, for each dimension 2 new ones, 
#       one with either SIMP/NOT_SIMP and one for ABSTAIN
# use1249LFs: use the fully unpruned dataset
def load_200_data(datasets, make_categorical_data, use1249LFs=False, multiple_datasets=False):
    if not multiple_datasets:
        datasets = [datasets]

    data_merged_dedup = []
    labels_dedup = []


    for d_s in datasets:    
        data_merged_simp_dedup = []
        data_merged_src_dedup = []

        data_merged_simp = []
        data_merged_src = []
        
        if use1249LFs:
            simp_path = f"/workspace/datasets/ds_labels/{d_s}-1249_simp_labels.pkl"        
            src_path = f"/workspace/datasets/ds_labels/{d_s}-1249_src_labels.pkl"  
        else:
            simp_path = f"/workspace/datasets/ds_labels/{d_s}_simp_labels.pkl"        
            src_path = f"/workspace/datasets/ds_labels/{d_s}_src_labels.pkl"  

        random.seed(42)
        simp_labels = pickle.load(open(simp_path, "rb"))
        src_labels = pickle.load(open(src_path, "rb"))

        for entry in simp_labels:
            data_merged_simp.append([number for number in entry.tolist()])

        for entry in src_labels:
            data_merged_src.append([number for number in entry.tolist()])

        labels_dedup += [0] * 200 + [1] * 200

        # load textual data
        textual_dataset = pd.read_pickle('/workspace/datasets/' + d_s + '/' + d_s + '.pkl')
        
        dups_simp = textual_dataset['simp'].duplicated()
        dups_src = textual_dataset['src'].duplicated()

        for index, val in dups_simp.items():
            if val == False:
                data_merged_simp_dedup.append(data_merged_simp[index])

        for index, val in dups_src.items():
            if val == False:
                data_merged_src_dedup.append(data_merged_src[index])

        data_merged_dedup += shuffle(data_merged_simp_dedup, random_state=42)[:200] + shuffle(data_merged_src_dedup, random_state=42)[:200]

    X, y = shuffle(data_merged_dedup, labels_dedup, random_state=42)

    return X, y

In [10]:
clf_gb = GradientBoostingClassifier(random_state=42)
clf_rf = RandomForestClassifier(random_state=42)

clfs = {'gb': clf_gb, 'rf': clf_rf}

reg_gb = GradientBoostingRegressor(random_state=42)
reg_rf = RandomForestRegressor(random_state=42)
reg_mlp = MLPRegressor(random_state=42)

#regs = {'reg_gb': reg_gb, 'reg_rf': reg_rf, 'reg_mlp': reg_mlp}
regs = {'reg_rf': reg_rf}

In [11]:
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score

In [23]:
def reg_scoring(y_true, y_pred):
    thres_y_pred = np.floor(y_pred + (1-0.5812))

    # Calculate the mean of the true values
    #mean_y_true = np.mean(y_true)
 
    # Calculate the sum of squares of residuals and total sum of squares
    #ss_res = np.sum((y_true - thres_y_pred) ** 2)
    #ss_tot = np.sum((y_true - mean_y_true) ** 2)
 
    # Calculate R²
    #r2 = 1 - (ss_res / ss_tot)
 
    ac = accuracy_score(y_true, thres_y_pred)

    return ac

def run_reg(run, X, y, run_type, n=10):
    if run:
        reg_score = make_scorer(reg_scoring)

        output = ''
        kfold = KFold(n_splits=n, shuffle=True, random_state=42)
        for clf in regs:
            #cv_scores = cross_val_score(regs[clf], X, y, cv=kfold, scoring='neg_mean_absolute_error')

            cv_scores = cross_val_score(regs[clf], X, y, cv=kfold, scoring=reg_score)

            output += clf + ' (' + run_type + ') : '
            output += str(sum(cv_scores)/len(cv_scores)) + '\n'
            output += str(cv_scores) + '\n___\n'
        return output, sum(cv_scores)/len(cv_scores)
    return '', 0

#### Classifiers and Regressors run

In [13]:
from sklearn.model_selection import cross_val_predict


In [30]:
def run():
    #datasets = ['ASSET', 'AutoMeTS', 'BenchLS', 'britannica', 'EW-SEW-Turk', 'HutSSF', 
    #            'metaeval', 'MTurkSF', 'NNSeval', 'OneStopEnglish', 'QuestEval', 'SemEval_2007',
    #            'SimpEval_22', 'simpa', 'TurkCorpus']#['BenchLS', 'britannica', 'MTurkSF', 'NNSeval', 'SemEval_2007'] #'britannica', 'ASSET', 'MTurkSF', 'Wiki-Manual','eval'

    datasets = ['simpa'] # 'TurkCorpus'

    n = 10

    #X, y = load_data(datasets, make_categorical_data=False, use1249LFs=True, multiple_datasets=True)
    X, y = load_data(datasets, make_categorical_data=False, use1249LFs=True, multiple_datasets=True)

    print(len(X))

    avg_scores = False
    if avg_scores:
        scores = cross_val_predict(regs['reg_rf'], X, y, cv=10)
        print(len(scores))
        print(sum(scores)/len(scores))
        
        cnts = 0
        cntc = 0
        for i in range(len(y)):
            if y[i] == 0:
                cnts += scores[i]
            else:
                cntc += scores[i]

        print(cnts/(len(scores)/2))
        print(cntc/(len(scores)/2))

    else:
        print(run_reg(True, X, y, 'vanilla', n)[0])

run()

13200


KeyboardInterrupt: 

In [16]:
# train one model and pkl it
from sklearn.metrics import mean_squared_error


datasets = ['ASSET', 'AutoMeTS', 'BenchLS', 'britannica', 'EW-SEW-Turk', 'HutSSF', 
                'metaeval', 'MTurkSF', 'NNSeval', 'OneStopEnglish', 'QuestEval', 'SemEval_2007',
                'SimpEval_22', 'simpa', 'TurkCorpus']#['BenchLS', 'britannica', 'MTurkSF', 'NNSeval', 'SemEval_2007'] #'britannica', 'ASSET', 'MTurkSF', 'Wiki-Manual','eval'

X, y = load_200_data(datasets, make_categorical_data=False, use1249LFs=True, multiple_datasets=True)



splitperc = int(len(X)*0.8)

r = reg_rf.fit(X[:splitperc], y[:splitperc])

In [14]:
import pickle as pkl
from sklearn.metrics import mean_absolute_error, r2_score

In [2]:
with open("/workspace/datasets/model_200.pkl", "wb") as output_file:
   pkl.dump(r, output_file)

NameError: name 'r' is not defined

In [None]:
pred = r.predict(X[splitperc:])

In [None]:
msw = reg_scoring(y[splitperc:], pred)
print(msw)

0.795


In [None]:
import pickle as pkl

In [15]:
with open("/workspace/datasets/ds_labels/gpt300_Scores.pkl", "rb") as input_file:
    datax = pkl.load(input_file)

    print(datax)



In [16]:
def train_predict_compare(model, predict, gold_type):
    reg_model = None

    # versions of trained regression models
    if model == 'general':
        with open("/workspace/datasets/model_200.pkl", "rb") as input_file:
            reg_model = pkl.load(input_file)

    if model == 'arts94':
        X_ta, y_ta = load_ARTS_data(94)
        reg_model = reg_rf.fit(X_ta, y_ta)
    if model == 'arts300':
        X_ta, y_ta = load_ARTS_data(300)
        reg_model = reg_rf.fit(X_ta, y_ta)

    if model == 'ta_children':
        X_ta, y_ta = load_200_data(['britannica', 'TurkCorpus'], make_categorical_data=False, use1249LFs=True, multiple_datasets=True)
        reg_model = reg_rf.fit(X_ta, y_ta)
    if model == 'ta_language':
        X_ta, y_ta = load_200_data(['NNSeval', 'OneStopEnglish', 'simpa', 'TurkCorpus'], make_categorical_data=False, use1249LFs=True, multiple_datasets=True)
        reg_model = reg_rf.fit(X_ta, y_ta)
    if model == 'd_medical':
        X_ta, y_ta = load_200_data(['AutoMeTS', 'MTurkSF'], make_categorical_data=False, use1249LFs=True, multiple_datasets=True)
        reg_model = reg_rf.fit(X_ta, y_ta)
    if model == 'd_news':
        X_ta, y_ta = load_200_data(['HutSSF', 'OneStopEnglish'], make_categorical_data=False, use1249LFs=True, multiple_datasets=True)
        reg_model = reg_rf.fit(X_ta, y_ta)
    if model == 'd_encyclopedia':
        X_ta, y_ta = load_200_data(['britannica', 'EW-SEW-Turk', 'metaeval', 'NNSeval', 'SimpEval_22', 'TurkCorpus'], make_categorical_data=False, use1249LFs=True, multiple_datasets=True)
        reg_model = reg_rf.fit(X_ta, y_ta)


    # versions of predicted data
    if predict == 94:
        arts_X, unused_y = load_ARTS_data(94)
        pred_y = reg_model.predict(arts_X)
    if predict == 300:
        arts_X, unused_y = load_ARTS_data(300)
        pred_y = reg_model.predict(arts_X)


    #versions of comparision data
    if gold_type == 'G5':
        unused_X, gold_y = load_ARTS_data(94)
    if gold_type == 'ChatGPT':
        unused_X, gold_y = load_ARTS_data(300)
        
    if gold_type == 'rand':
        gold_y = []
        random.seed(42)
        for i in range(len(pred_y)):
            gold_y.append(random.random())
    if gold_type == 'zero5':
        gold_y = []
        for i in range(len(pred_y)):
            gold_y.append(0.5)
    if gold_type == 'stable':
        gold_y = []
        for i in range(len(pred_y)):
            gold_y.append(0.5526)

    return pred_y, gold_y

In [4]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

In [21]:
model = 'd_news'
predict = 94
gold_type = 'G5'
new_g = [0.5] * 94

gold_y, pred_y = train_predict_compare(model, predict, gold_type)

msw = mean_absolute_error(gold_y, new_g)
print(msw)

print(mean_squared_error(gold_y, new_g))

print(r2_score(gold_y, new_g))



0.27544970204145736
0.09349789978328642
-1.17266680391845
