In [1]:
import numpy as np
from sklearn.linear_model import LogisticRegression, LinearRegression, RidgeClassifierCV, RidgeCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.model_selection import cross_val_score
import pandas as pd
from collections import Counter
from scipy.stats import wilcoxon
from IPython.core.debugger import set_trace
import os
import pdb

# Seed NumPy's global random number generator once, up front, using a named
# constant rather than a bare literal.
SEED = 97
np.random.seed(SEED)

In [2]:
# Names of the target arrays to predict, and the scoring metric used for each
# target (the two lists are index-aligned: metrics[k] scores target k).
all_features_to_predict = ["spacy_semantic_vecs"]
metrics = ["r2"]

# Names of the input arrays: the BERT layer-12 embeddings first, followed by
# sets 0-4 of each of the three "contrege" families, in family order.
all_input_features = ["incremental_bert_embeddings_layer12"] + [
    "{}_set_{}".format(family, set_idx)
    for family in ("contrege_comp", "contrege_incomp", "incontrege")
    for set_idx in range(5)
]

In [3]:
# Load every input array and every target array from "features/<name>.npy".
# all_x[k] corresponds to all_input_features[k] and all_y[k] to
# all_features_to_predict[k]; later cells rely on this positional alignment.
all_x = [
    np.load(os.path.join("features", name + ".npy"))
    for name in all_input_features
]
all_y = [
    np.load(os.path.join("features", name + ".npy"))
    for name in all_features_to_predict
]

In [4]:
# For every (input, target) pair, fit a RidgeCV model (10 log-spaced alphas
# between 1e-10 and 1e10) under 10-fold cross-validation and keep the per-fold
# scores in all_scores under the key "<input name>_<target name>".
all_scores = {}
for feat_idx, feat_name in enumerate(all_input_features):
    print("Predicting using {}".format(feat_name))
    for target_idx, target_name in enumerate(all_features_to_predict):
        # Reset the global NumPy RNG before each fit so every pair starts from
        # the same random state.
        np.random.seed(97)
        # The metric is looked up by target position, so `metrics` must have
        # one entry per target.
        metric = metrics[target_idx]
        ridge = RidgeCV(alphas=np.logspace(-10, 10, num=10), scoring=metric)
        # No shuffle argument is passed, so folds use KFold's default splitting.
        folds = KFold(n_splits=10)
        fold_scores = cross_val_score(
            ridge,
            all_x[feat_idx],
            all_y[target_idx],
            cv=folds,
            scoring=metric,
        )
        print("{} while predicting {} = {}".format(metric, target_name, fold_scores.mean()))
        all_scores["{}_{}".format(feat_name, target_name)] = fold_scores
    print()

Predicting using incremental_bert_embeddings_layer12
r2 while predicting spacy_semantic_vecs = 0.18536845903783322

Predicting using contrege_comp_set_0
r2 while predicting spacy_semantic_vecs = 0.05180721910327216

Predicting using contrege_comp_set_1
r2 while predicting spacy_semantic_vecs = 0.052162110515056835

Predicting using contrege_comp_set_2
r2 while predicting spacy_semantic_vecs = 0.05134132085576508

Predicting using contrege_comp_set_3
r2 while predicting spacy_semantic_vecs = 0.0523368386700116

Predicting using contrege_comp_set_4
r2 while predicting spacy_semantic_vecs = 0.050521799677966994

Predicting using contrege_incomp_set_0
r2 while predicting spacy_semantic_vecs = 0.019135909353539683

Predicting using contrege_incomp_set_1
r2 while predicting spacy_semantic_vecs = 0.020406839394256977

Predicting using contrege_incomp_set_2
r2 while predicting spacy_semantic_vecs = 0.01857878511735908

Predicting using contrege_incomp_set_3
r2 while predicting spacy_semantic_v

In [5]:
# Average the mean cross-validation score over the numbered sets of each
# family, reading the per-fold scores stored in all_scores.
all_contrege_sets = ["contrege_comp", "contrege_incomp", "incontrege"]
num_sets = 5

for family in all_contrege_sets:
    # Accumulate mean / num_sets term by term (rather than summing and then
    # dividing) so the floating-point result is unchanged.
    family_avg = 0.0
    for set_idx in range(num_sets):
        score_key = "{}_set_{}_spacy_semantic_vecs".format(family, set_idx)
        family_avg += all_scores[score_key].mean() / num_sets
    print("Average R2 for {} = {}".format(family, family_avg))

Average R2 for contrege_comp = 0.05163385776441454
Average R2 for contrege_incomp = 0.019600973899276328
Average R2 for incontrege = 0.021470843240101255
