In [1]:
import numpy as np
from sklearn.linear_model import LogisticRegression, LinearRegression, RidgeClassifierCV, RidgeCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.model_selection import cross_val_score
import pandas as pd
from collections import Counter
from scipy.stats import wilcoxon
from IPython.core.debugger import set_trace
import os
import pdb

# Seed NumPy's legacy global RNG once, right after the imports, so np.random draws are repeatable.
np.random.seed(97)

In [2]:
# Each probing target is written next to the scorer used to evaluate it; the two
# parallel lists consumed by the evaluation loops (indexed together by position)
# are derived from these pairs so they cannot drift out of alignment.
target_scorer_pairs = [
    ("word_frequency", "r2"),
    ("spacy_semantic_vecs", "r2"),
    ("pos_tags", "accuracy"),
    ("node_count", "r2"),
    ("syntactic_surprisal", "r2"),
]
all_features_to_predict = [target for target, _ in target_scorer_pairs]
metrics = [scorer for _, scorer in target_scorer_pairs]

# Input representations: the layer-12 BERT embeddings followed by contrege_comp_set_0 .. 4.
all_input_features = ["incremental_bert_embeddings_layer12"] + [
    "contrege_comp_set_{}".format(set_idx) for set_idx in range(5)
]

In [3]:
# ',': 11, '.': 13, ':': 22, '``': 23, "''": 24, '-LRB-': 37, '-RRB-': 38
punctuation_dims = [11, 13, 22, 23, 24, 37, 38]


def _single_pos_label(tag_row):
    """Collapse one row of POS indicator values into a single integer label.

    * exactly one dimension equal to 1 -> that dimension;
    * several dimensions equal to 1 -> the first one that is not listed in
      ``punctuation_dims``; if every active dimension is punctuation, the
      first active dimension;
    * no dimension equal to 1 -> 40.
    """
    active = np.where(tag_row == 1)[0]
    if active.shape[0] == 0:
        return 40  # "+" symbol which is used for fixation
    if active.shape[0] == 1:
        return active[0]
    non_punct = [dim for dim in active if dim not in punctuation_dims]
    return non_punct[0] if non_punct else active[0]


# One array per input representation, in the order of all_input_features.
all_x = [np.load(os.path.join("features", feat + ".npy")) for feat in all_input_features]

# One array per prediction target, in the order of all_features_to_predict.
all_y = []
for out in all_features_to_predict:
    y = np.load(os.path.join("features", out + ".npy"))
#     if out == "fastText":
#         pca = PCA(n_components=100)
#         y = pca.fit_transform(y)
#         print(y.shape)
    if out == "pos_tags":
        # Turn the indicator rows into one class id per row for the classifier.
        y = np.array([_single_pos_label(tag_row) for tag_row in y])
    all_y.append(y)

In [4]:
# from sklearn.decomposition import PCA
# import matplotlib.pyplot as plt

# pca = PCA().fit(all_y[1])
# plt.plot(np.cumsum(pca.explained_variance_ratio_))
# plt.xlabel('number of components')
# plt.ylabel('cumulative explained variance');

In [5]:
# Probe every input representation against every target with 10-fold
# cross-validation and keep the per-fold scores under "<input feature>_<target>".
all_scores = {}
for feat_idx, feat in enumerate(all_input_features):
    print("Predicting using {}".format(feat))
    for target_idx, out in enumerate(all_features_to_predict):
        np.random.seed(97)
        scorer = metrics[target_idx]
        # KFold is left at its defaults apart from the fold count (no shuffle requested).
        folds = KFold(n_splits=10)
        if scorer == "accuracy":
            # Classification target: multinomial logistic regression.
            model = LogisticRegression(random_state=0, multi_class="multinomial", max_iter=5000)
        else:
            # Regression target: ridge with the penalty chosen from a log-spaced grid.
            model = RidgeCV(alphas=np.logspace(-10, 10, num=10), scoring=scorer)
        scores = cross_val_score(model, all_x[feat_idx], all_y[target_idx], cv=folds, scoring=scorer)
        print("{} while predicting {} = {}".format(scorer, out, scores.mean()))
        all_scores["{}_{}".format(feat, out)] = scores
    print()

Predicting using incremental_bert_embeddings_layer12
r2 while predicting word_frequency = 0.6176506132434076
r2 while predicting spacy_semantic_vecs = 0.1853684590378321
accuracy while predicting pos_tags = 0.6130411566581779
r2 while predicting node_count = 0.4264671774529605
r2 while predicting syntactic_surprisal = 0.21338211378214994

Predicting using contrege_comp_set_0
r2 while predicting word_frequency = 0.2131402223235316
r2 while predicting spacy_semantic_vecs = 0.05180721910327213
accuracy while predicting pos_tags = 0.7279609120034654
r2 while predicting node_count = 0.13272405995926162
r2 while predicting syntactic_surprisal = 0.14840505342373067

Predicting using contrege_comp_set_1
r2 while predicting word_frequency = 0.202722877152706
r2 while predicting spacy_semantic_vecs = 0.052162110515056835
accuracy while predicting pos_tags = 0.7130908194737982
r2 while predicting node_count = 0.18948848760046885
r2 while predicting syntactic_surprisal = 0.1610505726661604

Predic

In [21]:
# Display the per-fold score arrays stored in `all_scores`.
# NOTE(review): the saved output under this cell contains `..._fastText` keys, which do not
# correspond to any entry of the current target list — it appears to come from an earlier
# run; re-execute the notebook top to bottom to refresh it.
all_scores

{'incremental_bert_embeddings_layer12_word_frequency': array([-0.17204307,  0.41182346,  0.35154598,  0.28444037,  0.41799525,
         0.26960325,  0.27251042,  0.23901261,  0.31365013,  0.37044526]),
 'incremental_bert_embeddings_layer12_fastText': array([-0.86820195, -0.48222275, -0.46482363, -0.49671835, -0.55802907,
        -0.65501019, -0.35686091, -0.65347806, -0.49702699, -0.5776003 ]),
 'incremental_bert_embeddings_layer12_pos_tags': array([0.54247104, 0.58494208, 0.61389961, 0.57142857, 0.66409266,
        0.6003861 , 0.65764023, 0.62088975, 0.64023211, 0.6344294 ]),
 'incremental_bert_embeddings_layer12_node_count': array([ 0.10185245,  0.18024643, -0.01736591, -0.02148069, -0.31129771,
        -0.13149955,  0.1099407 , -0.04224845,  0.05174373,  0.14597573]),
 'incremental_bert_embeddings_layer12_syntactic_surprisal': array([-0.58511652, -0.5027491 , -0.25335811, -0.1042836 , -0.3319272 ,
        -0.29208119, -0.19194023, -0.29994372, -0.41386996, -0.38299246]),
 'contrege_

In [10]:
# Ridge-only variant of the probing loop: RidgeClassifierCV for the accuracy
# target, RidgeCV for the r2 targets, both searching the same penalty grid.
ridge_alphas = np.logspace(-10, 10, num=10)

all_scores_ridge = {}
for feat_idx, feat in enumerate(all_input_features):
    print("Predicting using {}".format(feat))
    for target_idx, out in enumerate(all_features_to_predict):
        np.random.seed(97)
        scorer = metrics[target_idx]
        folds = KFold(n_splits=10)
        estimator_cls = RidgeClassifierCV if scorer == "accuracy" else RidgeCV
        model = estimator_cls(alphas=ridge_alphas, scoring=scorer)
        scores = cross_val_score(model, all_x[feat_idx], all_y[target_idx], cv=folds, scoring=scorer)
        print("{} while predicting {} = {}".format(scorer, out, scores.mean()))
        all_scores_ridge["{}_{}".format(feat, out)] = scores
    print()

Predicting using incremental_bert_embeddings_layer12
r2 while predicting word_frequency = 0.3758678566515151
r2 while predicting fastText = 0.06295898803237668
accuracy while predicting pos_tags = 0.2351246798055308
r2 while predicting node_count = 0.08721654392893403
r2 while predicting syntactic_surprisal = 0.12892627986390812

Predicting using contrege_comp_set_0
r2 while predicting word_frequency = 0.2131402223235316
r2 while predicting fastText = 0.04250149567257532
accuracy while predicting pos_tags = 0.5761162931375698
r2 while predicting node_count = 0.13272405995926162
r2 while predicting syntactic_surprisal = 0.14840505342373067

Predicting using contrege_comp_set_1
r2 while predicting word_frequency = 0.202722877152706
r2 while predicting fastText = 0.0433793406694456
accuracy while predicting pos_tags = 0.572826971763142
r2 while predicting node_count = 0.18948848760046885
r2 while predicting syntactic_surprisal = 0.1610505726661604

Predicting using contrege_comp_set_2
r2 

In [16]:
# Re-run the ridge probing loop for a single target only.
#
# The target filter used to be an inline string compared inside the loop. When
# that name is not present in `all_features_to_predict`, every iteration is
# skipped and the cell silently leaves `all_scores_ridge` empty. The filter is
# now named and a missing target is reported up front.
# NOTE(review): the target list defined earlier in this notebook contains
# "spacy_semantic_vecs" rather than "fastText" — confirm which target is intended.
only_target = "fastText"
if only_target not in all_features_to_predict:
    print("WARNING: target {!r} is not in all_features_to_predict; no scores will be computed.".format(only_target))

# Note: this resets `all_scores_ridge`, discarding results stored by earlier cells.
all_scores_ridge = {}
for i, feat in enumerate(all_input_features):
    print("Predicting using {}".format(feat))
    for j, out in enumerate(all_features_to_predict):
        if out != only_target:
            continue
        np.random.seed(97)
        # Both branches use the same 10-fold split, so it is built once.
        skf = KFold(n_splits=10)
        if metrics[j] == "accuracy":
            clf = RidgeClassifierCV(alphas=np.logspace(-10, 10, num=10), scoring=metrics[j])
        else:
            clf = RidgeCV(alphas=np.logspace(-10, 10, num=10), scoring=metrics[j])
        scores = cross_val_score(clf, all_x[i], all_y[j], cv=skf, scoring=metrics[j])
        print("{} while predicting {} = {}".format(metrics[j], out, scores.mean()))
        all_scores_ridge["{}_{}".format(feat, out)] = scores
    print()

Predicting using incremental_bert_embeddings_layer12
r2 while predicting fastText = 0.06504996814306294

Predicting using contrege_comp_set_0
r2 while predicting fastText = 0.016479037585760185

Predicting using contrege_comp_set_1
r2 while predicting fastText = 0.016096265390334

Predicting using contrege_comp_set_2
r2 while predicting fastText = 0.016684313403571387

Predicting using contrege_comp_set_3
r2 while predicting fastText = 0.01650641399477072

Predicting using contrege_comp_set_4
r2 while predicting fastText = 0.016294676179709502



In [32]:
# Probing loop with a narrower ridge penalty grid (1e-3 .. 1e6) for the r2
# targets and logistic regression for the accuracy target. Results are stored
# in `all_scores_ridge` under "<input feature>_<target>", replacing its contents.
all_scores_ridge = {}
for feat_idx, feat in enumerate(all_input_features):
    print("Predicting using {}".format(feat))
    for target_idx, out in enumerate(all_features_to_predict):
        np.random.seed(97)
        scorer = metrics[target_idx]
        is_classification = scorer == "accuracy"
        # The accuracy target is not evaluated for the first input feature set.
        # NOTE(review): the reason for this skip is not recorded here — confirm it is still wanted.
        if is_classification and feat_idx == 0:
            continue
        folds = KFold(n_splits=10)
        if is_classification:
            model = LogisticRegression(random_state=0, multi_class="multinomial", max_iter=5000)
        else:
            model = RidgeCV(alphas=np.logspace(-3, 6, num=10), scoring=scorer)
        scores = cross_val_score(model, all_x[feat_idx], all_y[target_idx], cv=folds, scoring=scorer)
        print("{} while predicting {} = {}".format(scorer, out, scores.mean()))
        all_scores_ridge["{}_{}".format(feat, out)] = scores
    print()

Predicting using incremental_bert_embeddings_layer12
r2 while predicting word_frequency = 0.5916723588608107
r2 while predicting fastText = 0.11334612526928942
r2 while predicting node_count = 0.4186099559196049
r2 while predicting syntactic_surprisal = 0.1966278790962245

Predicting using contrege_incomp_set_0
r2 while predicting word_frequency = 0.21799889537640307
r2 while predicting fastText = 0.034505211262141


KeyboardInterrupt: 

In [6]:
# Load the saved node_count array directly from the features directory for inspection.
node_count = np.load("features/node_count.npy")

In [9]:
# Count how many entries of node_count are strictly greater than 1.
(node_count > 1).sum()

2440