In [1]:
import numpy as np
import os

# Directory that every .npy feature array in this notebook is read from and
# written back to (relative to the kernel's working directory).
features_dir = "features"

In [2]:
# Build the feature group that bundles all complexity metrics (CM).
# CM = NC + SS + WF + WL, stacked in exactly that order:
#   node count, syntactic surprisal, word frequency, word length.


def _load_flattened(file_name):
    """Load ``file_name`` from ``features_dir`` as a 2-D array.

    The first axis is kept as-is; every remaining axis is collapsed into a
    single second axis so the result can be passed to ``np.hstack`` alongside
    the other metrics.
    """
    loaded = np.load(os.path.join(features_dir, file_name))
    return loaded.reshape((loaded.shape[0], -1))


node_count = _load_flattened("node_count.npy")
syntactic_surprisal = _load_flattened("syntactic_surprisal.npy")
word_frequency = _load_flattened("word_frequency.npy")
word_length = _load_flattened("word_length.npy")

# Concatenate the four flattened metrics and persist the combined group.
all_complexity_metrics = np.hstack(
    (node_count, syntactic_surprisal, word_frequency, word_length)
)
np.save(
    os.path.join(features_dir, "all_complexity_metrics.npy"),
    all_complexity_metrics,
)

In [4]:
# create CM + PU: append the punctuation features (PU) to the combined
# complexity metrics and to each individual metric.
# assumes punct is 2-D with the same number of rows as the metrics — TODO confirm

punct = np.load(os.path.join(features_dir, "punct_final.npy"))

# punct always goes last in each stack
all_complexity_metrics_punct = np.hstack((all_complexity_metrics, punct))
node_count_punct = np.hstack((node_count, punct))
syntactic_surprisal_punct = np.hstack((syntactic_surprisal, punct))
word_frequency_punct = np.hstack((word_frequency, punct))
word_length_punct = np.hstack((word_length, punct))

# write every "<metric>_punct" array next to its inputs
for _file_name, _stacked in (
    ("all_complexity_metrics_punct.npy", all_complexity_metrics_punct),
    ("node_count_punct.npy", node_count_punct),
    ("syntactic_surprisal_punct.npy", syntactic_surprisal_punct),
    ("word_frequency_punct.npy", word_frequency_punct),
    ("word_length_punct.npy", word_length_punct),
):
    np.save(os.path.join(features_dir, _file_name), _stacked)

In [5]:
# create PD (POS tags + dependency tags) and PD + CM
# NOTE(review): this cell was originally headed "PD + CM + PU", but the
# punctuation features are not stacked anywhere below — confirm whether PU
# is meant to be included.

pos_tags = np.load(os.path.join(features_dir, "pos_tags.npy"))
dep_tags = np.load(os.path.join(features_dir, "dep_tags.npy"))

# PD: POS tags first, dependency tags second
pos_dep_tags = np.hstack((pos_tags, dep_tags))
np.save(os.path.join(features_dir, "pos_dep_tags.npy"), pos_dep_tags)

# PD + CM: the tag block followed by the combined complexity metrics
pos_dep_tags_all_complexity_metrics = np.hstack(
    (pos_dep_tags, all_complexity_metrics)
)
np.save(
    os.path.join(features_dir, "pos_dep_tags_all_complexity_metrics.npy"),
    pos_dep_tags_all_complexity_metrics,
)

In [6]:
# create hierarchical spaces: each contrege set is prepended to PD + CM.
# NOTE(review): the BERT-prefixed variant is only written for the
# "contrege_incomp" sets, not for all three set families — confirm this is
# intended.

num_contrege_sets_per_space = 5
contrege_sets = ["contrege_comp", "contrege_incomp", "incontrege"]

bert_pca = np.load(
    os.path.join(features_dir, "incremental_bert_embeddings_layer12_PCA_dims_15.npy")
)

for set_name in contrege_sets:
    # decided once per set family rather than once per set index
    with_bert = set_name == "contrege_incomp"

    for set_idx in range(num_contrege_sets_per_space):
        cs_path = os.path.join(features_dir, set_name + "_set_{}.npy".format(set_idx))
        cs = np.load(cs_path)

        # contrege set + PD + CM
        cs_pos_dep_tags_all_complexity_metrics = np.hstack(
            (cs, pos_dep_tags_all_complexity_metrics)
        )
        cs_out_name = "{}_set_{}_pos_dep_tags_all_complexity_metrics.npy".format(set_name, set_idx)
        np.save(
            os.path.join(features_dir, cs_out_name),
            cs_pos_dep_tags_all_complexity_metrics,
        )

        if not with_bert:
            continue

        # BERT PCA embeddings + contrege set + PD + CM
        bert_cs_pos_dep_tags_all_complexity_metrics = np.hstack(
            (bert_pca, cs, pos_dep_tags_all_complexity_metrics)
        )
        bert_out_name = "bert_PCA_dims_15_{}_set_{}_pos_dep_tags_all_complexity_metrics.npy".format(set_name, set_idx)
        np.save(
            os.path.join(features_dir, bert_out_name),
            bert_cs_pos_dep_tags_all_complexity_metrics,
        )