In [1]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
import pandas as pd
from collections import Counter
from scipy.stats import wilcoxon
from IPython.core.debugger import set_trace

# Seed NumPy's legacy global RNG so that any np.random-based randomness is repeatable across runs.
np.random.seed(97)

In [2]:
# contrege results
# For every feature-set family, run stratified cross-validation to predict the
# ancestor label at each level, average the accuracy over the sampled sets, and
# compare the per-fold accuracies with a majority-class baseline (Wilcoxon test).
all_sets = ["contrege_comp", "contrege_incomp", "incontrege"]
num_sets = 5
sets_dir = "features"
y_dir = "ancestor_information_analysis"
# Single source of truth for the fold count: it sizes both the splitter and the
# per-fold accumulator, so the two can no longer drift apart.
n_splits = 10
clf = LogisticRegression(random_state=0, multi_class="multinomial", max_iter=5000)

# One target vector per level (2..9), reduced to a class index with argmax over
# axis 1 -- presumably the stored arrays are one-hot rows; TODO confirm.
y_labels = []
all_y = []
for level in range(2, 10):
    y = np.load("{}/level_{}_ancestors.npy".format(y_dir, level))
    all_y.append(np.argmax(y, axis=1))
    y_labels.append("level_{}_ancestors".format(level))

results = []
for set_name in all_sets:
    print(set_name)
    all_x = [
        np.load("{}/{}_set_{}.npy".format(sets_dir, set_name, i))
        for i in range(num_sets)
    ]

    print(len(all_x))
    print(all_x[0].shape)
    result = {"Feature": set_name}

    for i in range(len(all_y)):
        y_level = all_y[i]
        # Built once per level instead of relying on a variable leaked from the
        # inner loop. It is constructed without shuffling, so every use below
        # yields the same folds.
        skf = StratifiedKFold(n_splits=n_splits)

        agg = 0.0
        all_scores = [0.0] * n_splits
        for x_it in all_x:
            scores = cross_val_score(clf, x_it, y_level, cv=skf, scoring="accuracy")
            agg += scores.mean()
            # Running per-fold average across the sampled feature sets.
            for j, fold_score in enumerate(scores):
                all_scores[j] += fold_score / len(all_x)

        # Majority-class baseline per fold: frequency of the most common
        # training label in that fold.
        all_chance = []
        for train_index, _ in skf.split(all_x[0], y_level):
            y_train = y_level[train_index]
            majority_count = Counter(list(y_train)).most_common(1)[0][1]
            all_chance.append(float(majority_count) / y_train.shape[0])

        print(all_scores)
        agg /= len(all_x)
        result[y_labels[i]] = agg
        # Paired test of fold accuracies against the per-fold baseline.
        result["p_val_" + y_labels[i]] = wilcoxon(all_scores, all_chance, zero_method="zsplit").pvalue
    print(result)
    results.append(result)

contrege_comp
5
(5176, 15)




{'Feature': 'contrege_comp', 'level_2_ancestors': 0.6656877897990727, 'level_3_ancestors': 0.5113601236476043, 'level_4_ancestors': 0.596870170015456, 'level_5_ancestors': 0.6792117465224111, 'level_6_ancestors': 0.7727202472952086, 'level_7_ancestors': 0.8208268933539411, 'level_8_ancestors': 0.8626738794435859, 'level_9_ancestors': 0.8968315301391037}
contrege_incomp
5
(5176, 15)




{'Feature': 'contrege_incomp', 'level_2_ancestors': 0.5282843894899537, 'level_3_ancestors': 0.45819165378670784, 'level_4_ancestors': 0.5789799072642967, 'level_5_ancestors': 0.6744976816074189, 'level_6_ancestors': 0.770788253477589, 'level_7_ancestors': 0.8246908809891809, 'level_8_ancestors': 0.8658809891808346, 'level_9_ancestors': 0.9037480680061825}
incontrege
5
(5176, 15)




{'Feature': 'incontrege', 'level_2_ancestors': 0.5265842349304484, 'level_3_ancestors': 0.45568006182380216, 'level_4_ancestors': 0.5788639876352396, 'level_5_ancestors': 0.667967542503864, 'level_6_ancestors': 0.7613987635239567, 'level_7_ancestors': 0.8113601236476043, 'level_8_ancestors': 0.8515455950540959, 'level_9_ancestors': 0.8904559505409584}


In [3]:
# other features
# Same probing protocol as for the sampled feature sets, but each feature here
# comes from a single .npy file, so there is no averaging across sets.
all_features = ["pos_dep_tags", "node_count", "syntactic_surprisal", "word_frequency", "word_length", "all_effort_based_metrics", "incremental_bert_embeddings_layer12_PCA_dims_15"]
features_dir = "features"
y_dir = "ancestor_information_analysis"
clf = LogisticRegression(random_state=0, multi_class="multinomial", max_iter=5000)

# Reload the per-level targets and collapse each row to a class index.
y_labels, all_y = [], []
for level in range(2, 10):
    ancestors = np.load("{}/level_{}_ancestors.npy".format(y_dir, level))
    all_y.append(np.argmax(ancestors, axis=1))
    y_labels.append("level_{}_ancestors".format(level))

for feat_name in all_features:
    print(feat_name)
    raw = np.load("{}/{}.npy".format(features_dir, feat_name))
    # Flatten everything after the first axis so each sample is one row.
    x = raw.reshape((raw.shape[0], -1))

    result = {"Feature": feat_name}
    for label, targets in zip(y_labels, all_y):
        skf = StratifiedKFold(n_splits=10)
        scores = cross_val_score(clf, x, targets, cv=skf, scoring="accuracy")
        all_scores = list(scores)

        # Per-fold baseline: share of the most frequent label in the training part.
        all_chance = []
        for train_index, _ in skf.split(x, targets):
            train_targets = targets[train_index]
            top_count = Counter(list(train_targets)).most_common()[0][1]
            all_chance.append(float(top_count) / train_targets.shape[0])

        print(all_scores)
        p_key = "p_val_" + label
        result[label] = scores.mean()
        result[p_key] = wilcoxon(all_scores, all_chance, zero_method="zsplit").pvalue
        print(result[p_key])

    print(result)
    # Appending to `results` is left disabled, exactly as in the original cell:
#     results.append(result)

FileNotFoundError: [Errno 2] No such file or directory: 'features/pos_dep_tags.npy'

In [None]:
# Persist the accumulated `results` (a list of dicts, so NumPy stores it as a pickled
# object array); loading it back therefore requires allow_pickle=True.
np.save("syntactic_information_analysis_results.npy", results)

In [None]:
# Reload the saved results and rewrite every entry whose key starts with
# "level" as a percentage rounded to two decimals; all other keys are left as is.
results = np.load("syntactic_information_analysis_results.npy", allow_pickle = True).tolist()
for row in results:
    for key in list(row):
        if not key.startswith("level"):
            continue
        scaled = row[key] * 100
        row[key] = np.round(scaled, 2)

# np.save appends the ".npy" extension because the name given here has none.
np.save("syntactic_information_analysis_results_formatted",results)

# Export the formatted rows as a CSV table.
df = pd.DataFrame(results)
df.to_csv("final_syntactic_information_analysis_results.csv")

In [None]:
# generate table for label distribution
# For each level, count how many samples carry each label and what percentage
# of that level's samples this represents.
dist = []
label_names = {0: "noun_phrase", 1: "verb_phrase", 2: "adverb_phrase", 3: "adjective_phrase", 4: "prepositional_phrase", 5: "clause", 6: "other"}

for i in range(len(all_y)):
    level_targets = all_y[i]
    total = level_targets.shape[0]
    d = {"level" : y_labels[i]}
    # Iterate the mapping itself so the label ids cannot drift from a separate
    # hard-coded range.
    for lab, name in label_names.items():
        su = (level_targets == lab).sum()
        d[name] = su
        # The column is titled "(%)", so store a real percentage (0-100)
        # rather than a 0-1 fraction.
        d[name + "(%)"] = 100.0 * su / total
    dist.append(d)

dist_df = pd.DataFrame(dist)
dist_df