In [1]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
import pandas as pd
from collections import Counter
from scipy.stats import wilcoxon
from IPython.core.debugger import set_trace

# Seed NumPy's legacy global RNG so that anything drawing from np.random is
# repeatable across kernel restarts.
# NOTE(review): the classifiers in the cells below are given their own
# random_state and StratifiedKFold is used without shuffling, so this seed may
# not influence the reported numbers - confirm before relying on it.
np.random.seed(97)

In [2]:
# contrege results
#
# For every contrege feature family this cell:
#   1. loads its `num_sets` feature matrices,
#   2. cross-validates a logistic-regression probe on each ancestor level,
#   3. averages the per-fold accuracies over the feature matrices,
#   4. compares the averaged folds against a majority-class baseline with a
#      paired Wilcoxon signed-rank test.
# One dict per feature family is collected in `results`.
all_sets = ["contrege_comp", "contrege_incomp", "incontrege"]
num_sets = 5
# Number of CV folds. It is also the number of paired samples given to the
# Wilcoxon test, so it is defined once instead of being repeated as a literal.
n_splits = 10
sets_dir = "features"
y_dir = "ancestor_information_analysis"
# NOTE(review): recent scikit-learn releases deprecate the `multi_class`
# argument - confirm against the installed version.
clf = LogisticRegression(random_state=0, multi_class="multinomial", max_iter=5000)

# Targets: one label vector per ancestor level (levels 2..9).
y_labels = []
all_y = []
for level in range(2, 10):
    y = np.load("{}/level_{}_ancestors.npy".format(y_dir, level))
    # Collapse every row to the index of its largest entry
    # (assumes rows are one-hot / per-class scores - TODO confirm).
    all_y.append(np.argmax(y, axis=1))
    y_labels.append("level_{}_ancestors".format(level))

# Without shuffling the splitter is deterministic, so a single instance can be
# shared by every cross_val_score call and by the baseline computation.
skf = StratifiedKFold(n_splits=n_splits)

results = []
for set_name in all_sets:
    print(set_name)
    all_x = [
        np.load("{}/{}_set_{}.npy".format(sets_dir, set_name, i))
        for i in range(num_sets)
    ]

    print(len(all_x))
    print(all_x[0].shape)
    result = {"Feature": set_name}

    for label, y in zip(y_labels, all_y):
        agg = 0.0
        # Per-fold accuracy averaged over the feature matrices of this family.
        all_scores = np.zeros(n_splits)
        for x_it in all_x:
            scores = cross_val_score(clf, x_it, y, cv=skf, scoring="accuracy")
            agg += scores.mean()
            all_scores += scores / len(all_x)

        # Baseline per fold: share of the most frequent label in the training
        # part. The folds are stratified on y, so any matrix with the right
        # number of rows yields the same splits.
        all_chance = []
        for train_index, _ in skf.split(all_x[0], y):
            y_train = y[train_index]
            count = max(Counter(y_train.tolist()).values())
            all_chance.append(float(count) / y_train.shape[0])

        # .tolist() prints plain Python floats regardless of the NumPy version.
        print(all_scores.tolist())
        agg /= len(all_x)
        result[label] = agg
        # NOTE(review): wilcoxon is two-sided by default, so accuracies that
        # are consistently *below* the baseline also give a small p-value.
        result["p_val_" + label] = wilcoxon(all_scores, all_chance, zero_method="zsplit").pvalue
    print(result)
    results.append(result)

contrege_comp
5
(5176, 15)
[0.6722007722007721, 0.6691119691119691, 0.6671814671814671, 0.6629343629343629, 0.6992277992277992, 0.6602316602316602, 0.6618955512572533, 0.6684719535783366, 0.6580270793036751, 0.6560928433268859]
[0.5030888030888031, 0.505019305019305, 0.4876447876447877, 0.5212355212355212, 0.5027027027027027, 0.5077220077220077, 0.5272727272727272, 0.5272727272727273, 0.5106382978723404, 0.5098646034816248]
[0.6003861003861004, 0.5872586872586872, 0.6154440154440154, 0.6, 0.6069498069498069, 0.5926640926640927, 0.6050290135396519, 0.597678916827853, 0.5760154738878144, 0.5694390715667311]
[0.6432432432432433, 0.6791505791505791, 0.6918918918918919, 0.672972972972973, 0.6895752895752896, 0.6806949806949807, 0.6955512572533848, 0.7056092843326885, 0.6560928433268858, 0.6800773694390716]




[0.7598455598455599, 0.7768339768339767, 0.7779922779922779, 0.781853281853282, 0.7706563706563707, 0.7857142857142857, 0.7810444874274662, 0.7833655705996132, 0.7415860735009672, 0.7694390715667312]




[0.8003861003861004, 0.8239382239382239, 0.8308880308880309, 0.832046332046332, 0.8169884169884171, 0.832046332046332, 0.8340425531914893, 0.8332688588007737, 0.7872340425531915, 0.8174081237911025]




[0.8432432432432432, 0.8667953667953667, 0.8776061776061777, 0.8629343629343631, 0.8671814671814672, 0.8787644787644788, 0.8754352030947776, 0.8704061895551257, 0.825918762088975, 0.8580270793036749]




[0.88996138996139, 0.91003861003861, 0.898069498069498, 0.9030888030888031, 0.8922779922779923, 0.9027027027027028, 0.9102514506769825, 0.9090909090909092, 0.8591876208897484, 0.8928433268858802]
{'Feature': 'contrege_comp', 'level_2_ancestors': 0.6675375458354182, 'p_val_level_2_ancestors': 0.001953125, 'level_3_ancestors': 0.5102461483312547, 'p_val_level_3_ancestors': 0.001953125, 'level_4_ancestors': 0.5950865178524753, 'p_val_level_4_ancestors': 0.001953125, 'level_5_ancestors': 0.6794859711880988, 'p_val_level_5_ancestors': 0.001953125, 'level_6_ancestors': 0.7728330955990531, 'p_val_level_6_ancestors': 0.001953125, 'level_7_ancestors': 0.8208247014629994, 'p_val_level_7_ancestors': 0.001953125, 'level_8_ancestors': 0.862631233056765, 'p_val_level_8_ancestors': 0.001953125, 'level_9_ancestors': 0.8967512303682517, 'p_val_level_9_ancestors': 0.001953125}
contrege_incomp
5
(5176, 15)
[0.5281853281853283, 0.5073359073359073, 0.5339768339768339, 0.5193050193050193, 0.5424710424710425



KeyboardInterrupt: 

In [None]:
# other features
#
# Same probing analysis as for the contrege features, but each feature here is
# a single matrix: cross-validate a logistic-regression probe per ancestor
# level and compare the fold accuracies against a majority-class baseline with
# a paired Wilcoxon signed-rank test.
all_features = ["pos_dep_tags", "node_count", "syntactic_surprisal", "word_frequency", "word_length", "all_complexity_metrics", "incremental_bert_embeddings_layer12_PCA_dims_15"]
features_dir = "features"
y_dir = "ancestor_information_analysis"
# NOTE(review): recent scikit-learn releases deprecate the `multi_class`
# argument - confirm against the installed version.
clf = LogisticRegression(random_state=0, multi_class="multinomial", max_iter=5000)

# Targets: one label vector per ancestor level (levels 2..9).
y_labels = []
all_y = []
for level in range(2, 10):
    y = np.load("{}/level_{}_ancestors.npy".format(y_dir, level))
    # Collapse every row to the index of its largest entry
    # (assumes rows are one-hot / per-class scores - TODO confirm).
    all_y.append(np.argmax(y, axis=1))
    y_labels.append("level_{}_ancestors".format(level))

# Without shuffling the splitter is deterministic, so one instance is shared by
# cross_val_score and by the baseline computation below.
skf = StratifiedKFold(n_splits=10)

for feat_name in all_features:
    print(feat_name)
    x = np.load("{}/{}.npy".format(features_dir, feat_name))
    # Flatten everything after the first axis into one feature vector per row.
    x = x.reshape((x.shape[0], -1))

    result = {"Feature": feat_name}
    for label, y in zip(y_labels, all_y):
        scores = cross_val_score(clf, x, y, cv=skf, scoring="accuracy")
        agg = scores.mean()
        # Plain Python floats print the same way regardless of NumPy version.
        all_scores = scores.tolist()

        # Baseline per fold: share of the most frequent label in the training
        # part of that fold.
        all_chance = []
        for train_index, _ in skf.split(x, y):
            y_train = y[train_index]
            count = max(Counter(y_train.tolist()).values())
            all_chance.append(float(count) / y_train.shape[0])

        print(all_scores)
        result[label] = agg
        p_key = "p_val_" + label
        # NOTE(review): wilcoxon is two-sided by default, so accuracies that
        # are consistently *below* the baseline also give a small p-value.
        result[p_key] = wilcoxon(all_scores, all_chance, zero_method="zsplit").pvalue
        print(result[p_key])

    print(result)
    # NOTE(review): appending is disabled, so these rows are only printed and
    # never reach `results` (and therefore the saved files) - confirm intended.
#     results.append(result)

In [None]:
# Persist the collected result rows. `results` is a list of dicts, so NumPy
# stores it as a pickled object array; reading it back needs allow_pickle=True.
np.save("syntactic_information_analysis_results.npy", results)

In [None]:
# Reload the raw result rows from disk (so re-running this cell never scales
# the same values twice), express the accuracies as percentages with two
# decimals, then store the formatted rows both as .npy and as CSV.
results = np.load("syntactic_information_analysis_results.npy", allow_pickle = True).tolist()
for row in results:
    for key in row:
        # Only the accuracy columns start with "level"; the "Feature" name and
        # the "p_val_*" entries are left untouched.
        if not key.startswith("level"):
            continue
        row[key] = np.round(row[key] * 100, 2)

np.save("syntactic_information_analysis_results_formatted",results)

df = pd.DataFrame(results)
df.to_csv("final_syntactic_information_analysis_results.csv")

In [None]:
# generate table for label distribution
#
# One row per ancestor level; for every phrase label the row holds the raw
# count and the share of samples carrying that label, as a percentage.
# Relies on `all_y` and `y_labels` built by an earlier cell.
dist = []
label_names = {0: "noun_phrase", 1: "verb_phrase", 2: "adverb_phrase", 3: "adjective_phrase", 4: "prepositional_phrase", 5: "clause", 6: "other"}

for level_name, labels in zip(y_labels, all_y):
    row = {"level" : level_name}
    total = labels.shape[0]
    # Iterate the mapping itself so the label ids cannot drift out of sync
    # with a separately hard-coded range.
    for lab, name in label_names.items():
        count = (labels == lab).sum()
        row[name] = count
        # The column is labelled "(%)", so store a percentage (0-100) rather
        # than a fraction (0-1).
        row[name + "(%)"] = 100 * count / total
    dist.append(row)

dist_df = pd.DataFrame(dist)
dist_df