In [1]:
import numpy as np
from sklearn.linear_model import LogisticRegression, LinearRegression, RidgeClassifierCV, RidgeCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.model_selection import cross_val_score
import pandas as pd
from collections import Counter
from scipy.stats import wilcoxon
from IPython.core.debugger import set_trace
import os
import pdb

# Seed NumPy's legacy global RNG so that any NumPy-based randomness in later
# cells is repeatable across kernel restarts.
# NOTE(review): the StratifiedKFold splitters in this notebook are created
# without shuffle=True, so the visible code does not obviously draw random
# numbers -- confirm whether this seed is still needed.
np.random.seed(97)

In [2]:
# contrege results
#
# For every contrege feature family, load `num_sets` feature matrices, run
# stratified cross-validation of a ridge classifier against the ancestor label
# of each tree level (2..9), average the accuracies over the matrices, and
# compare the per-fold averages with a majority-class baseline using a
# Wilcoxon signed-rank test. One summary dict per family is appended to
# `results`.
all_sets = ["contrege_comp", "contrege_incomp", "incontrege"]
num_sets = 5
# Single source of truth for the fold count: it sizes both the splitter and the
# per-fold accumulator, so the two can never drift apart.
num_folds = 10
sets_dir = "features"
y_dir = "ancestor_information_analysis"
clf = RidgeClassifierCV(alphas=np.logspace(-10, 10, num=20), scoring="accuracy")

# Targets: one array per level; argmax over axis 1 collapses each row of the
# stored matrix to a single class index.
y_labels = []
all_y = []
for level in range(2, 10):
    y = np.load("{}/level_{}_ancestors.npy".format(y_dir, level))
    all_y.append(np.argmax(y, axis=1))
    y_labels.append("level_{}_ancestors".format(level))

# The splitter is loop-invariant (no shuffling), so build it once and reuse it
# for both the cross-validation and the baseline computation.
skf = StratifiedKFold(n_splits=num_folds)

results = []
for set_name in all_sets:
    print(set_name)
    all_x = [
        np.load("{}/{}_set_{}.npy".format(sets_dir, set_name, i))
        for i in range(num_sets)
    ]

    print(len(all_x))
    print(all_x[0].shape)
    result = {"Feature": set_name}

    for y_label, y_true in zip(y_labels, all_y):
        agg = 0.0                          # running sum of per-matrix mean accuracies
        fold_means = np.zeros(num_folds)   # per-fold accuracy averaged over the matrices
        for x_it in all_x:
            scores = cross_val_score(clf, x_it, y_true, cv=skf, scoring="accuracy")
            agg += scores.mean()
            fold_means += scores / len(all_x)
        agg /= len(all_x)
        all_scores = fold_means.tolist()

        # Baseline per fold: share of the most frequent label in the training
        # portion of that fold.
        # NOTE(review): this is the training-set majority share, not the
        # accuracy a majority-class predictor would reach on the held-out
        # fold; the two differ slightly -- confirm this is intended.
        all_chance = []
        for train_index, _ in skf.split(all_x[0], y_true):
            y_train = y_true[train_index]
            majority_count = max(Counter(list(y_train)).values())
            all_chance.append(float(majority_count) / y_train.shape[0])

        print(all_scores)
        result[y_label] = agg
        # NOTE(review): `alternative` is not passed, so SciPy's default
        # two-sided test is used; a small p-value alone does not say whether
        # the scores are above or below the baseline.
        result["p_val_" + y_label] = wilcoxon(all_scores, all_chance, zero_method="zsplit").pvalue
    print(result)
    results.append(result)

contrege_comp
5
(5176, 15)
[0.5706563706563706, 0.5849420849420849, 0.5644787644787644, 0.5791505791505791, 0.5706563706563706, 0.5787644787644787, 0.5876208897485493, 0.5798839458413926, 0.5926499032882012, 0.5500967117988395]
[0.48880308880308876, 0.5023166023166022, 0.4772200772200772, 0.5073359073359073, 0.4772200772200772, 0.4922779922779923, 0.5141199226305609, 0.5218568665377176, 0.4978723404255319, 0.4990328820116054]
[0.5540540540540541, 0.5447876447876447, 0.55984555984556, 0.5486486486486487, 0.5471042471042471, 0.5420849420849422, 0.5470019342359768, 0.5435203094777563, 0.5454545454545454, 0.5535783365570599]
[0.640926640926641, 0.6482625482625484, 0.640926640926641, 0.640926640926641, 0.640926640926641, 0.638996138996139, 0.6402321083172147, 0.6402321083172147, 0.6402321083172147, 0.6402321083172147]




[0.7517374517374518, 0.7779922779922779, 0.7602316602316601, 0.7528957528957527, 0.7555984555984556, 0.7752895752895752, 0.7628626692456479, 0.7736943907156673, 0.7427466150870408, 0.7613152804642167]




[0.7972972972972973, 0.8173745173745174, 0.832046332046332, 0.8119691119691119, 0.8096525096525097, 0.822007722007722, 0.8147001934235978, 0.8135396518375242, 0.7872340425531915, 0.816247582205029]




[0.8359073359073359, 0.8498069498069498, 0.8567567567567568, 0.832046332046332, 0.849034749034749, 0.8335907335907335, 0.8762088974854932, 0.8704061895551257, 0.825918762088975, 0.8568665377176016]




[0.88996138996139, 0.9111969111969112, 0.8826254826254827, 0.8942084942084942, 0.8710424710424711, 0.874903474903475, 0.8796905222437136, 0.8889748549323018, 0.8607350096711799, 0.8851063829787233]
{'Feature': 'contrege_comp', 'level_2_ancestors': 0.5758900099325631, 'p_val_level_2_ancestors': 0.001953125, 'level_3_ancestors': 0.49780557567791606, 'p_val_level_3_ancestors': 0.001953125, 'level_4_ancestors': 0.5486080222250436, 'p_val_level_4_ancestors': 0.02734375, 'level_5_ancestors': 0.641189368423411, 'p_val_level_5_ancestors': 0.431640625, 'level_6_ancestors': 0.7614364129257747, 'p_val_level_6_ancestors': 0.001953125, 'level_7_ancestors': 0.8122068960366834, 'p_val_level_7_ancestors': 0.001953125, 'level_8_ancestors': 0.8486543243990052, 'p_val_level_8_ancestors': 0.001953125, 'level_9_ancestors': 0.8838444993764142, 'p_val_level_9_ancestors': 0.001953125}
contrege_incomp
5
(5176, 15)
[0.5208494208494208, 0.5104247104247104, 0.5223938223938224, 0.5135135135135135, 0.52355212355212



[0.7355212355212355, 0.7335907335907335, 0.7335907335907335, 0.7335907335907335, 0.7335907335907335, 0.7335907335907335, 0.7350096711798839, 0.7350096711798839, 0.7350096711798839, 0.7350096711798839]




[0.7818532818532818, 0.7818532818532818, 0.7818532818532818, 0.7818532818532818, 0.7818532818532818, 0.7818532818532818, 0.7833655705996132, 0.7833655705996132, 0.7833655705996132, 0.7833655705996132]




[0.8243243243243243, 0.8243243243243243, 0.8243243243243243, 0.8243243243243243, 0.8223938223938223, 0.8223938223938223, 0.8239845261121856, 0.8239845261121856, 0.8239845261121856, 0.8239845261121856]




[0.8714285714285714, 0.8903474903474904, 0.8656370656370658, 0.884942084942085, 0.884942084942085, 0.8837837837837839, 0.869245647969052, 0.8889748549323016, 0.8595744680851063, 0.8785299806576401]
{'Feature': 'contrege_incomp', 'level_2_ancestors': 0.5191295938104449, 'p_val_level_2_ancestors': 0.005859375, 'level_3_ancestors': 0.4407663756599927, 'p_val_level_3_ancestors': 0.001953125, 'level_4_ancestors': 0.5439716809929577, 'p_val_level_4_ancestors': 0.375, 'level_5_ancestors': 0.6404557776898203, 'p_val_level_5_ancestors': 0.4921875, 'level_6_ancestors': 0.7343513588194439, 'p_val_level_6_ancestors': 0.4921875, 'level_7_ancestors': 0.7824581973518144, 'p_val_level_7_ancestors': 0.556640625, 'level_8_ancestors': 0.8238023046533686, 'p_val_level_8_ancestors': 0.431640625, 'level_9_ancestors': 0.8777406032725181, 'p_val_level_9_ancestors': 0.001953125}
incontrege
5
(5176, 15)
[0.5347490347490347, 0.5239382239382239, 0.5301158301158301, 0.5096525096525096, 0.5382239382239383, 0.511969



[0.7355212355212355, 0.7335907335907335, 0.7335907335907335, 0.7335907335907335, 0.7335907335907335, 0.7335907335907335, 0.7350096711798839, 0.7350096711798839, 0.7350096711798839, 0.7350096711798839]




[0.7818532818532818, 0.7818532818532818, 0.7818532818532818, 0.7818532818532818, 0.7818532818532818, 0.7818532818532818, 0.7833655705996132, 0.7833655705996132, 0.7833655705996132, 0.7833655705996132]




[0.8243243243243243, 0.8243243243243243, 0.8243243243243243, 0.8243243243243243, 0.8223938223938223, 0.8223938223938223, 0.8239845261121856, 0.8239845261121856, 0.8239845261121856, 0.8239845261121856]




[0.859073359073359, 0.859073359073359, 0.8571428571428572, 0.8571428571428572, 0.8571428571428572, 0.8571428571428572, 0.8588007736943906, 0.8588007736943906, 0.8588007736943906, 0.8588007736943906]
{'Feature': 'incontrege', 'level_2_ancestors': 0.5225244393329499, 'p_val_level_2_ancestors': 0.01953125, 'level_3_ancestors': 0.43266603436816203, 'p_val_level_3_ancestors': 0.005859375, 'level_4_ancestors': 0.5442426233915596, 'p_val_level_4_ancestors': 0.845703125, 'level_5_ancestors': 0.6404557776898203, 'p_val_level_5_ancestors': 0.4921875, 'level_6_ancestors': 0.7343513588194439, 'p_val_level_6_ancestors': 0.4921875, 'level_7_ancestors': 0.7824581973518144, 'p_val_level_7_ancestors': 0.556640625, 'level_8_ancestors': 0.8238023046533686, 'p_val_level_8_ancestors': 0.431640625, 'level_9_ancestors': 0.858192124149571, 'p_val_level_9_ancestors': 0.556640625}




In [3]:
# other features
#
# Same probing protocol as for the contrege sets, but each feature space here
# is a single array: flatten it to 2-D, cross-validate a ridge classifier
# against every ancestor level, and test the fold accuracies against the
# majority-class share of the training folds. Summaries are appended to the
# `results` list created by the previous cell.
all_features = [
    "pos_dep_tags",
    "node_count",
    "syntactic_surprisal",
    "word_frequency",
    "word_length",
    "all_complexity_metrics",
    "incremental_bert_embeddings_layer12_PCA_dims_15",
]
features_dir = "features"
y_dir = "ancestor_information_analysis"
clf = RidgeClassifierCV(alphas=np.logspace(-3, 6, num=10), scoring="accuracy")

# Class-index targets (argmax over axis 1) and their column labels, levels 2..9.
levels = list(range(2, 10))
y_labels = ["level_{}_ancestors".format(level) for level in levels]
all_y = [
    np.argmax(np.load("{}/level_{}_ancestors.npy".format(y_dir, level)), axis=1)
    for level in levels
]

for feat_name in all_features:
    print(feat_name)
    x = np.load("{}/{}.npy".format(features_dir, feat_name))
    # Keep the first axis as samples and flatten everything else into features.
    x = x.reshape((x.shape[0], -1))

    result = {"Feature": feat_name}
    for label, target in zip(y_labels, all_y):
        skf = StratifiedKFold(n_splits=10)
        scores = cross_val_score(clf, x, target, cv=skf, scoring="accuracy")
        all_scores = list(scores)

        # Per-fold baseline: frequency of the most common training label.
        all_chance = []
        for train_index, _ in skf.split(x, target):
            y_train = target[train_index]
            top_count = max(Counter(list(y_train)).values())
            all_chance.append(float(top_count) / y_train.shape[0])

        print(all_scores)
        p_key = "p_val_" + label
        result[label] = scores.mean()
        result[p_key] = wilcoxon(all_scores, all_chance, zero_method="zsplit").pvalue
        print(result[p_key])

    print(result)
    results.append(result)

pos_dep_tags
[0.9247104247104247, 0.9208494208494209, 0.8957528957528957, 0.9266409266409267, 0.9362934362934363, 0.9247104247104247, 0.9090909090909091, 0.8878143133462283, 0.9168278529980658, 0.9129593810444874]
0.001953125
[0.6718146718146718, 0.6891891891891891, 0.7181467181467182, 0.7104247104247104, 0.7393822393822393, 0.7413127413127413, 0.6963249516441006, 0.7079303675048356, 0.7001934235976789, 0.7117988394584139]
0.001953125
[0.6988416988416989, 0.6351351351351351, 0.6177606177606177, 0.6891891891891891, 0.667953667953668, 0.637065637065637, 0.6305609284332688, 0.5841392649903289, 0.6711798839458414, 0.6421663442940039]
0.001953125
[0.6602316602316602, 0.694980694980695, 0.7162162162162162, 0.6891891891891891, 0.7123552123552124, 0.6872586872586872, 0.7214700193423598, 0.7079303675048356, 0.655705996131528, 0.6731141199226306]
0.001953125




[0.7760617760617761, 0.7741312741312741, 0.7722007722007722, 0.7915057915057915, 0.7664092664092664, 0.7915057915057915, 0.7833655705996132, 0.7833655705996132, 0.746615087040619, 0.7678916827852998]
0.001953125




[0.7895752895752896, 0.8281853281853282, 0.832046332046332, 0.8359073359073359, 0.8166023166023166, 0.832046332046332, 0.8336557059961315, 0.8317214700193424, 0.781431334622824, 0.8201160541586073]
0.00390625




[0.8436293436293436, 0.8667953667953668, 0.8764478764478765, 0.862934362934363, 0.8667953667953668, 0.8783783783783784, 0.8762088974854932, 0.8684719535783365, 0.8259187620889749, 0.8626692456479691]
0.001953125




[0.88996138996139, 0.9111969111969112, 0.8996138996138996, 0.9034749034749034, 0.8918918918918919, 0.9015444015444015, 0.9110251450676983, 0.90715667311412, 0.8607350096711799, 0.8974854932301741]
0.001953125
{'Feature': 'pos_dep_tags', 'level_2_ancestors': 0.915564998543722, 'p_val_level_2_ancestors': 0.001953125, 'level_3_ancestors': 0.7086517852475299, 'p_val_level_3_ancestors': 0.001953125, 'level_4_ancestors': 0.6473992367609388, 'p_val_level_4_ancestors': 0.001953125, 'level_5_ancestors': 0.6918452163133014, 'p_val_level_5_ancestors': 0.001953125, 'level_6_ancestors': 0.7753052582839817, 'p_val_level_6_ancestors': 0.001953125, 'level_7_ancestors': 0.820128749915984, 'p_val_level_7_ancestors': 0.00390625, 'level_8_ancestors': 0.8628249553781469, 'p_val_level_8_ancestors': 0.001953125, 'level_9_ancestors': 0.897408571876657, 'p_val_level_9_ancestors': 0.001953125}
node_count
[0.5096525096525096, 0.5096525096525096, 0.5096525096525096, 0.5096525096525096, 0.5096525096525096, 0.50965



[0.7818532818532818, 0.7818532818532818, 0.7818532818532818, 0.7818532818532818, 0.7818532818532818, 0.7818532818532818, 0.7833655705996132, 0.7833655705996132, 0.7833655705996132, 0.7833655705996132]
0.556640625
[0.8243243243243243, 0.8243243243243243, 0.8243243243243243, 0.8243243243243243, 0.8223938223938224, 0.8223938223938224, 0.8239845261121856, 0.8239845261121856, 0.8239845261121856, 0.8239845261121856]
0.431640625
[0.859073359073359, 0.859073359073359, 0.8571428571428571, 0.8571428571428571, 0.8571428571428571, 0.8571428571428571, 0.8588007736943907, 0.8588007736943907, 0.8588007736943907, 0.8588007736943907]
0.556640625
{'Feature': 'node_count', 'level_2_ancestors': 0.510046824940442, 'p_val_level_2_ancestors': 0.556640625, 'level_3_ancestors': 0.4213665862602033, 'p_val_level_3_ancestors': 0.001953125, 'level_4_ancestors': 0.5542896723747789, 'p_val_level_4_ancestors': 0.001953125, 'level_5_ancestors': 0.6400689304944625, 'p_val_level_5_ancestors': 0.845703125, 'level_6_ances



[0.4980694980694981, 0.4980694980694981, 0.5, 0.49034749034749037, 0.5096525096525096, 0.4942084942084942, 0.5009671179883946, 0.5183752417794971, 0.5106382978723404, 0.4932301740812379]
0.01953125
[0.4111969111969112, 0.37065637065637064, 0.3938223938223938, 0.39575289575289574, 0.4034749034749035, 0.38803088803088803, 0.390715667311412, 0.3771760154738878, 0.3655705996131528, 0.3713733075435203]
0.921875
[0.5444015444015444, 0.5444015444015444, 0.5444015444015444, 0.5444015444015444, 0.5444015444015444, 0.5444015444015444, 0.5454545454545454, 0.5435203094777563, 0.5435203094777563, 0.5435203094777563]
0.76953125
[0.640926640926641, 0.640926640926641, 0.640926640926641, 0.640926640926641, 0.640926640926641, 0.638996138996139, 0.6402321083172147, 0.6402321083172147, 0.6402321083172147, 0.6402321083172147]
0.4921875
[0.7355212355212355, 0.7335907335907336, 0.7335907335907336, 0.7335907335907336, 0.7335907335907336, 0.7335907335907336, 0.7350096711798839, 0.7350096711798839, 0.7350096711



[0.8243243243243243, 0.8243243243243243, 0.8243243243243243, 0.8243243243243243, 0.8223938223938224, 0.8223938223938224, 0.8239845261121856, 0.8239845261121856, 0.8239845261121856, 0.8239845261121856]
0.431640625
[0.859073359073359, 0.859073359073359, 0.8571428571428571, 0.8571428571428571, 0.8571428571428571, 0.8571428571428571, 0.8588007736943907, 0.8588007736943907, 0.8588007736943907, 0.8588007736943907]
0.556640625
{'Feature': 'syntactic_surprisal', 'level_2_ancestors': 0.5013558322068961, 'p_val_level_2_ancestors': 0.01953125, 'level_3_ancestors': 0.38677699528763354, 'p_val_level_3_ancestors': 0.921875, 'level_4_ancestors': 0.5442424740297082, 'p_val_level_4_ancestors': 0.76953125, 'level_5_ancestors': 0.6404557776898203, 'p_val_level_5_ancestors': 0.4921875, 'level_6_ancestors': 0.7343513588194439, 'p_val_level_6_ancestors': 0.4921875, 'level_7_ancestors': 0.7824581973518144, 'p_val_level_7_ancestors': 0.556640625, 'level_8_ancestors': 0.8238023046533686, 'p_val_level_8_ancesto



[0.7355212355212355, 0.7335907335907336, 0.7335907335907336, 0.7335907335907336, 0.7335907335907336, 0.7335907335907336, 0.7350096711798839, 0.7350096711798839, 0.7350096711798839, 0.7350096711798839]
0.4921875
[0.7818532818532818, 0.7818532818532818, 0.7818532818532818, 0.7818532818532818, 0.7818532818532818, 0.7818532818532818, 0.7833655705996132, 0.7833655705996132, 0.7833655705996132, 0.7833655705996132]
0.556640625
[0.8243243243243243, 0.8243243243243243, 0.8243243243243243, 0.8243243243243243, 0.8223938223938224, 0.8223938223938224, 0.8239845261121856, 0.8239845261121856, 0.8239845261121856, 0.8239845261121856]
0.431640625




[0.859073359073359, 0.859073359073359, 0.8571428571428571, 0.8571428571428571, 0.8571428571428571, 0.8571428571428571, 0.8588007736943907, 0.8588007736943907, 0.8588007736943907, 0.8588007736943907]
0.556640625
{'Feature': 'word_frequency', 'level_2_ancestors': 0.510046824940442, 'p_val_level_2_ancestors': 0.556640625, 'level_3_ancestors': 0.4157670104478616, 'p_val_level_3_ancestors': 0.001953125, 'level_4_ancestors': 0.5442424740297082, 'p_val_level_4_ancestors': 0.76953125, 'level_5_ancestors': 0.6404557776898203, 'p_val_level_5_ancestors': 0.4921875, 'level_6_ancestors': 0.7343513588194439, 'p_val_level_6_ancestors': 0.4921875, 'level_7_ancestors': 0.7824581973518144, 'p_val_level_7_ancestors': 0.556640625, 'level_8_ancestors': 0.8238023046533686, 'p_val_level_8_ancestors': 0.431640625, 'level_9_ancestors': 0.858192124149571, 'p_val_level_9_ancestors': 0.556640625}
word_length
[0.5096525096525096, 0.5096525096525096, 0.5096525096525096, 0.5096525096525096, 0.5096525096525096, 0.509



[0.7818532818532818, 0.7818532818532818, 0.7818532818532818, 0.7818532818532818, 0.7818532818532818, 0.7818532818532818, 0.7833655705996132, 0.7833655705996132, 0.7833655705996132, 0.7833655705996132]
0.556640625
[0.8243243243243243, 0.8243243243243243, 0.8243243243243243, 0.8243243243243243, 0.8223938223938224, 0.8223938223938224, 0.8239845261121856, 0.8239845261121856, 0.8239845261121856, 0.8239845261121856]
0.431640625
[0.859073359073359, 0.859073359073359, 0.8571428571428571, 0.8571428571428571, 0.8571428571428571, 0.8571428571428571, 0.8588007736943907, 0.8588007736943907, 0.8588007736943907, 0.8588007736943907]
0.556640625
{'Feature': 'word_length', 'level_2_ancestors': 0.510046824940442, 'p_val_level_2_ancestors': 0.556640625, 'level_3_ancestors': 0.3875574109616663, 'p_val_level_3_ancestors': 0.556640625, 'level_4_ancestors': 0.5442424740297082, 'p_val_level_4_ancestors': 0.76953125, 'level_5_ancestors': 0.6404557776898203, 'p_val_level_5_ancestors': 0.4921875, 'level_6_ancesto



[0.5231660231660231, 0.5, 0.5096525096525096, 0.5096525096525096, 0.5135135135135135, 0.5231660231660231, 0.5106382978723404, 0.5106382978723404, 0.5106382978723404, 0.5106382978723404]
0.10546875
[0.4420849420849421, 0.4266409266409266, 0.4826254826254826, 0.46332046332046334, 0.4691119691119691, 0.4420849420849421, 0.4874274661508704, 0.46034816247582205, 0.42359767891682787, 0.4332688588007737]
0.001953125
[0.5579150579150579, 0.5984555984555985, 0.6061776061776062, 0.6042471042471043, 0.611969111969112, 0.6081081081081081, 0.6131528046421664, 0.5996131528046421, 0.5783365570599613, 0.5938104448742747]
0.001953125
[0.6525096525096525, 0.6776061776061776, 0.693050193050193, 0.6756756756756757, 0.6872586872586872, 0.6814671814671814, 0.688588007736944, 0.7001934235976789, 0.655705996131528, 0.6789168278529981]
0.001953125




[0.7548262548262549, 0.7722007722007722, 0.7760617760617761, 0.7818532818532818, 0.7702702702702703, 0.7857142857142857, 0.7756286266924565, 0.781431334622824, 0.7330754352030948, 0.7736943907156673]
0.00390625
[0.7953667953667953, 0.8262548262548263, 0.8301158301158301, 0.833976833976834, 0.8166023166023166, 0.832046332046332, 0.8355899419729207, 0.8317214700193424, 0.7756286266924565, 0.8220502901353965]
0.00390625




[0.8416988416988417, 0.8667953667953668, 0.8783783783783784, 0.8648648648648649, 0.8667953667953668, 0.8783783783783784, 0.8762088974854932, 0.8684719535783365, 0.8181818181818182, 0.8626692456479691]
0.00390625
[0.888030888030888, 0.9111969111969112, 0.8996138996138996, 0.9054054054054054, 0.8918918918918919, 0.9015444015444015, 0.9110251450676983, 0.9090909090909091, 0.8607350096711799, 0.8974854932301741]
0.001953125
{'Feature': 'all_complexity_metrics', 'level_2_ancestors': 0.512170377063994, 'p_val_level_2_ancestors': 0.10546875, 'level_3_ancestors': 0.45305108922130194, 'p_val_level_3_ancestors': 0.001953125, 'level_4_ancestors': 0.5971785546253631, 'p_val_level_4_ancestors': 0.001953125, 'level_5_ancestors': 0.6790971822886716, 'p_val_level_5_ancestors': 0.001953125, 'level_6_ancestors': 0.7704756428160684, 'p_val_level_6_ancestors': 0.00390625, 'level_7_ancestors': 0.8199353263183051, 'p_val_level_7_ancestors': 0.00390625, 'level_8_ancestors': 0.8622443111804815, 'p_val_level_8



[0.5135135135135135, 0.5231660231660231, 0.5347490347490348, 0.5328185328185329, 0.5386100386100386, 0.5077220077220077, 0.5319148936170213, 0.504835589941973, 0.5241779497098646, 0.5164410058027079]
0.013671875
[0.3861003861003861, 0.46525096525096526, 0.44015444015444016, 0.4498069498069498, 0.46332046332046334, 0.40733590733590735, 0.5299806576402321, 0.44294003868471954, 0.4874274661508704, 0.44874274661508706]
0.00390625
[0.5617760617760618, 0.5945945945945946, 0.5888030888030888, 0.6023166023166023, 0.5965250965250966, 0.5965250965250966, 0.6150870406189555, 0.5647969052224371, 0.6034816247582205, 0.5783365570599613]
0.001953125
[0.6447876447876448, 0.6698841698841699, 0.693050193050193, 0.6737451737451737, 0.6814671814671814, 0.6814671814671814, 0.6963249516441006, 0.7001934235976789, 0.6479690522243714, 0.6789168278529981]
0.001953125




[0.7625482625482626, 0.777992277992278, 0.777992277992278, 0.7818532818532818, 0.7702702702702703, 0.7857142857142857, 0.781431334622824, 0.7833655705996132, 0.7427466150870407, 0.7678916827852998]
0.001953125




[0.8011583011583011, 0.8262548262548263, 0.832046332046332, 0.832046332046332, 0.8166023166023166, 0.832046332046332, 0.8355899419729207, 0.8336557059961315, 0.7872340425531915, 0.816247582205029]
0.001953125




[0.8436293436293436, 0.8667953667953668, 0.8783783783783784, 0.862934362934363, 0.8667953667953668, 0.8783783783783784, 0.8762088974854932, 0.8704061895551257, 0.8259187620889749, 0.8568665377176016]
0.001953125




[0.88996138996139, 0.9111969111969112, 0.8996138996138996, 0.9034749034749034, 0.8918918918918919, 0.9015444015444015, 0.9110251450676983, 0.9090909090909091, 0.8607350096711799, 0.8916827852998066]
0.001953125
{'Feature': 'incremental_bert_embeddings_layer12_PCA_dims_15', 'level_2_ancestors': 0.5227948589650717, 'p_val_level_2_ancestors': 0.013671875, 'level_3_ancestors': 0.45210600210600205, 'p_val_level_3_ancestors': 0.00390625, 'level_4_ancestors': 0.5902242668200115, 'p_val_level_4_ancestors': 0.001953125, 'level_5_ancestors': 0.6767805799720692, 'p_val_level_5_ancestors': 0.001953125, 'level_6_ancestors': 0.7731805859465435, 'p_val_level_6_ancestors': 0.001953125, 'level_7_ancestors': 0.8212881712881714, 'p_val_level_7_ancestors': 0.001953125, 'level_8_ancestors': 0.8626311583758393, 'p_val_level_8_ancestors': 0.001953125, 'level_9_ancestors': 0.8970217246812991, 'p_val_level_9_ancestors': 0.001953125}


In [4]:
# Persist the accumulated per-feature result dicts (accuracies and p-values).
# `results` is a list of dicts, so the file has to be read back with
# allow_pickle=True, as the next cell does.
np.save("syntactic_information_analysis_results_ridgeCV.npy", results)

In [5]:
# Reload the saved results and express every accuracy column (keys starting
# with "level") as a percentage rounded to two decimals; "Feature" and the
# "p_val_*" entries pass through untouched. The formatted records are written
# both as a .npy file and as a CSV table.
loaded_rows = np.load("syntactic_information_analysis_results_ridgeCV.npy", allow_pickle = True).tolist()
results = [
    {
        key: (np.round(value * 100, 2) if key.startswith("level") else value)
        for key, value in row.items()
    }
    for row in loaded_rows
]

np.save("syntactic_information_analysis_results_ridgeCV_formatted", results)

df = pd.DataFrame(results)
df.to_csv("final_syntactic_information_analysis_ridgeCV_results.csv")

In [None]:
# generate table for label distribution
#
# For every ancestor level, count how often each class index occurs in the
# target vector and report both the raw count and its share of all samples.
# Relies on `all_y` and `y_labels` built by the earlier probing cells.
dist = []
label_names = {0: "noun_phrase", 1: "verb_phrase", 2: "adverb_phrase", 3: "adjective_phrase", 4: "prepositional_phrase", 5: "clause", 6: "other"}

for level_label, labels in zip(y_labels, all_y):
    d = {"level": level_label}
    total = labels.shape[0]
    # Iterate over the mapping itself instead of a hard-coded range(7) so the
    # table always covers exactly the labels that have a name.
    for lab in sorted(label_names):
        name = label_names[lab]
        su = (labels == lab).sum()
        d[name] = su
        # The column is labelled "(%)", so store a real percentage (0-100)
        # rather than a 0-1 fraction.
        d[name + "(%)"] = 100.0 * su / total
    dist.append(d)

dist_df = pd.DataFrame(dist)
dist_df