In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import sys
sys.path.append('../src')
from utils import *

In [2]:
# Load the cleaned data, drop exact duplicate rows, and keep only rows whose
# classifier id is smaller than the id stored in `clf_gov2_id`
# (presumably: the classifier precedes its head noun — TODO confirm).
df = (
    pd.read_csv('../data/cleaned_data.csv', index_col=[0])
    .drop_duplicates()
    .loc[lambda d: d['clf_id'].astype('int') < d['clf_gov2_id'].astype('int')]
)

# Split on adjacency: ids that differ by exactly one go to `df_clf_noun`,
# every other remaining row goes to `df_clf_mod_noun`
# (presumably "classifier + noun" vs. "classifier + modifier(s) + noun").
# reset_index() keeps the previous index as a regular column.
df_clf_noun = df.loc[lambda d: d['clf_gov2_id'] - 1 == d['clf_id']].reset_index()
df_clf_mod_noun = df.loc[lambda d: d['clf_gov2_id'] - 1 != d['clf_id']].reset_index()

#### frequency comparisons between two structures

In [4]:
# Noun -> frequency lookup used to annotate the nouns of each structure.
# NOTE: unpickling can execute arbitrary code; only load this file when it
# comes from a trusted source.
nounFreq = pd.read_pickle("../data/leipzig.noun.pkl" )

# Noun *types* (each distinct form once) attested in each structure.
nouns1 = df_clf_noun['clf_gov2_form'].unique()
nouns2 = df_clf_mod_noun['clf_gov2_form'].unique()


def _noun_freq_table(nouns, freq_lookup, structure):
    """Build a one-row-per-noun frequency table for one structure.

    Parameters
    ----------
    nouns : iterable
        Distinct noun forms.
    freq_lookup : mapping, Series or callable
        Anything accepted by ``Series.map``; nouns it does not cover get NaN.
    structure : str
        Label stored in the ``type`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``noun``, ``freq``, ``log_freq`` (natural log of ``freq``)
        and ``type``.
    """
    table = pd.DataFrame(list(nouns), columns=['noun'])
    table['freq'] = table.noun.map(freq_lookup)
    table['log_freq'] = np.log(table['freq'])
    table['type'] = structure
    return table


df1 = _noun_freq_table(nouns1, nounFreq, 'clf_noun')
df2 = _noun_freq_table(nouns2, nounFreq, 'clf_mod_noun')

# Nouns missing from the lookup carry NaN, which would make f_oneway return
# NaN for both statistics; exclude them explicitly and report how many.
freq1 = df1['freq'].dropna()
freq2 = df2['freq'].dropna()
n_missing = (len(df1) - len(freq1)) + (len(df2) - len(freq2))
if n_missing:
    print(f"Excluded {n_missing} noun(s) with no entry in the frequency table.")

# The test runs on the raw frequencies; `log_freq` is computed but not used.
# NOTE(review): if the frequencies are heavily skewed, testing `log_freq`
# may be more appropriate — confirm the intended variable.
f_value, p_value = stats.f_oneway(freq1.values, freq2.values)

# NOTE(review): halving the ANOVA p-value only corresponds to a one-sided
# two-group test when the observed difference lies in the hypothesised
# direction — confirm that this is the case before reporting it.
p_value_one_sided = p_value / 2
print(f"The f-value is {round(f_value,2)} and the p-value is {round(p_value_one_sided,3)}.")

The f-value is 3.69 and the p-value is 0.027.


#### mutual information (MI) comparisons between two structures

In [10]:
# --- Noun tokens (repeats kept) observed in each structure -----------------
nouns3 = list(df_clf_noun['clf_gov2_form'])
nouns4 = list(df_clf_mod_noun['clf_gov2_form'])

# --- Per-classifier noun counts: {clf_form: Counter({noun_form: count})} ---
nouns3_clf_counter = {
    clf: Counter(list(noun_forms))
    for clf, noun_forms in df_clf_noun.groupby('clf_form')['clf_gov2_form']
}
nouns4_clf_counter = {
    clf: Counter(list(noun_forms))
    for clf, noun_forms in df_clf_mod_noun.groupby('clf_form')['clf_gov2_form']
}

# --- Entropies (helpers come from utils) ------------------------------------
# `entropy` is applied to the full noun list of a structure; `cond_entropy`
# is applied to the per-classifier counters and returns a mapping whose
# values are used below (presumably one conditional entropy per classifier —
# TODO confirm against utils).
nouns3_ent = entropy(nouns3)
nouns4_ent = entropy(nouns4)

clf_noun_cond_ent = cond_entropy(nouns3_clf_counter)
clf_mod_noun_cond_ent = cond_entropy(nouns4_clf_counter)

clf_noun_cond_ent_val = list(clf_noun_cond_ent.values())
clf_mod_noun_cond_ent_val = list(clf_mod_noun_cond_ent.values())

# --- MI per entry: structure-level noun entropy minus conditional entropy ---
clf_noun_mi = [nouns3_ent - h_cond for h_cond in clf_noun_cond_ent_val]
clf_mod_noun_mi = [nouns4_ent - h_cond for h_cond in clf_mod_noun_cond_ent_val]

In [26]:
# One-way ANOVA comparing the MI values of the two structures.
f_value, p_value = stats.f_oneway(clf_noun_mi,clf_mod_noun_mi)
# The p-value is printed with three significant digits: rounding it to three
# decimals displayed very small p-values as "0.0".
print(f'The f_value of ANOVA is {round(f_value,2)}, and the p_value is {p_value:.3g}')
print(f'The mean MI from the clf_noun structure is {round(np.mean(clf_noun_mi),2)}, and that from clf_mod_noun structure is {round(np.mean(clf_mod_noun_mi),2)}.')

The f_value of ANOVA us 29.42, and the p_value is 0.0
The mean MI from the clf_noun structure is 7.91,and that from clf_mod_noun structure is 6.79.
