In [1]:
import pickle 
import pandas as pd
import numpy as np
from itertools import combinations
from sklearn.utils import shuffle
import sys
sys.path.append('../src')
from utils import *

#### Import data

In [3]:
# Read the cleaned data, drop duplicate rows, and keep only the rows whose
# clf_id (cast to int) is strictly smaller than clf_gov2_id (cast to int).
df = (
    pd.read_csv('../data/cleaned_data.csv', index_col=[0])
    .drop_duplicates()
    .loc[lambda frame: frame['clf_id'].astype('int') < frame['clf_gov2_id'].astype('int')]
)
print(df.shape)

# Count the distinct values of clf_gov2_form (a missing value, if any, counts as one).
unique_num_noun = df['clf_gov2_form'].nunique(dropna=False)
print(f"The number of unique nouns is {unique_num_noun}.")

(760575, 11)
The number of unique nouns is 44620.


In [4]:
# Partition df into the two structures by comparing clf_id with clf_gov2_id - 1:
#   df1 (clf_noun_structure)     -> rows where clf_id equals clf_gov2_id - 1
#   df2 (clf_mod_noun_structure) -> rows where it does not
# reset_index() keeps the previous index as a regular column in both frames.
df1 = df.loc[df['clf_id'].eq(df['clf_gov2_id'] - 1)].reset_index()
df2 = df.loc[df['clf_id'].ne(df['clf_gov2_id'] - 1)].reset_index()

#### frequency comparison

#### conditional entropy comparison

#### Filter data

In [5]:
# import the freq information
# Unpickles ../data/leipzig.noun.pkl into `nounFreq`, which is later passed to
# Series.map to look up a frequency value for each noun.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so this
# file must come from a trusted source.
with open("../data/leipzig.noun.pkl",'rb') as file:
    nounFreq = pickle.load(file)

In [6]:
# create dataframe of modified nouns for two scenarios: clf_noun_structure and clf_mod_noun_structure
# Each frame has one row per distinct clf_gov2_form value ('noun') plus the value
# that nounFreq maps it to ('freq').
# NOTE(review): if nounFreq is a plain dict, nouns missing from it get NaN here and
# are removed by the filter below, because NaN > 25 is False — confirm that is intended.
df1_nounFreq = pd.DataFrame(list(df1.clf_gov2_form.unique()), columns=['noun'])
df1_nounFreq['freq'] = df1_nounFreq['noun'].map(nounFreq)
df2_nounFreq = pd.DataFrame(list(df2.clf_gov2_form.unique()), columns=['noun'])
df2_nounFreq['freq'] = df2_nounFreq['noun'].map(nounFreq)

# remove nouns that are less than or equal to 25 in frequency
# .copy() makes each filtered result an independent frame: new columns are assigned
# to these frames afterwards, and assigning into a boolean-filtered selection
# without a copy is the pattern that raises pandas' SettingWithCopyWarning.
df1_nounFreq = df1_nounFreq[df1_nounFreq['freq'] > 25].copy()
df2_nounFreq = df2_nounFreq[df2_nounFreq['freq'] > 25].copy()

#### Sample nouns based on their frequency bins

In [7]:
def _bin_and_sample_nouns(noun_freq, n_bins=30, frac=0.01, seed=1):
    """Add log-frequency bins to a noun/frequency frame and sample within each bin.

    Parameters
    ----------
    noun_freq : pandas.DataFrame
        Frame with a numeric 'freq' column.
    n_bins : int
        Number of equal-width bins that pd.cut builds over log(freq).
    frac : float
        Fraction of each bin's rows to draw, without replacement.
    seed : int
        random_state handed to every per-bin sample() call.

    Returns
    -------
    (binned, sample) : tuple of pandas.DataFrame
        `binned` is a new frame equal to `noun_freq` plus the 'log_freq' and 'bins'
        columns; `sample` holds the rows drawn from every bin, with a fresh
        0..n-1 index.
    """
    # assign() returns a new frame, so the caller's frame (possibly a filtered
    # selection of another frame) is never written to in place.
    binned = noun_freq.assign(log_freq=np.log(noun_freq['freq']))
    binned['bins'] = pd.cut(binned['log_freq'], bins=n_bins)

    # NOTE(review): sample(frac=...) presumably rounds frac * group size, so bins
    # with few nouns may contribute no rows at all — confirm this is acceptable.
    sample = (
        binned.groupby('bins')
        .apply(lambda grp: grp.sample(frac=frac, replace=False, random_state=seed))
        .reset_index(drop=True)
    )
    return binned, sample


# Same binning and sampling settings for both structures.
df1_nounFreq, df1_sample = _bin_and_sample_nouns(df1_nounFreq)

df2_nounFreq, df2_sample = _bin_and_sample_nouns(df2_nounFreq)

#### Generate noun pairs
##### clf_noun structure

In [8]:
# Every unordered pair of sampled nouns for the clf_noun structure.
noun_ls1 = df1_sample['noun']
pair_ls1 = list(combinations(noun_ls1, 2))

# Name the columns at construction so they exist even when there are no pairs.
pair_df1 = pd.DataFrame(pair_ls1, columns=['noun1', 'noun2'])
a_dict1 = dict(zip(df1_sample['noun'], df1_sample['freq']))  # noun -> freq lookup

# Attach each noun's frequency and its natural log.
pair_df1['noun1_freq'] = pair_df1['noun1'].map(a_dict1)
pair_df1['noun2_freq'] = pair_df1['noun2'].map(a_dict1)

pair_df1['noun1_log'] = np.log(pair_df1['noun1_freq'])
pair_df1['noun2_log'] = np.log(pair_df1['noun2_freq'])

# Seed the shuffle so the row order handed to the next step is reproducible
# across kernel restarts; 1 is the same seed the noun sampling uses.
pair_df1 = shuffle(pair_df1, random_state=1).reset_index(drop=True)

# NOTE(review): balancedFreq_n1_n2 is not defined in this notebook (presumably it
# comes from `utils`); what it does to pair_df1 cannot be seen from here.
pair_df1 = balancedFreq_n1_n2(pair_df1)

##### clf_mod_noun structure

In [9]:
# Every unordered pair of sampled nouns for the clf_mod_noun structure.
noun_ls2 = df2_sample['noun']
pair_ls2 = list(combinations(noun_ls2, 2))

# Name the columns at construction so they exist even when there are no pairs.
pair_df2 = pd.DataFrame(pair_ls2, columns=['noun1', 'noun2'])
a_dict2 = dict(zip(df2_sample['noun'], df2_sample['freq']))  # noun -> freq lookup

# Attach each noun's frequency and its natural log.
pair_df2['noun1_freq'] = pair_df2['noun1'].map(a_dict2)
pair_df2['noun2_freq'] = pair_df2['noun2'].map(a_dict2)

pair_df2['noun1_log'] = np.log(pair_df2['noun1_freq'])
pair_df2['noun2_log'] = np.log(pair_df2['noun2_freq'])

# Seed the shuffle so the row order handed to the next step is reproducible
# across kernel restarts; 1 is the same seed the noun sampling uses.
pair_df2 = shuffle(pair_df2, random_state=1).reset_index(drop=True)

# NOTE(review): balancedFreq_n1_n2 is not defined in this notebook (presumably it
# comes from `utils`); what it does to pair_df2 cannot be seen from here.
pair_df2 = balancedFreq_n1_n2(pair_df2)

#### similarities 

In [10]:
# Replace each pair frame with the result of similarity(), called with the pair
# frame and the matching Series of sampled nouns.
# NOTE(review): similarity is not defined in this notebook (presumably it comes
# from `utils`). The recorded output shows a warning fragment pointing at a
# `uv / np.sqrt(uu * vv)` expression, which may indicate a zero-norm vector —
# confirm and check the result for NaN values.
pair_df1 = similarity(pair_df1,noun_ls1)
pair_df2 = similarity(pair_df2,noun_ls2)

  dist = 1.0 - uv / np.sqrt(uu * vv)


#### PMI
##### clf_noun structure

##### clf_mod_noun structure

#### class membership

In [25]:
# Replace each pair frame with the result of class_mem_calculator(), called with
# the pair frame, the full filtered frame `df` (not df1/df2), and the matching
# Series of sampled nouns.
# NOTE(review): class_mem_calculator is not defined in this notebook (presumably
# it comes from `utils`). This cell's execution count jumps from 10 to 25, so
# verify it still works after Restart Kernel -> Run All.
pair_df1 = class_mem_calculator(pair_df1,df,noun_ls1)
pair_df2 = class_mem_calculator(pair_df2,df,noun_ls2)