In [1]:
import pickle
from tqdm import tqdm
import pandas as pd
import numpy as np
from params import DECADES, WORDS_FOLDER, COHA_FREQ_FILE, DECADES_INDS, MIN_FREQ, MIN_LENGTH, HAMILTON_SGNS_FOLDER, SGNS_FOLDER
from postags import replace_posTag

In [2]:
word_list = pd.read_csv(f'{WORDS_FOLDER}/{COHA_FREQ_FILE}', skip_blank_lines=True, encoding='utf-8', delimiter='\t', dtype={'freq':'int64', 'word-cs':'str','PoS':'str','decade':'int8'})
word_list = word_list.rename({"word-cs":"word"},axis='columns')

In [3]:
len(word_list)

3172840

In [4]:
# Remove nan
word_list = word_list.loc[word_list['word'].isnull() == False]
word_list = word_list.loc[word_list['PoS'].isnull() == False]
# remove useless decades
word_list = word_list.loc[word_list['decade'].isin(DECADES_INDS)]

In [5]:
len(word_list)

2047597

In [6]:
print('Creating targets list...')
targets = word_list.copy()
# Remove words that are not from the selected PoS
print('Merging PoS tag and filtering...')
targets['PoS'] = list(map(replace_posTag,targets['PoS']))
targets = targets.loc[targets['PoS'].isnull() == False] # Remove none
targets = targets.groupby(['word','PoS','decade'])['freq'].sum().reset_index() # Merge duplicate
print('Nb of unique (word,PoS) after PoS selection : ', len(targets.groupby(['word','PoS'])))

# Remove words that are too small
print('Filtering with word length...')
targets = targets.loc[targets['word'].str.len() >= MIN_LENGTH]
print('Nb of unique (word,PoS) after Minimum Length : ', len(targets.groupby(['word','PoS'])))

# Remove words that are not frequent enough in each decade
print('Filtering with minimum frequency per decade...')
to_keep = []
wordpos_groups = targets.groupby(['word','PoS'])
for wordpos, group in tqdm(wordpos_groups, desc='Filtering candidate (word,pos)'):
    if group['freq'].min()>= MIN_FREQ and len(group)==len(DECADES_INDS):
        to_keep += list(group.index)
targets = targets.loc[to_keep]
print('Nb of unique (word,PoS) after Minimum Freq : ', len(targets.groupby(['word','PoS'])))

# targetfname = f"{WORDS_FOLDER}/{TRGT_FILE}"
# targets.to_csv(targetfname, index=False, sep='\t')
# print('Targets list created as '+targetfname)
print('Final number of targets (unique (word, pos) ) : ', len(targets.groupby(['word','PoS'])))
print('Nb of targets per pos : ')
for pos, group in targets.groupby('PoS'):
    print(pos, ':', len(group.groupby('word')) )

Creating targets list...
Merging PoS tag and filtering...
Nb of unique (word,PoS) after PoS selection :  323847
Filtering with word length...
Nb of unique (word,PoS) after Minimum Length :  323252
Filtering with minimum frequency per decade...


Filtering candidate (word,pos): 100%|██████████| 323252/323252 [00:16<00:00, 19121.91it/s]


Nb of unique (word,PoS) after Minimum Freq :  33890
Final number of targets (unique (word, pos) ) :  33890
Nb of targets per pos : 
ADJ : 7130
N : 16571
V : 10189


In [7]:
targets['HamiltonSGNS_ind'] = -1

In [8]:
targets

Unnamed: 0,word,PoS,decade,freq,HamiltonSGNS_ind
726,a-goin,V,9,86,-1
727,a-goin,V,10,120,-1
728,a-goin,V,11,75,-1
729,a-goin,V,12,171,-1
730,a-goin,V,13,45,-1
...,...,...,...,...,...
1455943,zulus,N,15,16,-1
1455944,zulus,N,16,6,-1
1455945,zulus,N,17,11,-1
1455946,zulus,N,18,21,-1


In [9]:
vocab2ind_per_dec = dict()
for dec_ind,dec in zip(DECADES_INDS,DECADES):
    with open(f'{HAMILTON_SGNS_FOLDER}/{dec}-vocab.pkl','rb') as f_pkl:
        vocab = pickle.load(f_pkl)
    vocab2ind_per_dec[dec_ind] = dict(zip(vocab,range(len(vocab))))

In [10]:
for row_i, row in tqdm(targets.iterrows()):
    try : 
        targets.loc[row_i,'HamiltonSGNS_ind'] = vocab2ind_per_dec[row.decade][row.word]
    except KeyError as exc:
        pass

372790it [00:25, 14356.34it/s]


In [11]:
targets = targets[targets.HamiltonSGNS_ind >= 0]

# Append the relative frequency column ( freq / decade_total )
decade_sum = targets.groupby(['decade'])['freq'].sum()
targets['rel_freq'] = [ row['freq'] / decade_sum[row['decade']] for _, row in tqdm(targets.iterrows(), desc='Computing relative freq') ]


targets

Computing relative freq: 359271it [00:09, 38301.69it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  targets['rel_freq'] = [ row['freq'] / decade_sum[row['decade']] for _, row in tqdm(targets.iterrows(), desc='Computing relative freq') ]


Unnamed: 0,word,PoS,decade,freq,HamiltonSGNS_ind,rel_freq
2525,abandon,N,9,38,3937,5.250624e-06
2526,abandon,N,10,63,4045,8.118498e-06
2527,abandon,N,11,57,4407,7.150863e-06
2528,abandon,N,12,72,4505,7.892489e-06
2529,abandon,N,13,56,4433,6.357692e-06
...,...,...,...,...,...,...
1455943,zulus,N,15,16,36343,1.820371e-06
1455944,zulus,N,16,6,33160,6.998263e-07
1455945,zulus,N,17,11,37980,1.297198e-06
1455946,zulus,N,18,21,46611,2.321766e-06


In [12]:
for pos, group in targets.groupby('PoS'):
    print(pos, ':', len(group.groupby('word')) )

ADJ : 6453
N : 16135
V : 10073


In [20]:
targets.to_csv(f'{WORDS_FOLDER}/target_words.csv', index=False, sep='\t')

In [13]:
ind_to_retrieve_per_pos_per_dec = dict()
word_lists_per_pos = dict()

pos_groups = targets.groupby('PoS')

for pos in pos_groups.groups.keys():
    ind_to_retrieve_per_dec = dict()
    
    dec_groups = pos_groups.get_group(pos).groupby('decade')
    word_lists_per_pos[pos] = list(dec_groups.get_group(DECADES_INDS[0]).sort_values(['word'])['word'].values)

    for dec in DECADES_INDS:
        ind_to_retrieve_per_dec[dec] = dec_groups.get_group(dec).sort_values(['word'])['HamiltonSGNS_ind'].values
    
    ind_to_retrieve_per_pos_per_dec[pos] = ind_to_retrieve_per_dec


In [17]:
for dec_i, dec in tqdm(zip(DECADES_INDS, DECADES),total=len(DECADES),desc='Generating matrices per decade'):
    mat = np.load(f'{HAMILTON_SGNS_FOLDER}/{dec}-w.npy')
    for pos, ind_dec_dict in ind_to_retrieve_per_pos_per_dec.items():
        indices = ind_dec_dict[dec_i]
        pos_mat = mat[indices]
        np.save(file=f'{SGNS_FOLDER}/{dec}_{pos}.npy',arr=pos_mat)

Generating matrices per decade: 100%|██████████| 11/11 [00:01<00:00, 10.03it/s]


In [20]:
for pos, word_list in word_lists_per_pos.items():
    with open(f'{WORDS_FOLDER}/{pos}_list.pkl','wb') as f:
        pickle.dump(file=f,obj=word_list)
    print(pos,':',len(word_list))

ADJ : 6453
N : 16135
V : 10073
