In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

# Targets

In [2]:
words_list = pd.read_csv('./list-words-COHA.txt', skip_blank_lines=True, encoding='windows-1252', delimiter='\t', dtype={'freq':'int64', 'word-cs':'str','PoS':'str','decade':'int8'})

In [3]:
words_list

Unnamed: 0,freq,word-cs,PoS,decade
0,20193,a,at1,1
1,128037,a,at1,2
2,275630,a,at1,3
3,323696,a,at1,4
4,330602,a,at1,5
...,...,...,...,...
3172835,1,zzzz,uh,14
3172836,1,zzzz,uh,19
3172837,3,zzzz,uh,20
3172838,1,zzzzzz,np1,18


In [4]:
# We want to keep decades 9 to 19
word_groups = words_list.groupby('word-cs')
index_to_keep = []
for word in tqdm(word_groups.groups):
    group = word_groups.get_group(word)
    group = group[ (group.decade > 8) & (group.decade < 20)]
    if len(group)>=11:
        subgroups = group.groupby('PoS')
        for pos in subgroups.groups:
            subgroup = subgroups.get_group(pos)
            if len(subgroup) == 11:
                index_to_keep += [i for i in subgroup.index]

100%|██████████| 401208/401208 [08:39<00:00, 772.85it/s]


In [5]:
words_list = words_list.loc[index_to_keep]

In [6]:
words_list.head(22)

Unnamed: 0,freq,word-cs,PoS,decade
8,417184,a,at1,9
9,455292,a,at1,10
10,470853,a,at1,11
11,544956,a,at1,12
12,517510,a,at1,13
13,517316,a,at1,14
14,538866,a,at1,15
15,531264,a,at1,16
16,532326,a,at1,17
17,567437,a,at1,18


In [9]:
# Following CLAWS7 tags set
existing_pos = list(words_list.value_counts('PoS').index)

In [14]:
# common nouns only, no proper nouns
noun_tags = [ 'nd1', 'nn', 'nn1', 'nn2', 'nna', 'nnb', 'nnl1', 'nnl2', 
            'nno', 'nno2', 'nnt1', 'nnt2', 'nnu', 'nnu1', 'nnu2' ]
# adj tags
adj_tags = [ 'jj', 'jjr', 'jjt', 'jk' ]

# be, has, do and modals are excluded => only lexical verbs
verb_tags = [ 'vv0', 'vvd', 'vvg', 'vvgk' , 'vvi', 'vvn', 'vvnk' , 'vvz']

target_tags = noun_tags + adj_tags + verb_tags

In [76]:
target_list = words_list[words_list.PoS.isin(target_tags)] # Nouns, adjs or verbs
target_list = target_list[ target_list['word-cs'].str.len() > 1 ] # More than one letter
target_list

Unnamed: 0,freq,word-cs,PoS,decade
19721,26,a-comin,vvg,9
19722,31,a-comin,vvg,10
19723,26,a-comin,vvg,11
19724,31,a-comin,vvg,12
19725,21,a-comin,vvg,13
...,...,...,...,...
3172226,16,zulus,nn2,15
3172227,6,zulus,nn2,16
3172228,11,zulus,nn2,17
3172229,21,zulus,nn2,18


In [77]:
mean_freq_threshold = 1

target_groups = target_list.groupby('word-cs')
targets_to_keep = []
for candidate, indices in tqdm(target_groups.groups.items()):
    if target_groups.get_group(candidate).freq.agg(['mean'])['mean'] > mean_freq_threshold:
        targets_to_keep += list(indices)

100%|██████████| 38574/38574 [00:39<00:00, 988.65it/s] 


In [78]:
target_list = target_list.loc[targets_to_keep]
target_list

Unnamed: 0,freq,word-cs,PoS,decade
19721,26,a-comin,vvg,9
19722,31,a-comin,vvg,10
19723,26,a-comin,vvg,11
19724,31,a-comin,vvg,12
19725,21,a-comin,vvg,13
...,...,...,...,...
3172226,16,zulus,nn2,15
3172227,6,zulus,nn2,16
3172228,11,zulus,nn2,17
3172229,21,zulus,nn2,18


In [79]:
target_list.loc[ (target_list['PoS'].isin(noun_tags)), 'PoS' ]= 'N'
target_list.loc[ (target_list['PoS'].isin(adj_tags)), 'PoS' ]= 'A'
target_list.loc[ (target_list['PoS'].isin(verb_tags)), 'PoS' ]= 'V'
target_list

Unnamed: 0,freq,word-cs,PoS,decade
19721,26,a-comin,V,9
19722,31,a-comin,V,10
19723,26,a-comin,V,11
19724,31,a-comin,V,12
19725,21,a-comin,V,13
...,...,...,...,...
3172226,16,zulus,N,15
3172227,6,zulus,N,16
3172228,11,zulus,N,17
3172229,21,zulus,N,18


In [80]:
target_list.value_counts('PoS')//11

PoS
N    23016
V    17877
A     9892
dtype: int64

In [81]:
decades = np.array(range(1890,2000,10))
target_list.decade = decades[target_list['decade']-9]
target_list

Unnamed: 0,freq,word-cs,PoS,decade
19721,26,a-comin,V,1890
19722,31,a-comin,V,1900
19723,26,a-comin,V,1910
19724,31,a-comin,V,1920
19725,21,a-comin,V,1930
...,...,...,...,...
3172226,16,zulus,N,1950
3172227,6,zulus,N,1960
3172228,11,zulus,N,1970
3172229,21,zulus,N,1980


In [86]:
target_list['Total'] = target_list.groupby(['word-cs', 'PoS', 'decade'])['freq'].transform('sum')
target_list = target_list.drop_duplicates(subset=['word-cs', 'PoS', 'decade'])
target_list

Unnamed: 0,freq,word-cs,PoS,decade,Total
19721,26,a-comin,V,1890,26
19722,31,a-comin,V,1900,31
19723,26,a-comin,V,1910,26
19724,31,a-comin,V,1920,31
19725,21,a-comin,V,1930,21
...,...,...,...,...,...
3172226,16,zulus,N,1950,16
3172227,6,zulus,N,1960,6
3172228,11,zulus,N,1970,11
3172229,21,zulus,N,1980,21


In [3]:
target_list = pd.read_csv('candidates_target_list.csv',delimiter='\t')
target_list

Unnamed: 0,freq,word-cs,PoS,decade,Total
0,26,a-comin,V,1890,26
1,31,a-comin,V,1900,31
2,26,a-comin,V,1910,26
3,31,a-comin,V,1920,31
4,21,a-comin,V,1930,21
...,...,...,...,...,...
499450,16,zulus,N,1950,16
499451,6,zulus,N,1960,6
499452,11,zulus,N,1970,11
499453,21,zulus,N,1980,21


In [25]:
target_groups = target_list.groupby('word-cs')
for word, indices in tqdm(target_groups.groups.items()):
    group = target_groups.get_group(word)
    pos_groups = group.groupby('PoS')
    if len(pos_groups.groups)>1:
        # More than 1 PoS Tag
        medians = pos_groups.freq.agg(['median'])
        max_tag = medians.index[np.argmax(medians)]
        target_list.loc[ indices, 'PoS' ] = max_tag

target_list['Total'] = target_list.groupby(['word-cs', 'PoS', 'decade'])['freq'].transform('sum')
target_list = target_list.drop_duplicates(subset=['word-cs', 'PoS', 'decade'])
target_list

100%|██████████| 38574/38574 [01:32<00:00, 416.03it/s]


Unnamed: 0,freq,word-cs,PoS,decade,Total
0,26,a-comin,V,1890,26
1,31,a-comin,V,1900,31
2,26,a-comin,V,1910,26
3,31,a-comin,V,1920,31
4,21,a-comin,V,1930,21
...,...,...,...,...,...
499450,16,zulus,N,1950,16
499451,6,zulus,N,1960,6
499452,11,zulus,N,1970,11
499453,21,zulus,N,1980,21


In [29]:
target_list.drop(['Total'],axis=1).to_csv('./candidates_target_list.csv', index=False,sep='\t')

In [31]:
target_list.value_counts('PoS')//11

PoS
N    21208
V     9250
A     8116
dtype: int64

# Contexts

In [4]:
words_list = pd.read_csv('./list-words-COHA.txt', skip_blank_lines=True, encoding='windows-1252', delimiter='\t', dtype={'freq':'int64', 'word-cs':'str','PoS':'str','decade':'int8'})

In [10]:
# We want to keep decades 9 to 19
decade_groups = words_list.groupby('decade')
dec1890 = decade_groups.get_group(9)
dec1990 = decade_groups.get_group(19)

freqs_1890 = dec1890.groupby('word-cs').freq.agg(['sum']).sort_values('sum',ascending=False)
freqs_1990 = dec1990.groupby('word-cs').freq.agg(['sum']).sort_values('sum',ascending=False)

In [30]:
i = 3000
contexts = set(freqs_1890.index[:i]) | set(freqs_1990.index[:i])

while len(contexts) < 5000:
    i += 1
    contexts = set(freqs_1890.index[:i]) | set(freqs_1990.index[:i])
print('Final number of items per decade : ',i)
print('Final number of contexts :',len(contexts))
print('Intersection size : ', len(set(freqs_1890.index[:i]) & set(freqs_1990.index[:i])))

Final number of items per decade :  3881
Final number of contexts : 5000
Intersection size :  2762


In [34]:
context_list = list(contexts)
context_list.sort()

with open( './contexts_list.txt','w',encoding='utf-8' ) as f:
    f.write( '\n'.join(context_list) )