In [1]:
from dcwetk.cwe_distance import *
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from frequency_grapher import *
import glob
from tqdm.notebook import tqdm
import pickle
import pandas as pd
from sklearn.cluster import AffinityPropagation
from nltk.probability import FreqDist
import random

In [2]:
# get words by number of part of speech
vocab = dict()

for name in tqdm(glob.glob('byp_docs_with_relational_embeddings/*.pickle')):
    yr = int(name[-11:-7])
    
    if 1880 <= yr <= 1950:
        with open(name, 'rb') as f:
            docs = pickle.load(f)
            f.close()
            
        for doc in docs:
            for sent in doc:
                for tok in sent:
                    for lem in tok:
                        if lem.lemma not in vocab.keys():
                            vocab[lem.lemma] = set()
                        vocab[lem.lemma].add(lem.pos_tag)
                            
        del docs

  0%|          | 0/112 [00:00<?, ?it/s]

In [3]:
# convert to list
vocab_posfreq = [(tok, len(pos)) for tok, pos in vocab.items() if len(pos) >= 2]

# take random sample - overestimate to account for WUMs that are too small to cluster
vocab_sample = random.sample(vocab_posfreq, 1000)

In [4]:
# get WUMs
embeddings = {}
postags = {}
for name in tqdm(glob.glob('byp_docs_with_relational_embeddings/*.pickle')):
    yr = int(name[-11:-7])
    
    if 1880 <= yr < 1950:
        with open(name, 'rb') as f:
            docs = pickle.load(f)
            f.close()
            
        for doc in docs:
            for sent in doc:
                for tok in sent:
                    for lem in tok:
                        if lem.lemma in [i[0] for i in vocab_sample]:
                            if lem.lemma not in embeddings.keys():
                                embeddings[lem.lemma] = []
                            if lem.lemma not in postags.keys():
                                postags[lem.lemma] = []
                                
                            embeddings[lem.lemma].append(lem.embedding)
                            postags[lem.lemma].append(lem.pos_tag)
                            
        print(yr)
        print(len(embeddings))
        print()
                            
        del docs

  0%|          | 0/112 [00:00<?, ?it/s]

1880
173

1881
214

1882
221

1883
249

1884
311

1885
339

1886
350

1887
377

1888
390

1889
404

1890
409

1891
447

1892
484

1893
513

1894
532

1895
542

1896
562

1897
576

1898
590

1899
627

1900
643

1901
669

1902
691

1903
714

1904
724

1905
761

1906
765

1907
776

1908
793

1909
820

1910
836

1911
848

1912
855

1913
872

1914
885

1915
887

1916
888

1917
893

1918
897

1919
910

1920
932

1921
933

1922
943

1923
952

1924
958

1925
961

1926
972

1927
973

1928
976

1929
981

1930
987

1931
989

1932
991

1933
992

1934
992

1935
996

1936
996

1937
996

1938
996

1939
996

1940
998

1941
998

1944
998

1946
999

1948
1000

1949
1000



In [5]:
# clean and reformat data
embeddings_list = [(tok, embed) for tok, embed in embeddings.items()]
postags_list = [(tok, postag) for tok, postag in postags.items()]
data = [(tok, embed, postag) for ((tok, embed), (_, postag)) in zip(embeddings_list, postags_list)]
data_cleaned = []
for tok, embed, postag in data:
    if len(embed) >= 128:
        if len(embed) > 1024:
            data_sampled = random.sample(list(zip(embed, postag)), 1024)
            new_embed, new_postag = zip(*data_sampled)
            data_cleaned.append((tok, new_embed, new_postag))
        else:
            data_cleaned.append((tok, embed, postag))

In [22]:
# make clusters
AP = AffinityPropagation(max_iter=250, random_state=10)
ap = lambda i: AP.fit_predict(i)
clustered_data = [(tok, embed, postag, ap(embed)) for tok, embed, postag in tqdm(data_cleaned)]

  0%|          | 0/125 [00:00<?, ?it/s]



In [35]:
# filter out clusters that failed to converge
cdata_cleaned = [i for i in clustered_data if len(set(i[3])) > 1]
print(len(cdata_cleaned))

118


In [36]:
# load into dfs
dfs = {}
for tok, embed, postag, cluster in cdata_cleaned:
    x, y = zip(*PCA(n_components=2).fit_transform(embed))
    dfs[tok] = pd.DataFrame({'x': x, 'y': y, 'pos': postag, 'cluster': cluster})

In [85]:
# make master df with info
meta_df = {'tok': [], 'n_clusters': [], 'n_pos': [], 'pos_list': [], 'n_homogeneous_clusters': []}
for tok, df in tqdm(dfs.items()):
    meta_df['tok'].append(tok)
    meta_df['n_clusters'].append(len(df['cluster'].unique()))
    meta_df['n_pos'].append(len(df['pos'].unique()))
    # calculate n of homogenous clusters
    count = 0
    for cl in df['cluster'].unique():
        # filter by cluster
        dff = df[df['cluster'] == cl]
        # if length of dff['pos'] == 1:
        if len(dff['pos']) == 1:
            count += 1
            
    meta_df['n_homogeneous_clusters'].append(count)
    meta_df['pos_list'].append('|'.join(list(df['pos'].unique())))
    
mdf = pd.DataFrame(meta_df)
mdf['accuracy'] = mdf.apply(lambda i: i['n_homogeneous_clusters'] / i['n_clusters'], axis=1)
mdf.sort_values(by='n_pos', inplace=True, axis=0, ascending=False)

  0%|          | 0/118 [00:00<?, ?it/s]

In [86]:
from scipy.stats import spearmanr, pearsonr

print('SPEARMAN')
corr, p = spearmanr(mdf['n_pos'], mdf['accuracy'])
print('Correlation coefficient: ' + str(np.round(corr, 2)))
print('PVal: ' + str(np.round(p, 2)))

SPEARMAN
Correlation coefficient: 0.17
PVal: 0.07


In [89]:
print(np.round(sum(mdf['n_homogeneous_clusters']) / sum(mdf['n_clusters']), 2))

0.09
