In [1]:
# public package imports
import pickle
import numpy as np
from tqdm.notebook import tqdm
from nltk.probability import FreqDist
import pandas as pd
from matplotlib import pyplot as plt
from scipy.stats import spearmanr as spearman
import glob
from sklearn.mixture import GaussianMixture
from sklearn.cluster import AffinityPropagation as AP
from scipy.spatial.distance import jensenshannon
from sklearn.preprocessing import normalize
import random
# proprietary imports
from dcwetk.cwe_distance import *
from dcwetk.cwe_distance.cwe_frame import *

In [2]:
class dError(Exception):
    """Signals that a distance table could not be built.

    In this notebook it is raised by ``makeDistance`` when the two WUM
    collections it receives have no token in common.
    """

def makeDistance(prevWUMs, currWUMs, jsd_sample_size=1024, apd_sample_size=128):
    """Build a per-token table of distance measures between two WUM collections.

    For every token present in both mappings, the token's object from
    ``prevWUMs`` is compared against its counterpart from ``currWUMs`` by
    calling its ``prt``, ``div``, ``jsd`` and ``apd`` methods (in that order).

    Parameters
    ----------
    prevWUMs, currWUMs : mapping
        Token -> WUM object. Only ``.keys()`` and item lookup are used here;
        the WUM objects must provide ``prt(other)``, ``div(other)``,
        ``jsd(other, max_sample_size=...)`` and ``apd(other, max_sample_size=...)``.
    jsd_sample_size : int, optional
        Forwarded to ``jsd`` as ``max_sample_size`` (default 1024).
    apd_sample_size : int, optional
        Forwarded to ``apd`` as ``max_sample_size`` (default 128).

    Returns
    -------
    pandas.DataFrame
        One row per shared token with columns ``token``, ``prt``, ``div``,
        ``jsd`` and ``apd``. A ``jsd`` value outside the interval (0, 1] is
        stored as NaN, so ``df['jsd'].isna().sum()`` gives the number of
        rejected JSD values.

    Raises
    ------
    dError
        If the two collections share no token.
    """
    # Row order follows set iteration order, exactly as before.
    shared_tokens = list(set(prevWUMs.keys()) & set(currWUMs.keys()))
    if not shared_tokens:
        raise dError('no tokens are shared between the two WUM collections')

    distData = {'token': shared_tokens, 'prt': [], 'div': [], 'jsd': [], 'apd': []}

    for tok in tqdm(shared_tokens):
        prevWUM = prevWUMs[tok]
        currWUM = currWUMs[tok]

        distData['prt'].append(prevWUM.prt(currWUM))
        distData['div'].append(prevWUM.div(currWUM))

        # Call jsd exactly once and store the very value that was validated.
        # NOTE(review): ``max_sample_size`` suggests the measure subsamples, in
        # which case a second call need not return the same number -- confirm
        # against the dcwetk implementation.
        jsdval = prevWUM.jsd(currWUM, max_sample_size=jsd_sample_size)
        # NaN fails both comparisons and therefore also ends up as NaN.
        distData['jsd'].append(jsdval if 0 < jsdval <= 1 else np.nan)

        distData['apd'].append(prevWUM.apd(currWUM, max_sample_size=apd_sample_size))

    return pd.DataFrame(distData)

In [3]:
def loadWUMs(yr):
    """Unpickle and return the object stored for the window starting at ``yr``.

    The file read is ``byp_decade_wums/<yr>_<yr + 5>_wums.pickle``, resolved
    relative to the current working directory.
    """
    window = '_'.join([str(yr), str(yr + 5)])
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    with open('byp_decade_wums/' + window + '_wums.pickle', 'rb') as handle:
        wums = pickle.load(handle)
    return wums

# Start years of consecutive 5-year windows: 1880, 1885, ..., 1945.
yearRange = list(range(1880, 1950, 5))

# Load the first window up front; inside the loop each file is then unpickled
# only once, because the "current" collection of one step is reused as the
# "previous" collection of the next step.
# NOTE(review): this assumes makeDistance and the WUM distance methods do not
# modify the WUM objects they are given -- confirm before relying on it.
currWUMs = loadWUMs(yearRange[0])

for i in tqdm(range(1, len(yearRange))):
    prevYear, currYear = yearRange[i - 1], yearRange[i]

    # Shift the window: the old "current" becomes "previous", then load the next.
    prevWUMs, currWUMs = currWUMs, loadWUMs(currYear)

    dist = makeDistance(prevWUMs, currWUMs)

    # The output file is named after the earlier of the two windows compared.
    with open('ddists_complete/distances_' + str(prevYear) + '.pickle', 'wb') as f:
        pickle.dump(dist, f, protocol=pickle.HIGHEST_PROTOCOL)

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/1990 [00:00<?, ?it/s]



  0%|          | 0/2256 [00:00<?, ?it/s]



  0%|          | 0/3627 [00:00<?, ?it/s]



  0%|          | 0/4427 [00:00<?, ?it/s]



  0%|          | 0/5440 [00:00<?, ?it/s]



  0%|          | 0/5662 [00:00<?, ?it/s]



  0%|          | 0/4454 [00:00<?, ?it/s]



  0%|          | 0/4636 [00:00<?, ?it/s]



  0%|          | 0/4736 [00:00<?, ?it/s]



  0%|          | 0/4221 [00:00<?, ?it/s]



  0%|          | 0/3367 [00:00<?, ?it/s]



  0%|          | 0/1772 [00:00<?, ?it/s]



  0%|          | 0/1156 [00:00<?, ?it/s]

