We first build the co-occurrence matrices, then use them to compute the pointwise mutual information (PMI) values that are passed to the Dynamic Word Embedding model.

In [1]:
from csv import DictReader
from collections import Counter

# Read the corpus-wide word frequencies into a Counter keyed by word.
totalFreqDistFile = '/Users/bkitano/Desktop/Classes/spring_2019/thesis/embeddings/totalFreqDist.csv'

# newline='' lets the csv module do its own newline handling, as its
# documentation requires for files passed to a reader.
with open(totalFreqDistFile, newline='') as f:
    # fieldnames is given explicitly, so the first row is read as data
    # rather than being consumed as a header.
    reader = DictReader(f, fieldnames=['word', 'count'])
    freqDist = Counter({row['word']: int(row['count']) for row in reader})
            

In [2]:
# Total token count: the sum of every word's count in freqDist.
print(sum(freqDist.values()))
# 463,231,888 words

463231888


In [3]:
# Vocabulary: every word whose corpus count is strictly greater than 200.
# This yields 71,670 words, about three times the vocabulary used in the paper.
frequentWords = [word for word, count in freqDist.items() if count > 200]

In [4]:
# Sanity check: vocabulary size, then a peek at its first ten entries.
print(len(frequentWords))
print(frequentWords[:10])

71670
['god', 'one', 'upon', 'may', 'shall', 'us', 'man', 'would', 'great', 'men']


In [5]:
# Map each vocabulary word to an integer ID (its position in frequentWords),
# so IDs run from 0 to len(frequentWords) - 1 and can index matrix rows/columns.
# The mapping is only held in memory here; nothing is written to disk.
wordToID = {word: idx for idx, word in enumerate(frequentWords)}

In [6]:
# Inverse lookup of wordToID: integer ID -> word.
IDToWord = {wordID: word for word, wordID in wordToID.items()}

In [61]:
import numpy as np
from scipy.sparse import dok_matrix, csr_matrix
from multiprocessing import Pool

def cooccurDummy( args, wordMap, V ):
    """Count the co-occurrences of one target word with the words in its window.

    Parameters
    ----------
    args : sequence
        ``args[0]`` is the target word and ``args[1]`` is an iterable of the
        words in its context window.
    wordMap : mapping
        Maps a word to its integer ID (row/column index).
    V : int
        Side length of the square matrix that is returned.

    Returns
    -------
    scipy.sparse.csr_matrix
        A ``(V, V)`` matrix in which entry ``[wordMap[word], wordMap[coword]]``
        is the number of times ``coword`` appears in the window.

    Notes
    -----
    A pair whose target word or co-word is missing from ``wordMap`` is skipped
    and the target word is printed. Any other failure (for example an ID that
    does not fit in a ``V`` x ``V`` matrix) is raised instead of being hidden.
    """
    word = args[0]
    window = args[1]
    counts = dok_matrix((V, V))
    for coword in window:
        try:
            wordID = wordMap[word]
            cowordID = wordMap[coword]
        except KeyError:
            # Only out-of-vocabulary lookups are tolerated; report and move on.
            print(word)
            continue
        # Kept outside the try so that indexing errors are not swallowed.
        counts[wordID, cowordID] += 1
    return counts.tocsr()

def cooccur(args):
    """Single-argument wrapper around cooccurDummy.

    Supplies the module-level ``wordToID`` mapping and the vocabulary size
    ``len(frequentWords)``, so callers only pass the ``(word, window)`` pair.
    """
    vocabSize = len(frequentWords)
    return cooccurDummy(args, wordToID, vocabSize)

def parallelF(token):
    """Return ``token`` if it is a key of ``wordToID``, otherwise ``None``.

    Uses an explicit membership test rather than a bare ``except`` so that
    unrelated errors (for example ``wordToID`` not being defined in the
    process running this function) are raised instead of being reported as
    "token not in vocabulary".
    """
    return token if token in wordToID else None
    
def parallelFilter(tokens, p):
    """Apply parallelF to every token using a pool of ``p`` worker processes.

    Returns a list with one entry per input token, in input order: the token
    itself where parallelF kept it, ``None`` otherwise. Callers are expected
    to drop the ``None`` entries themselves.
    """
    workers = Pool(processes = p)
    with workers:
        filtered = workers.map(parallelF, tokens)
        # Shut the pool down and wait for the workers before leaving the block.
        workers.close()
        workers.join()
    return filtered

In [19]:
def sum_sparse(m):
    """Sum a sequence of equally-shaped sparse matrices into one dense array.

    Parameters
    ----------
    m : sequence of scipy.sparse matrices
        Must be non-empty and indexable; ``m[0].shape`` fixes the shape of the
        result (an empty sequence raises ``IndexError``).

    Returns
    -------
    numpy.ndarray
        Dense float array of shape ``m[0].shape`` holding the element-wise sum.
        NOTE: the result is dense, so for vocabulary-sized square matrices it
        can be very large.
    """
    x = np.zeros(m[0].shape)
    for a in m:
        # COO exposes explicit (row, col, value) triples for any sparse format,
        # so this does not rely on the input being CSR.
        coo = a.tocoo()
        # np.add.at accumulates repeated (row, col) pairs; a fancy-indexed
        # ``x[r, c] += v`` would keep only one of the duplicates.
        np.add.at(x, (coo.row, coo.col), coo.data)
    return x

In [17]:
import nltk
import os
from os import listdir
from os.path import isfile, join

# Directory holding the cleaned corpus text files.
# NOTE(review): this path spells 'Spring_2019' while cooccurrencePath below
# spells 'spring_2019' -- harmless on a case-insensitive filesystem, but
# confirm before running elsewhere.
textPath = '/Users/bkitano/Desktop/Classes/Spring_2019/thesis/corpus/cleaned_txt/'

# Regular files only (sub-directories are skipped). listdir makes no ordering
# guarantee, so a fixed index into this list may pick a different file on
# another machine.
textFiles = [name for name in listdir(textPath) if isfile(join(textPath, name))]
cooccurrencePath = '/Users/bkitano/Desktop/Classes/spring_2019/thesis/embeddings/cooccur/'
L = 5 # window size, or L in the paper
# window: x1 x2 ... xL W y1 y2 ... yL

# https://stackoverflow.com/questions/17458751/python-symmetric-word-matrix-using-nltk
# need to parallelize

# The file is only read, so open it read-only ('r+' would also demand write
# permission). Only the first line is used, so read just that line.
with open(join(textPath, textFiles[45]), 'r') as f:
    text = f.readline().rstrip('\n')

# Tokenize, then keep only in-vocabulary tokens (parallelFilter marks the
# rest as None).
# need to parallelize it
tokens = [t for t in parallelFilter(nltk.word_tokenize(text), 5) if t is not None ]

In [18]:
# Build one (target word, context window) pair for every token that has a
# full window of L tokens on each side; the first and last L tokens are
# therefore never used as targets.
# NOTE(review): each window spans 2*L + 1 tokens and includes the target word
# itself at position L -- confirm that counting a word as co-occurring with
# itself is intended.
windowTarget = [tokens[center - L:center + L + 1] for center in range(L, len(tokens) - L)]
wordTarget = [window[L] for window in windowTarget]

pairs = [(word, window) for word, window in zip(wordTarget, windowTarget)]

In [20]:
import scipy.sparse as sp
# Build the co-occurrence matrix for this text in one shot from COO-style
# (row, col, value) triples; csr_matrix sums repeated (row, col) pairs, so
# every occurrence contributes 1 to its cell.
rows, cols = [], []
for word, window in pairs:
    # The target lookup does not depend on the co-word, so do it once per pair.
    wordID = wordToID.get(word)
    if wordID is None:
        continue
    for coword in window:
        rows.append(wordID)
        cols.append(wordToID[coword])
vals = [1] * len(rows)
# Pin the shape to the vocabulary size. Without it the dimensions are inferred
# from the largest ID seen, so matrices from different texts would not line
# up, and an empty `pairs` would fail to construct at all.
X = sp.csr_matrix((vals, (rows, cols)), shape=(len(frequentWords), len(frequentWords)))