In [1]:
import torch
from scipy.sparse import csr_matrix
import numpy as np
import time
import math
import pickle

In [2]:
def _unpickle(path):
    """Open ``path`` in binary mode and return the object pickled inside it."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# NOTE: unpickling can execute arbitrary code; only load files you trust.
WordToID = _unpickle('./wordToID.p')
IDToWord = _unpickle('./IDtoWord.p')
X = _unpickle('./quintilenorm/cooccurBatch1.p')

In [3]:
# f is the vocabulary size; the cells below use it as the side length of the
# square matrices they build.
f = len(WordToID)
print(f)
print(len(IDToWord))
print(X.shape)

36343
36343
(36343, 36343)


In [4]:
# clear the diagonals and replace with total counts
#
# Result:
#   V -- dense float32 tensor: off-diagonal entries are the counts from X,
#        each diagonal entry is the sum of the off-diagonal counts in its row.
#   S -- 1-D float32 tensor holding those per-row totals.
#
# The diagonal is edited in place on a single dense buffer. Building a mask
# and torch.diag(...) matrices would allocate several additional f x f dense
# arrays, which is very expensive for a large vocabulary.
a = time.time()
counts = X.toarray().astype(np.float32, copy=False)  # ndarray, not np.matrix
np.fill_diagonal(counts, 0)                          # drop self co-occurrences
# Accumulate in float64 so long rows do not lose precision, then store as float32.
row_totals = counts.sum(axis=1, dtype=np.float64).astype(np.float32)
np.fill_diagonal(counts, row_totals)                 # diagonal := row total
V = torch.from_numpy(counts)       # shares memory with the numpy buffer
S = torch.from_numpy(row_totals)
del counts, row_totals             # the tensors keep the buffers alive
print("Set up. Time elapsed: {}".format(time.time() - a))

Set up. Time elapsed: 13.468387842178345


In [5]:
# Peek at the top-left 3x3 corner of V.
print(V[:3, :3])


 1.8308e+05  6.6100e+02  5.6200e+02
 6.6100e+02  1.0750e+05  4.6900e+02
 5.6300e+02  4.6900e+02  1.2196e+05
[torch.FloatTensor of size 3x3]



In [6]:
# V is now in the expected form (counts off the diagonal, row totals on it).
# Take log(1 + x) element-wise and report how long it took.
print("logXij")
a = time.time()
logXij = V.log1p()
elapsed = time.time() - a
print(elapsed)

logXij
1.1638164520263672


In [7]:
# Peek at the top-left 3x3 corner of logXij.
print(logXij[:3, :3])


 12.1177   6.4953   6.3333
  6.4953  11.5852   6.1527
  6.3351   6.1527  11.7114
[torch.FloatTensor of size 3x3]



In [8]:
print("logXi")
# logXiXj[i, j] = log1p(S[i]) + log1p(S[j]).
# Take log1p once on the length-f vector and let broadcasting form the
# pairwise sum, rather than repeating S into two f x f matrices and taking
# log1p of each of them.
logXi = torch.log1p(S)
logXiXj = logXi.unsqueeze(0) + logXi.unsqueeze(1)

logXi


In [9]:
# Peek at the top-left 3x3 corner of logXiXj.
print(logXiXj[:3, :3])


 24.2354  23.7029  23.8291
 23.7029  23.1705  23.2967
 23.8291  23.2967  23.4228
[torch.FloatTensor of size 3x3]



In [10]:
print("PMI")
a = time.time()
# `trace` is the natural log of the sum of all row totals in S.
trace = math.log(S.sum())
# PMI = logXij + trace - logXiXj, evaluated in that order.
PMI = logXij.add(trace).sub(logXiXj)
elapsed = time.time() - a
print(elapsed)

PMI
2.617755651473999


In [11]:
# Show the log-total constant, then the top-left 3x3 corner of PMI.
print(trace)
print(PMI[:3, :3])

17.287345107405866

 5.1697  0.0797 -0.2085
 0.0797  5.7021  0.1434
-0.2067  0.1434  5.5759
[torch.FloatTensor of size 3x3]



In [12]:
print("PPMI")
a = time.time()
# Positive PMI: clip negative entries to zero. clamp does this directly,
# without allocating an f x f matrix of zeros to compare against.
PPMI = torch.clamp(PMI, min=0)
print(time.time() - a)

PPMI
5.591378927230835


In [13]:
# Peek at the top-left 3x3 corner of PPMI.
print(PPMI[:3, :3])


 5.1697  0.0797  0.0000
 0.0797  5.7021  0.1434
 0.0000  0.1434  5.5759
[torch.FloatTensor of size 3x3]



In [14]:
# write to hdf5
import h5py

# The file handle gets its own name: `f` already holds the vocabulary size,
# and rebinding it here would break any earlier cell that is re-run afterwards.
with h5py.File('ppmi.hdf5', 'w') as h5_out:
    # Pass h5py a NumPy array explicitly rather than relying on implicit
    # tensor-to-array conversion.
    dset = h5_out.create_dataset("default", data=PPMI.numpy())