# A Latent Space Model for Hypergraphs

* Let $G=(V,E)$ be a hypergraph, where $E$ is the collection of hyperedges.
* Let $H$ be the collection of admissible combinations of nodes (e.g., only hyperedges of size less than 10 are considered).
* $G$ is modeled as a collection of random variables $\{X_h: h\in H\}$.
* $X_h\sim Bernoulli(p_h)$.
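* The mean $p_h$ depends on the latent features of the nodes in $h$: $$p_h=\frac{e^{-\alpha|h|}\sum_k\prod_{i\in h}\theta_{ik}}{1+e^{-\alpha|h|}\sum_k\prod_{i\in h}\theta_{ik}},$$ where $\theta_i=(\theta_{ik})_k$ is the latent feature vector for node $i$, $|h|$ is the number of nodes in $h$, and $\alpha$ controls how $p_h$ scales with the hyperedge size.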
* Assume $X_h$'s are independent of each other given $\theta=(\theta_i)$.
* The distribution of $G$ is given by $$p(G|\theta)=\prod_{h\in H} p(X_h|\theta).$$
* Take a Bayesian approach to estimate $\theta$ and sample $\theta$ from the posterior using MCMC.
    * Propose $\theta'$.
    * Draw $G'$ given $\theta'$.
        * Start from $G$.
        * Every step pick $h\in H$ at random.
        * Draw $X_h$ from $Bernoulli(p_h(\theta'))$.
    * Move to $\theta'$ with probability $\rho$.

In [None]:
%matplotlib inline
import pylab as plt
import numpy as np
import scipy.sparse as ssp
import random
from collections import Counter
from scipy.stats import dirichlet, truncnorm
import sys
import pickle
import time
from string import lower
from itertools import combinations, izip, chain
from collections import Counter
from scipy.stats import pearsonr, norm, poisson
from scipy.misc import comb

In [None]:
class Stopwatch:
    start_time=None
    def go(self,msg=''):
        if msg:
            print msg,
        self.start_time=time.time()
        sys.stdout.flush()
    def stop(self,msg=''):
        if msg:
            print "{}: {} seconds".format(msg,time.time()-self.start_time)
        else:
            print "Elapsed time: {} seconds".format(time.time()-self.start_time)
        sys.stdout.flush()
    def check(self):
        return time.time()-self.start_time


def LoadDate(filename):
    tic=Stopwatch()
    print "Loading paper dates %s from disk..." % filename,
    tic.go()
    pkl_file = open(filename, 'rb')
    A=pickle.load(pkl_file)
    pkl_file.close()
    tic.stop()
    return A


def LoadData(filename):
    tic=Stopwatch()
    print "Loading file %s from disk..." % filename,
    tic.go()
    pkl_file = open(filename, 'rb')   
    (row,col) = pickle.load(pkl_file)        
    pkl_file.close()
    A=ssp.coo_matrix((np.ones(len(row),dtype=np.int8),(row,col)),shape=(19916562,col.max()+1),dtype=np.float)
    tic.stop()
    return A

### Initialization

* Read in the hypergraph of Medline

In [2]:
tic=Stopwatch()

tic.go('Loading citation data...')
# Pickles are opened in binary mode inside `with` so every handle is closed.
# NOTE: pickle.load can execute arbitrary code; these files must be trusted.
with open('../hypergraph/citations.pkl','rb') as pkl_file:
    citations=pickle.load(pkl_file)
tic.stop()
# Load hypergraphs
thing=['Author','Chemical','Disease','Method']
G=[]
# range(1,4) skips 'Author': only the chemical, disease and method matrices
# are stacked, in the same order as the id2name lookup built below.
for i in range(1,4):
    G.append(LoadData('../hypergraph/'+thing[i].lower()+'.pkl'))
# Side-by-side stacking: one block of columns per entity type.
G=ssp.hstack(G).tocsr()

paper_dates=LoadDate('../hypergraph/date.pkl') # Load publication dates

with open('../hypergraph/Citation/id2chemical.pkl','rb') as pkl_file:
    id2chemical=pickle.load(pkl_file)
with open('../hypergraph/Citation/id2disease.pkl','rb') as pkl_file:
    id2disease=pickle.load(pkl_file)
with open('../hypergraph/Citation/id2method.pkl','rb') as pkl_file:
    id2method=pickle.load(pkl_file)
# Concatenated in the column order of G (chemical, disease, method).
id2name=np.array(id2chemical+id2disease+id2method)

NameError: name 'Stopwatch' is not defined

In [None]:
# Rows of G whose publication date equals 1970.
rows_1970=(paper_dates==1970).nonzero()[0]
G0=G[rows_1970,:]
# Every non-empty row becomes one hyperedge: the tuple of its nonzero column
# indices.  The indices are sorted ascending so that a hyperedge has a single
# canonical form; sampleG builds its candidates as tuple(sorted(...)) and
# membership tests only succeed when both sides use the same order.
E0=set(tuple(sorted(row.nonzero()[1].tolist())) for row in G0 if row.size>0) # set of hyperedges

In [None]:
# Observed hypergraph: a random subset of at most 10000 hyperedges of E0.
# It is stored as a Counter (hyperedge -> multiplicity) rather than a list
# because later cells look up E[e] for arbitrary hyperedges e (a Counter
# returns 0 for missing ones) and because a list has no .copy() in Python 2.
E=Counter(random.sample(list(E0),min(10000,len(E0))))
M = len(E) # number of edges
V = sorted(set(chain.from_iterable(E))) # list of nodes (alternative: range(G0.shape[1]))
N = max(V)+1 # number of nodes
K = 10 # dimension of hidden space
averageSize=5 # Poisson mean of the hyperedge size proposed in sampleG
theta = np.random.lognormal(1,1,size=(K,N)) # one K-dimensional latent column per node id
alpha=10 # initial size-penalty coefficient
Y = E.copy() # independent copy of the observed hypergraph

In [None]:
def ph(h,theta,alpha):
    """Return exp(-alpha*|h|) * sum_k prod_{i in h} theta[k, i] for hyperedge h.

    Parameters
    ----------
    h : sized iterable of int
        Node indices (columns of `theta`) forming the hyperedge.
    theta : 2-D array
        Latent features, indexed as theta[k, i] (dimension k, node i).
    alpha : float
        Size-penalty coefficient.

    Note that this is the unnormalised quantity x from the model description,
    not x/(1+x); it is not clipped to [0, 1].
    """
    members=list(h)
    # Product over the nodes of h within each latent dimension ...
    per_dimension=theta[:,members].prod(axis=1)
    # ... summed over dimensions and damped by the hyperedge size.
    return per_dimension.sum()*np.exp(-len(h)*alpha)
    
def sampleG(theta,alpha,E,steps=20000):
    """Simulate a hypergraph near E and return only how it differs from E.

    Repeats `steps` times: draw a hyperedge size from Poisson(averageSize)
    (at least 1, at most the number of nodes in V), pick that many distinct
    nodes from V, and switch the resulting hyperedge on with probability
    ph(e, theta, alpha), off otherwise.

    Parameters
    ----------
    theta, alpha :
        Model parameters, passed straight to ph().
    E : collection of hyperedges
        The starting hypergraph (set, dict/Counter keyed by hyperedge, or any
        iterable of sorted node tuples).
    steps : int, optional
        Number of hyperedges to resample; defaults to the former fixed 20000.

    Returns
    -------
    dict
        Maps a hyperedge (ascending tuple of nodes) to 1 when it was switched
        on but is absent from E, or to 0 when it was switched off but is
        present in E.  Hyperedges whose state did not change are not listed;
        if a hyperedge is drawn several times the last change recorded wins.

    Reads the module-level names V and averageSize.
    """
    # Hash-based membership: `e in E` on a list would be a linear scan
    # repeated at every step.
    if isinstance(E,(set,frozenset,dict)):
        present=E
    else:
        present=set(E)
    # random.sample cannot return more items than V holds.
    max_size=len(V)
    change=dict()
    for _ in xrange(steps):
        n=np.random.poisson(averageSize)
        size=min(max(n,1),max_size)
        e=tuple(sorted(random.sample(V,size)))
        lambdae=ph(e,theta,alpha)
        if random.random()<=lambdae:
            if e not in present:
                change[e]=1
        elif e in present:
            change[e]=0
    return change

#### Simulate $\theta$

In [None]:
def _exchange_ratio(changes,observed,theta,alpha,thetaP,alphaP):
    """Return prod_e (ph_current(e)/ph_proposed(e)) ** (changes[e]-observed count of e).

    `changes` is a dict as returned by sampleG; `observed` maps hyperedges to
    their observed multiplicity.  Returns 0.0 as soon as either value of ph is
    zero, so that the proposal is rejected instead of dividing by zero.
    """
    rho=1.0
    for e in changes:
        lambdae=ph(e,theta,alpha)
        lambdaep=ph(e,thetaP,alphaP)
        if lambdae==0 or lambdaep==0:
            return 0.0
        rho*=(lambdae/lambdaep)**(changes[e]-observed.get(e,0))
    return rho

# Multiplicity lookup for the observed hypergraph; a plain list of hyperedges
# is accepted too and treated as multiplicity 1 each.
observed=E if hasattr(E,'get') else dict.fromkeys(E,1)

aRate=0.0 # number of accepted theta proposals (a count, not yet a rate)
for i in xrange(5000):
    # Propose new theta
    thetaP = np.random.lognormal(1,1,size=(K,N))
    # Sample Y' from new theta
    YP=sampleG(thetaP,alpha,E)
    # Calculate transition probability (alpha is unchanged in this half-step)
    rho=_exchange_ratio(YP,observed,theta,alpha,thetaP,alpha)
    # Alternative priors on theta, currently disabled:
    #rho*=reduce(lambda a,b: a*b, ( truncnorm.pdf(thetaP[j,i],(0-1.0/K)/sigma,(1-1.0/K)/sigma,1.0/K,sigma)/truncnorm.pdf(theta[j,i],(0-1.0/K)/sigma,(1-1.0/K)/sigma,1.0/K,sigma) for i in V for j in xrange(K)))
    #rho*=reduce(lambda a,b: a*b, ( dirichlet.pdf(thetaP[:,i],[10]*K)/dirichlet.pdf(theta[:,i],[10]*K) for i in V ))
    # Move
    if random.random()<rho:
        aRate+=1
        theta=thetaP

    # Propose new alpha (random walk reflected at 0 to keep alpha non-negative)
    alphaP=abs(alpha+np.random.normal(scale=0.1))
    # Sample G' from new alpha
    EP=sampleG(theta,alphaP,E)
    # Calculate transition probability (theta is unchanged in this half-step)
    rho=_exchange_ratio(EP,observed,theta,alpha,theta,alphaP)
    # Prior ratio for alpha: normal density with mean 1 and scale 2
    rho*=norm.pdf(alphaP,1,2)/norm.pdf(alpha,1,2)
    # Move
    if random.random()<rho:
        alpha=alphaP

In [None]:
# Size distribution of the hyperedges changed by the last alpha step, number of
# accepted theta proposals, number of changed hyperedges, and current alpha.
Counter(len(e) for e in EP), aRate, len(EP), alpha

In [None]:
# Compare the Poisson likelihood of the observed hyperedges (size >= 2) with
# that of random node sets of the same size that are not hyperedges of E0.

# Multiplicity lookup for the observed hypergraph; a plain list of hyperedges
# is accepted too and treated as multiplicity 1 each.
observed=E if hasattr(E,'get') else dict.fromkeys(E,1)

L=[]             # likelihood of each observed hyperedge
samples=[]       # the observed hyperedges that were scored
LRandom=[]       # likelihood of one occurrence of each random node set
randomSamples=[] # the random node sets, parallel to `samples`
for e in E:
    if len(e)<2:
        continue
    samples.append(e)
    L.append(poisson.pmf(observed.get(e,0),ph(e,theta,alpha)))
    # random.sample cannot return more items than V holds.
    size=min(len(e),len(V))
    while True:
        e0=tuple(sorted(random.sample(V,size),reverse=True))
        # Redraw when the node set is a real hyperedge.  Both orders are
        # checked because E0 may store its tuples ascending or descending.
        # NOTE(review): assumes E0 holds sorted tuples -- confirm.
        if e0 not in E0 and e0[::-1] not in E0:
            break
    randomSamples.append(e0)
    LRandom.append(poisson.pmf(1,ph(e0,theta,alpha)))

plt.hist([L,LRandom])

In [None]:
# Largest likelihood among the observed hyperedges and among the random node sets.
max(L),max(LRandom)

In [None]:
# Pair each random node set with the observed hyperedge whose size it was drawn to match.
zip(randomSamples,samples)

In [None]:
# Value of ph for two hand-picked hyperedges (ph is the function defined in
# this notebook; the name `Lambda` used here before is not defined anywhere).
ph((14544, 11249, 9499),theta,alpha), ph((11763, 2528, 1879),theta,alpha)

In [None]:
# Latent feature columns of the nodes in the same two hand-picked hyperedges.
theta[:,[14544, 11249, 9499]],theta[:,[11763, 2528, 1879]]