# Simple Model
In our simplest model, we will just model each post.

I have found this to be a useful resource for a hierarchical model example: https://github.com/pyro-ppl/pyro/blob/dev/examples/baseball.py
Another useful reference is https://pyro.ai/examples/forecasting_iii.html

# To start, we will use dummy data

In [1]:
import pandas as pd
import torch
import pyro
from pyro.infer import MCMC, NUTS
import pyro.distributions as dist
from pyro.distributions.util import scalar_like
from torch.distributions import constraints
import json

In [2]:
# Turn Pyro's validation on whenever Python assertions are active
# (__debug__ is True unless the interpreter is started with -O).
pyro.enable_validation(__debug__)
# Seed the random number generator so repeated runs give the same draws.
pyro.set_rng_seed(0)

First, we load the Reddit datasets

In [3]:
# Load the Reddit comments. The file holds one JSON document per line.
# Decode explicitly as UTF-8 so the result does not depend on the platform
# locale, and skip blank lines (e.g. a trailing newline), which json.loads
# would otherwise reject.
with open('../data/results/Comments.json', encoding='utf-8') as f:
    comments = [json.loads(line) for line in f if line.strip()]

In [4]:
# Load the correction pairs. The file holds one JSON document per line.
# Decode explicitly as UTF-8 so the result does not depend on the platform
# locale, and skip blank lines (e.g. a trailing newline), which json.loads
# would otherwise reject.
with open('../data/results/CorrectionPairs.json', encoding='utf-8') as f:
    corrections = [json.loads(line) for line in f if line.strip()]

In [5]:
# Load the news pairs. The file holds one JSON document per line.
# Decode explicitly as UTF-8 so the result does not depend on the platform
# locale, and skip blank lines (e.g. a trailing newline), which json.loads
# would otherwise reject.
with open('../data/results/NewsPairs.json', encoding='utf-8') as f:
    news = [json.loads(line) for line in f if line.strip()]

Gather relevant variables

In [6]:
# Map each news post id -> (isFakeStory flag, number of comments on the post).
# If an id occurs more than once, the last record wins.
news_dict = {
    pair['p']['id']: (pair['r']['reviewRating']['isFakeStory'], pair['p']['num_comments'])
    for pair in news
}

In [7]:
# Map each correction post id -> (isFakeClaim flag, number of comments on the post).
# If an id occurs more than once, the last record wins.
corr_dict = {
    pair['p']['id']: (pair['r']['reviewRating']['isFakeClaim'], pair['p']['num_comments'])
    for pair in corrections
}

In [8]:
# Post-level matrix for the real data, one column per post
# (news posts first, then correction posts).
#   row 0: bias (constant 1)
#   row 1: type code -- news: 0 = not fake, 1 = fake; corrections: 2 = not fake, 3 = fake
#   row 2: number of comments
num_news = len(news_dict)
p_data = torch.empty((3, num_news + len(corr_dict)))

for col, (is_fake, n_comments) in enumerate(news_dict.values()):
    p_data[0, col] = 1
    p_data[1, col] = 1 if is_fake else 0
    p_data[2, col] = n_comments

# Correction posts occupy the columns after the news posts.
for col, (is_fake, n_comments) in enumerate(corr_dict.values(), start=num_news):
    p_data[0, col] = 1
    p_data[1, col] = 3 if is_fake else 2
    p_data[2, col] = n_comments

Let's try using non-rectangular data to do the same thing. (Get rid of this type level and make it into a categorical variable instead!)

In [13]:
# Post-Level Data (dummy): each inner pair is one post, as (bias, commentsFirstHour)
post_rows = [
    [1, 100], [1, 250], [1, 125], [1, 150],
    [1, 50],  [1, 100], [1, 150], [1, 125],
    [1, 20],  [1, 40],  [1, 30],  [1, 35],
]
# Flip to variables-by-posts layout:
# dim 0: post-level vars: (bias, commentsFirstHour)
# dim 1: observation (a post.)
p_data = torch.Tensor(post_rows).t()

In [14]:
# Type index of each post, in post order: four posts each of types 0, 1 and 2
p_types = torch.Tensor([0] * 4 + [1] * 4 + [2] * 4)

In [15]:
# Type-Level Data: a single bias row of ones, one column per type
# dim 0: type-level vars: (bias)
# dim 1: observation (a type.)
t_data = torch.ones((1, 3))

In [16]:
# Observed response for each post, listed in the same order as the posts (four per line)
y = torch.Tensor(
    [1000., 3000., 1500., 1500.]
    + [800., 2500., 1600., 1200.]
    + [300., 500., 1000., 600.]
)

In [17]:
p_data.shape

torch.Size([2, 12])

In [18]:
p_types.shape

torch.Size([12])

In [19]:
t_data.shape

torch.Size([1, 3])

In [20]:
y.shape

torch.Size([12])

In [9]:
p_data

tensor([[  1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.],
        [100., 250., 125., 150.,  50., 100., 150., 125.,  20.,  40.,  30.,  35.]])

In [21]:
p_types

tensor([0., 0., 0., 0., 1., 1., 1., 1., 2., 2., 2., 2.])

In [22]:
t_data

tensor([[1., 1., 1.]])

In [23]:
y

tensor([1000., 3000., 1500., 1500.,  800., 2500., 1600., 1200.,  300.,  500.,
        1000.,  600.])

2 Regressions.

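Post-Level Regression
$$y_{pt} = \phi_{0t} \cdot \text{bias} + \phi_{1t} \cdot \text{first\_hour\_comments} + \epsilon_{pt}$$

Type-Level Regressions (2)
$$\phi_{0t} = \eta_{00} \cdot \text{bias} + \eta_{01} \cdot \text{silliness}_t + \nu_{0t}$$

$$\phi_{1t} = \eta_{10} \cdot \text{bias} + \eta_{11} \cdot \text{silliness}_t + \nu_{1t}$$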

In [176]:
# Looped, but works!
# NOTE(review): a later cell defines another function called `model`; whichever
# cell was executed last is the one the sampler picks up.
def model(p_data, p_types, t_data, y):
    """Two-level regression that handles one post type per loop iteration.

    p_data:  (num_p_indeps, num_posts) post-level independent variables.
    p_types: (num_posts,) type index of each post (cast to long before use).
    t_data:  (num_t_indeps, num_types) type-level independent variables.
    y:       (num_posts,) observed value for each post.
    """
    num_p_indeps, num_posts = p_data.shape
    num_t_indeps, num_types = t_data.shape

    # prior over the type-level regression coefficients, (num_p_indeps, num_t_indeps)
    eta_prior = dist.Normal(torch.zeros((num_p_indeps, num_t_indeps)),
                            10. * torch.ones((num_p_indeps, num_t_indeps)))
    eta = pyro.sample("eta", eta_prior)

    # integer type labels, computed once and reused for every type
    type_of_post = p_types.long()

    for t in pyro.plate("type", num_types):
        # type-level regression: expected post-level coefficients for type t
        mu_phi = torch.matmul(eta, t_data[:, t])  # (num_p_indeps,)
        phi = pyro.sample(f"phi_{t}", dist.Normal(mu_phi, 10.))

        # columns of p_data / entries of y that belong to type t
        belongs = type_of_post == t
        type_posts = p_data[:, belongs]  # (num_p_indeps, posts of this type)
        type_y = y[belongs]

        with pyro.plate(f"post_{t}", type_posts.shape[1]) as p:
            # every post of this type shares phi; broadcast it across the columns
            mu = (phi.reshape(num_p_indeps, 1) * type_posts).sum(dim=0)

            pyro.sample(f"obs_{t}", dist.Normal(mu, 1000.), obs=type_y[p])


In [158]:
# Attempt at vectorization
def model(p_data, p_types, t_data, y):
    """Two-level regression with all types and all posts handled in one shot.

    p_data:  (num_p_indeps, num_posts) post-level independent variables.
    p_types: (num_posts,) type index of each post (cast to long before use).
    t_data:  (num_t_indeps, num_types) type-level independent variables.
    y:       observed value for each post, passed unchanged as the observation.
    """
    num_p_indeps, num_posts = p_data.shape
    num_t_indeps, num_types = t_data.shape

    # prior over the type-level regression coefficients
    coef_shape = (num_p_indeps, num_t_indeps)
    eta = pyro.sample("eta", dist.Normal(torch.zeros(coef_shape),
                                         10. * torch.ones(coef_shape)))

    # type-level regression: one column of post-level coefficients per type
    with pyro.plate("type", num_types, dim=-1):
        mu_phi = torch.matmul(eta, t_data)  # (num_p_indeps, num_types)
        phi = pyro.sample("phi", dist.Normal(mu_phi, 10.))

    # post-level regression: every post uses the coefficient column of its own type
    with pyro.plate("post", num_posts, dim=-1) as post_idx:
        post_type = p_types[post_idx].long()  # (num_posts,)
        coefs = phi[:, post_type]             # (num_p_indeps, num_posts)

        # elementwise product summed over the independent variables -> (num_posts,)
        mu = (coefs * p_data).sum(dim=0)

        pyro.sample("obs", dist.Normal(mu, 1000.), obs=y)


In [142]:
# Scratch: integer (long) type index per column, two columns for each of three types
t = torch.tensor([0, 0, 1, 1, 2, 2], dtype=torch.long)

In [163]:
# Scratch: a (6, 1) column of the values 1..6
phi = torch.Tensor([[1], [2], [3], [4], [5], [6]])

In [167]:
phi.repeat(1,4).shape

torch.Size([6, 4])

In [None]:
# Sample the posterior with NUTS: 1000 warm-up steps, then 2000 kept samples.
nuts_kernel = NUTS(model)
mcmc = MCMC(nuts_kernel, warmup_steps=1000, num_samples=2000)
mcmc.run(p_data, p_types, t_data, y)

# Convert the samples of every site to a NumPy array for the summaries below.
hmc_samples = {
    site: draws.detach().cpu().numpy()
    for site, draws in mcmc.get_samples().items()
}

Warmup:   0%|          | 14/3000 [20:59, 90.00s/it, step size=5.47e+00, acc. prob=0.739]0]
Warmup:   0%|          | 0/3000 [13:59, ?it/s]
Warmup:   0%|          | 0/3000 [11:33, ?it/s]
Warmup:   0%|          | 0/3000 [10:55, ?it/s]
Warmup:   0%|          | 0/3000 [06:44, ?it/s]
Warmup:   0%|          | 0/3000 [05:54, ?it/s]
Sample:  95%|█████████▍| 2848/3000 [06:01,  8.36it/s, step size=4.24e-01, acc. prob=0.918]

In [119]:
# Utility function to compute latent sites' quantile information.
def summary_types(samples):
    """Summarise the samples of every latent site.

    Parameters
    ----------
    samples : dict
        Maps a site name to its samples. The first axis indexes the draws.
        Values with more than two dimensions (e.g. draws x rows x cols) are
        flattened to (draws, rows * cols), because a DataFrame cannot be built
        from input with more than two dimensions. Flattened columns follow
        row-major order, so column k corresponds to
        element (k // cols, k % cols).

    Returns
    -------
    dict
        Maps each site name to a DataFrame with one row per sampled component
        and the columns mean, std, 5%, 25%, 50%, 75% and 95%.
    """
    percentiles = [.05, 0.25, 0.5, 0.75, 0.95]
    wanted = ["mean", "std", "5%", "25%", "50%", "75%", "95%"]

    site_stats = {}
    for site_name, values in samples.items():
        # Collapse trailing axes so matrix-valued sites can be tabulated too.
        if getattr(values, "ndim", 0) > 2:
            values = values.reshape(values.shape[0], -1)
        marginal_site = pd.DataFrame(values)
        describe = marginal_site.describe(percentiles=percentiles).transpose()
        site_stats[site_name] = describe[wanted]
    return site_stats

In [120]:
hmc_samples

{'eta': array([[[-15.16477  ],
         [  2.3026276]],
 
        [[ -4.4097776],
         [ 28.584997 ]],
 
        [[  1.8208466],
         [  8.144741 ]],
 
        ...,
 
        [[ 24.801327 ],
         [ 11.130078 ]],
 
        [[  9.820871 ],
         [ 12.014492 ]],
 
        [[  6.0100384],
         [  7.382392 ]]], dtype=float32),
 'phi': array([[[-15.491273  , -16.56714   , -22.088516  ],
         [ 14.853753  ,   7.3520255 ,   5.3977065 ]],
 
        [[  3.84382   ,   2.338452  ,   1.7843876 ],
         [ 10.871811  ,  22.551394  ,  37.513165  ]],
 
        [[ -0.12052822,   4.163312  ,  -5.3647647 ],
         [ 13.280857  ,  16.370468  ,  24.460884  ]],
 
        ...,
 
        [[ 29.22926   ,  21.602962  ,  -1.7448103 ],
         [ 10.656149  ,  16.632578  ,  27.865479  ]],
 
        [[ 16.828297  ,  17.897223  ,  41.321125  ],
         [  7.7527704 ,  11.433988  ,  13.210835  ]],
 
        [[ 24.283442  ,  22.22405   ,   1.6297889 ],
         [ 14.81048   ,  12.294846  ,

In [121]:
hmc_samples['phi'].shape

(2000, 2, 3)

In [122]:
m = pd.DataFrame(hmc_samples['eta'][:,:,0])
m.describe(percentiles=[.05, 0.25, 0.5, 0.75, 0.95]).transpose()

Unnamed: 0,count,mean,std,min,5%,25%,50%,75%,95%,max
0,2000.0,-0.258362,9.88874,-38.321423,-16.17179,-7.283419,-0.12562,6.734012,15.776669,33.396301
1,2000.0,8.989322,5.587269,-9.466851,-0.101335,5.179688,8.906022,12.623037,18.260716,28.584997


In [123]:
m = pd.DataFrame(hmc_samples['phi'][:,:,0])
m.describe(percentiles=[.05, 0.25, 0.5, 0.75, 0.95]).transpose()

Unnamed: 0,count,mean,std,min,5%,25%,50%,75%,95%,max
0,2000.0,-0.391963,14.104968,-54.705257,-24.073265,-10.018052,0.029598,9.364626,22.04299,50.426144
1,2000.0,11.209664,2.860804,0.785737,6.674283,9.229479,11.220575,13.189994,15.822662,21.952791


In [124]:
m = pd.DataFrame(hmc_samples['phi'][:,:,1])
m.describe(percentiles=[.05, 0.25, 0.5, 0.75, 0.95]).transpose()

Unnamed: 0,count,mean,std,min,5%,25%,50%,75%,95%,max
0,2000.0,-0.202742,13.990156,-45.48349,-23.275968,-9.797287,-0.21206,9.311126,22.564047,50.049248
1,2000.0,12.694663,4.225892,-0.980016,5.900632,9.938076,12.582331,15.586386,19.706371,25.625191


In [125]:
m = pd.DataFrame(hmc_samples['phi'][:,:,2])
m.describe(percentiles=[.05, 0.25, 0.5, 0.75, 0.95]).transpose()

Unnamed: 0,count,mean,std,min,5%,25%,50%,75%,95%,max
0,2000.0,-0.29418,14.321106,-52.169037,-23.003955,-10.378389,-0.479738,9.500374,23.121919,45.832973
1,2000.0,11.926276,9.215327,-17.470591,-3.440746,5.615487,11.920063,18.208272,26.886878,43.751766


In [70]:
p_types

tensor([0., 0., 0., 0., 1., 1., 1., 1., 2., 2., 2., 2.])

In [71]:
p_data

tensor([[  1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.],
        [100., 250., 125., 150.,  50., 100., 150., 125.,  20.,  40.,  30.,  35.]])

In [72]:
y

tensor([1000., 3000., 1500., 1500.,  800., 2500., 1600., 1200.,  300.,  500.,
        1000.,  600.])