# Simple Model
In our simpler model, we will just model each post as posting about a story coming from one of three groups:
- Factual, Disputed Story
- Fake, Disputed Story
- Corrective Story

Have found this to be a useful resource for a hierarchcal model example: https://github.com/pyro-ppl/pyro/blob/dev/examples/baseball.py
As well as https://pyro.ai/examples/forecasting_iii.html

# To start, we will use dummy data

In [1]:
import pandas as pd
import torch
import pyro
from pyro.infer import MCMC, NUTS
import pyro.distributions as dist
from pyro.distributions.util import scalar_like
from torch.distributions import constraints
import json

In [2]:
pyro.enable_validation(__debug__)
pyro.set_rng_seed(0)

First, we load the Reddit datasets

In [4]:
comments = []
with open('../data/results/Comments.json') as f:
    for line in f:
        comments.append(json.loads(line))

In [5]:
corrections = []
with open('../data/results/CorrectionPairs.json') as f:
    for line in f:
        corrections.append(json.loads(line))

In [6]:
news = []
with open('../data/results/NewsPairs.json') as f:
    for line in f:
        news.append(json.loads(line))

Gather relevant variables

In [6]:
news_dict = {}

for n in news:
    news_id = n['p']['id']
    news_num_c = n['p']['num_comments']
    news_type = n['r']['reviewRating']['isFakeStory']
    news_dict[news_id] = news_type, news_num_c

In [7]:
corr_dict = {}

for c in corrections:
    corr_id = c['p']['id']
    corr_num_c = c['p']['num_comments']
    corr_type = c['r']['reviewRating']['isFakeClaim']
    corr_dict[corr_id] = corr_type, corr_num_c

In [8]:
p_data = torch.empty((3, len(news_dict) + len(corr_dict)))

for i, (isFake, num_comments) in enumerate(news_dict.values()):
    p_data[0, i] = 1
    p_data[1, i] = 0 if not isFake else 1
    p_data[2, i] = num_comments
    
for i, (isFake, num_comments) in enumerate(corr_dict.values()):
    p_data[0, i+len(news_dict)] = 1
    p_data[1, i+len(news_dict)] = 2 if not isFake else 3
    p_data[2, i+len(news_dict)] = num_comments

Let's try using non-rectangular data to do the same thing. (Get rid of this type level and make it into a categorical variable instead!)

In [35]:
# Post-Level Data
p_data = torch.Tensor([[1, 100, 0, 1000], [1, 250, 0, 3000], [1, 125, 0, 1500], [1, 150, 0, 1500],
                       [1, 50,  1, 800],  [1, 100, 1, 2500], [1, 150, 1, 1600], [1, 125, 1, 1200],
                       [1, 20,  2, 300],  [1, 40,  2,  500], [1, 30,  2, 1000], [1, 35,  2, 600]])
p_data = p_data.transpose(0,1)
# dim 0: post-level vars: (bias, commentsFirstHour, type, Engagement) 
# dim 1: obs (post)

In [36]:
# Type-Level Data
t_data = torch.Tensor([[1], [1], [1]])
t_data = t_data.transpose(0,1)
# dim 0: type-level vars: (Just bias for now)
# dim 1: type (0: Fake, 1: Fact, 2: Corrective)

In [37]:
t_data.shape

torch.Size([1, 3])

In [38]:
p_data.shape

torch.Size([4, 12])

In [39]:
y = p_data[-1,:]
p_data = p_data[:-1,:]

In [40]:
p_data

tensor([[  1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.],
        [100., 250., 125., 150.,  50., 100., 150., 125.,  20.,  40.,  30.,  35.],
        [  0.,   0.,   0.,   0.,   1.,   1.,   1.,   1.,   2.,   2.,   2.,   2.]])

In [41]:
p_data.shape

torch.Size([3, 12])

In [42]:
y

tensor([1000., 3000., 1500., 1500.,  800., 2500., 1600., 1200.,  300.,  500.,
        1000.,  600.])

Type-Level Regressions

phi_0,t = eta_0^T t_data + epsilon_0

phi_1,t = eta_1^T t_data + epsilon_1

Post-Level Regression

y_pt = phi_0,t * bias + phi_1,t * first_hour_comments

In [None]:
# x is a 3D tensor
def model(t_data, p_data, y):
    num_t_indeps, num_types = t_data.shape
    num_p_indeps, num_posts = p_data.shape
    
    num_p_indeps -= 1 # The last p_indep is just the type.
    
    # construct necessary plates over each level
    type_plate = pyro.plate("type", num_types, dim=-1)
    post_plate = pyro.plate("post", num_posts, dim=-1)
    t_indep_plate = pyro.plate("t_indep", num_t_indeps, dim=-1)
    p_indep_plate = pyro.plate("p_indep", num_p_indeps, dim=-2)
    
    
    # type-level regression variables (shared across all types)
    # one coef for each type-level indep var for each of num_p_indeps type-level regressions
    # each row are the coefficients for a different type-level regression
    # eta in proposal
    eta = torch.empty((num_p_indeps, num_t_indeps,)) # (2,1)
    with t_indep_plate as ti:
        with p_indep_plate as pi:
            eta[pi, ti] = pyro.sample("eta", dist.Normal(torch.zeros_like(eta), 10)).reshape(2)
        

    # Run a type-level regression for the coef on each post-level variable
    # phi in proposal
    phi = torch.empty((num_p_indeps, num_types,)) # (2,3)
    with type_plate as t:
        with p_indep_plate as pi:
            phi_mu = torch.matmul(eta[pi,:], t_data[:,t])  # (num_p_indeps, num_t_indeps)   (num_t_indeps, num_types)
            # phi_mu = torch.dot(eta[pi,:], t_data[:,t])

            # get the restulting type-varying post-level coefficient
            phi_prior = dist.Normal(phi_mu, 10. * torch.ones_like(phi_mu))
            samp = pyro.sample("phi", phi_prior)
            phi = samp
    
    # for each post, use the correct set of coefficients to run our post-level regression
    with post_plate as p:
        t = p_data[-1,p].long()
        
        coefs = phi[:,t] # phi for this post.
        indeps = p_data[:-1,p] # indep vars for this post
        # calculate the mean
        mu = torch.matmul(coefs.T, indeps)  # (num_p_indeps, 1).T  (num_p_indeps, num_posts)
        # mu = torch.dot(coefs, indeps)
        
        # sample
        pyro.sample("obs", dist.Normal(mu, 1000.), obs=y[p])


In [None]:
# # x is a 3D tensor
# def guide(t_data, p_data, y):
#     num_t_indeps, num_types = t_data.shape
#     num_p_indeps, num_posts = p_data.shape
    
#     num_p_indeps -= 1 # The last p_indep is just the type.
    
#     # construct necessary plates over each level
#     type_plate = pyro.plate("type", num_types)
#     post_plate = pyro.plate("post", num_posts)
#     t_indep_plate = pyro.plate("t_indep", num_t_indeps)
#     p_indep_plate = pyro.plate("p_indep", num_p_indeps)
    
    
    
#     type_level_coef_locs = torch.empty((num_p_indeps, num_t_indeps,))   # (2,1)
#     type_level_coef_scales = torch.empty((num_p_indeps, num_t_indeps,)) # (2,1)
    
#     for pi in p_indep_plate:
#         for ti in t_indep_plate:
#             type_level_coef_locs[pi, ti] = pyro.param(f"type_level_coef_{ti}_on_pi_{pi}_loc", 
#                                                       torch.Tensor(0.))
#             type_level_coef_scales[pi, ti] = pyro.param(f"type_level_coef_{ti}_on_pi_{pi}_scale", 
#                                                         torch.Tensor(1.), 
#                                                         constraint=constraints.positive)
    
#     # type-level regression variables (shared across all types)
#     # one coef for each type-level indep var for each of num_p_indeps type-level regressions
#     type_level_coefs = torch.empty((num_p_indeps, num_t_indeps,)) # (2,1)
#     for pi in p_indep_plate:
#         for ti in t_indep_plate:
#             type_level_coefs[pi, ti] = pyro.param(f"type_level_coef_{ti}_on_pi_{pi}", dist.Normal(type_level_coef_locs[pi,ti], type_level_coef_scales[pi,ti]))
        

#     type_varying_post_level_coef_scales = torch.empty((num_p_indeps, num_types,))
#     for pi in p_indep_plate:
#         for t in type_plate:
#             type_varying_post_level_coef_scales[pi,t] = pyro.param(f"type_{t}_post_level_coef_{pi}_scale", torch.Tensor(1.), constraint=constraints.positive)
    
#     # Run a type-level regression for the coef on each post-level variable
#     type_varying_post_level_coefs = torch.empty((num_p_indeps, num_types,))
#     for pi in p_indep_plate:
#         for t in type_plate:
#             type_varying_post_level_coef_mu = torch.dot(type_level_coefs[pi,:], t_data[:,t])
# #             type_varying_post_level_coef_std = pyro.sample(f"type_{t}_post_level_coef_{pi}_std", dist.Uniform(0., 10.))

#             # get the restulting type-varying post-level coefficient
#             type_varying_post_level_coefs[pi,t] = pyro.param(f"type_{t}_post_level_coef_{pi}", dist.Normal(type_varying_post_level_coef_mu, type_varying_post_level_coef_scales[pi,t]))
        

In [None]:
nuts_kernel = NUTS(model)

mcmc = MCMC(nuts_kernel, num_samples=2000, warmup_steps=1000)
mcmc.run(t_data, p_data, y)

hmc_samples = {k: v.detach().cpu().numpy() for k, v in mcmc.get_samples().items()}

In [None]:
TYPES = ["Fake", "Fact", "Corrective"]
# Utility function to print latent sites' quantile information.
def summary_types(samples):
    site_stats = {}
    i = 0
    for site_name, values in samples.items():
#         values = values.reshape((values.shape[0], values.shape[1]))
        marginal_site = pd.DataFrame(values)
        describe = marginal_site.describe(percentiles=[.05, 0.25, 0.5, 0.75, 0.95]).transpose()
        site_stats[site_name] = describe[["mean", "std", "5%", "25%", "50%", "75%", "95%"]]
#         site_stats[site_name]["type"] = TYPES
        i += 1
    return site_stats

Type-Level Regressions

phi_0,t = eta_0^T t_data + epsilon_0

phi_1,t = eta_1^T t_data + epsilon_1

Post-Level Regression

y_pt = phi_0,t * bias + phi_1,t * first_hour_comments

In [None]:
m = pd.DataFrame(hmc_samples['eta'][:,:,0])
m.describe(percentiles=[.05, 0.25, 0.5, 0.75, 0.95]).transpose()

In [None]:
m = pd.DataFrame(hmc_samples['phi'][:,0,:])
m.describe(percentiles=[.05, 0.25, 0.5, 0.75, 0.95]).transpose()

In [None]:
m = pd.DataFrame(hmc_samples['phi'][:,1,:])
m.describe(percentiles=[.05, 0.25, 0.5, 0.75, 0.95]).transpose()

In [None]:
hmc_samples

In [None]:
for site, values in summary_types(hmc_samples).items():
    print("Coefficient: {}".format(site))
    print(values, "\n")

In [44]:
# # x is a 3D tensor
# def guide(t_data, p_data, y):
#     num_t_indeps, num_types = t_data.shape
#     num_p_indeps, num_posts = p_data.shape
    
#     num_p_indeps -= 1 # The last p_indep is just the type.
    
#     # construct necessary plates over each level
#     type_plate = pyro.plate("type", num_types)
#     post_plate = pyro.plate("post", num_posts)
#     t_indep_plate = pyro.plate("t_indep", num_t_indeps)
#     p_indep_plate = pyro.plate("p_indep", num_p_indeps)
    
    
    
#     type_level_coef_locs = torch.empty((num_p_indeps, num_t_indeps,))   # (2,1)
#     type_level_coef_scales = torch.empty((num_p_indeps, num_t_indeps,)) # (2,1)
    
#     for pi in p_indep_plate:
#         for ti in t_indep_plate:
#             type_level_coef_locs[pi, ti] = pyro.param(f"type_level_coef_{ti}_on_pi_{pi}_loc", 
#                                                       torch.Tensor(0.))
#             type_level_coef_scales[pi, ti] = pyro.param(f"type_level_coef_{ti}_on_pi_{pi}_scale", 
#                                                         torch.Tensor(1.), 
#                                                         constraint=constraints.positive)
    
#     # type-level regression variables (shared across all types)
#     # one coef for each type-level indep var for each of num_p_indeps type-level regressions
#     type_level_coefs = torch.empty((num_p_indeps, num_t_indeps,)) # (2,1)
#     for pi in p_indep_plate:
#         for ti in t_indep_plate:
#             type_level_coefs[pi, ti] = pyro.param(f"type_level_coef_{ti}_on_pi_{pi}", dist.Normal(type_level_coef_locs[pi,ti], type_level_coef_scales[pi,ti]))
        

#     type_varying_post_level_coef_scales = torch.empty((num_p_indeps, num_types,))
#     for pi in p_indep_plate:
#         for t in type_plate:
#             type_varying_post_level_coef_scales[pi,t] = pyro.param(f"type_{t}_post_level_coef_{pi}_scale", torch.Tensor(1.), constraint=constraints.positive)
    
#     # Run a type-level regression for the coef on each post-level variable
#     type_varying_post_level_coefs = torch.empty((num_p_indeps, num_types,))
#     for pi in p_indep_plate:
#         for t in type_plate:
#             type_varying_post_level_coef_mu = torch.dot(type_level_coefs[pi,:], t_data[:,t])
# #             type_varying_post_level_coef_std = pyro.sample(f"type_{t}_post_level_coef_{pi}_std", dist.Uniform(0., 10.))

#             # get the restulting type-varying post-level coefficient
#             type_varying_post_level_coefs[pi,t] = pyro.param(f"type_{t}_post_level_coef_{pi}", dist.Normal(type_varying_post_level_coef_mu, type_varying_post_level_coef_scales[pi,t]))
        

In [104]:
nuts_kernel = NUTS(model)

mcmc = MCMC(nuts_kernel, num_samples=2000, warmup_steps=1000)
mcmc.run(t_data, p_data, y)

hmc_samples = {k: v.detach().cpu().numpy() for k, v in mcmc.get_samples().items()}

Sample: 100%|██████████| 3000/3000 [01:11, 41.99it/s, step size=4.60e-01, acc. prob=0.895]


In [105]:
TYPES = ["Fake", "Fact", "Corrective"]
# Utility function to print latent sites' quantile information.
def summary_types(samples):
    site_stats = {}
    i = 0
    for site_name, values in samples.items():
#         values = values.reshape((values.shape[0], values.shape[1]))
        marginal_site = pd.DataFrame(values)
        describe = marginal_site.describe(percentiles=[.05, 0.25, 0.5, 0.75, 0.95]).transpose()
        site_stats[site_name] = describe[["mean", "std", "5%", "25%", "50%", "75%", "95%"]]
#         site_stats[site_name]["type"] = TYPES
        i += 1
    return site_stats

Type-Level Regressions

phi_0,t = eta_0^T t_data + epsilon_0

phi_1,t = eta_1^T t_data + epsilon_1

Post-Level Regression

y_pt = phi_0,t * bias + phi_1,t * first_hour_comments

In [143]:
m = pd.DataFrame(hmc_samples['eta'][:,:,0])
m.describe(percentiles=[.05, 0.25, 0.5, 0.75, 0.95]).transpose()

Unnamed: 0,count,mean,std,min,5%,25%,50%,75%,95%,max
0,2000.0,1.706751,10.270098,-29.916935,-15.337079,-5.248812,1.612831,8.839247,18.441372,37.313736
1,2000.0,9.044466,4.997986,-8.644859,0.661086,5.803686,8.933012,12.146065,17.423727,26.772738


In [138]:
m = pd.DataFrame(hmc_samples['phi'][:,0,:])
m.describe(percentiles=[.05, 0.25, 0.5, 0.75, 0.95]).transpose()

Unnamed: 0,count,mean,std,min,5%,25%,50%,75%,95%,max
0,2000.0,2.196438,14.104898,-50.165806,-21.042094,-7.121529,2.229118,11.615298,25.494761,47.577095
1,2000.0,2.341028,14.508185,-49.305668,-21.621042,-7.537164,2.469201,12.378116,25.953935,51.454567
2,2000.0,2.311025,14.554549,-65.808296,-21.430155,-7.208467,2.319745,12.079975,26.343111,48.761379


In [139]:
m = pd.DataFrame(hmc_samples['phi'][:,1,:])
m.describe(percentiles=[.05, 0.25, 0.5, 0.75, 0.95]).transpose()

Unnamed: 0,count,mean,std,min,5%,25%,50%,75%,95%,max
0,2000.0,12.135343,1.202159,7.908581,10.209356,11.277381,12.104874,12.982046,14.066499,15.807629
1,2000.0,12.133224,1.185028,7.895435,10.183636,11.32802,12.143082,12.947352,14.082853,16.358557
2,2000.0,12.157056,1.250139,8.331654,10.151919,11.260973,12.143256,12.998227,14.22685,15.994568


In [122]:
hmc_samples

{'eta': array([[[-4.3591666],
         [ 8.260902 ]],
 
        [[-5.7180996],
         [10.742426 ]],
 
        [[-4.659203 ],
         [12.412684 ]],
 
        ...,
 
        [[ 3.0134025],
         [17.954956 ]],
 
        [[-8.488964 ],
         [17.422304 ]],
 
        [[ 0.9033203],
         [ 1.0316684]]], dtype=float32),
 'phi': array([[[  0.20939255,   5.0939555 , -11.443418  ],
         [ 10.755408  ,   9.72297   ,  10.885258  ]],
 
        [[ -0.35368305,   1.3452442 , -10.354859  ],
         [ 11.450345  ,  10.128931  ,   9.690924  ]],
 
        [[ -1.8811574 ,   2.898465  , -13.184977  ],
         [ 11.258858  ,  10.710872  ,  10.185942  ]],
 
        ...,
 
        [[-12.598831  ,  -8.250435  ,  -3.7482057 ],
         [ 11.766134  ,  11.714023  ,  11.396531  ]],
 
        [[ -2.9567347 ,  13.471781  ,   2.5307357 ],
         [ 10.347951  ,  12.134154  ,  10.812072  ]],
 
        [[  3.879755  ,  -3.5656176 ,  13.715727  ],
         [ 11.789518  ,  12.360553  ,  12.128419 

In [107]:
for site, values in summary_types(hmc_samples).items():
    print("Coefficient: {}".format(site))
    print(values, "\n")

ValueError: Must pass 2-d input