# Single Level Model
In our simplest model, we will just model each post.

I have found this to be a useful resource for a hierarchical model example: https://github.com/pyro-ppl/pyro/blob/dev/examples/baseball.py
as well as https://pyro.ai/examples/forecasting_iii.html

# To start, we will use dummy data

In [1]:
import pandas as pd
import torch
import pyro
from pyro.infer import MCMC, NUTS
import pyro.distributions as dist
from pyro.distributions.util import scalar_like
from torch.distributions import constraints
import json

In [2]:
# Turn Pyro's validation checks on whenever Python runs without -O
# (__debug__ is True in that case).
pyro.enable_validation(__debug__)
# Fix the random seed so repeated runs of the notebook are reproducible.
pyro.set_rng_seed(0)

First, we load the Reddit datasets

In [3]:
# The file is JSON-lines: every line is parsed as one JSON document.
comments = []
with open('../data/results/Comments.json') as handle:
    comments.extend(map(json.loads, handle))

In [4]:
# The file is JSON-lines: every line is parsed as one JSON document.
corrections = []
with open('../data/results/CorrectionPairs.json') as handle:
    corrections.extend(map(json.loads, handle))

In [5]:
# The file is JSON-lines: every line is parsed as one JSON document.
news = []
with open('../data/results/NewsPairs.json') as handle:
    news.extend(map(json.loads, handle))

Gather relevant variables

In [6]:
# Map each news post id -> (isFakeStory flag, num_comments).
# A repeated id keeps the values of its last occurrence.
news_dict = {
    pair['p']['id']: (
        pair['r']['reviewRating']['isFakeStory'],
        pair['p']['num_comments'],
    )
    for pair in news
}

In [7]:
# Map each correction post id -> (isFakeClaim flag, num_comments).
# A repeated id keeps the values of its last occurrence.
corr_dict = {
    pair['p']['id']: (
        pair['r']['reviewRating']['isFakeClaim'],
        pair['p']['num_comments'],
    )
    for pair in corrections
}

In [8]:
# One column per post: news posts first, correction posts after them.
#   row 0: constant 1 (bias term)
#   row 1: category code (news: 0 = not fake, 1 = fake;
#                         corrections: 2 = not fake, 3 = fake)
#   row 2: number of comments
n_news = len(news_dict)
p_data = torch.empty((3, n_news + len(corr_dict)))

for col, (is_fake, n_comments) in enumerate(news_dict.values()):
    p_data[0, col] = 1
    p_data[1, col] = 1 if is_fake else 0
    p_data[2, col] = n_comments

# Correction columns start right after the last news column.
for col, (is_fake, n_comments) in enumerate(corr_dict.values(), start=n_news):
    p_data[0, col] = 1
    p_data[1, col] = 3 if is_fake else 2
    p_data[2, col] = n_comments

Let's try using non-rectangular data to do the same thing. (Get rid of this type level and make it into a categorical variable instead!)

In [6]:
# Post-Level Data (dummy values): first-hour comment count for each post.
first_hour_comments = [100, 250, 125, 150,
                       50, 100, 150, 125,
                       20, 40, 30, 35]
# Pair every count with a constant 1 for the bias term, then flip the result
# so that variables run along dim 0 and posts along dim 1.
p_data = torch.Tensor([[1, count] for count in first_hour_comments]).transpose(0, 1)
# dim 0: post-level vars: (bias, commentsFirstHour)
# dim 1: obs (post)

In [7]:
# Observed response for each post, in the same post order as the columns of p_data.
y = torch.Tensor([1000, 3000, 1500, 1500, 800, 2500,
                  1600, 1200, 300, 500, 1000, 600])

In [25]:
# Check the layout: (number of post-level variables, number of posts).
p_data.shape

torch.Size([2, 12])

In [9]:
# Display the predictor tensor: row 0 is the bias, row 1 the first-hour comment counts.
p_data

tensor([[  1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.],
        [100., 250., 125., 150.,  50., 100., 150., 125.,  20.,  40.,  30.,  35.]])

In [10]:
# Display the observed response values, one per post.
y

tensor([1000., 3000., 1500., 1500.,  800., 2500., 1600., 1200.,  300.,  500.,
        1000.,  600.])

Post-Level Regression

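$y_p = \phi_0 \cdot \text{bias} + \phi_1 \cdot \text{first\_hour\_comments} + \epsilon$, where $\epsilon$ is Gaussian noise (the model below fixes its standard deviation at 1000).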

In [26]:
def model(p_data, y):
    """Post-level Bayesian linear regression with one shared coefficient vector.

    Args:
        p_data: 2-D tensor indexed as (post-level variable, post).
        y: observed response, indexed by post.

    Sample sites:
        "phi": regression coefficients, shape (num variables, 1).
        "obs": Normal likelihood with a fixed scale of 1000, inside the
            "post" plate.
    """
    n_coef, n_posts = p_data.shape

    # Independent Normal(0, 10) prior on each coefficient, held as a column
    # vector so it can be multiplied against the predictor matrix below.
    # NOTE(review): these dims are left as batch dims outside any plate;
    # confirm whether .to_event(2) is wanted here.
    prior_loc = torch.zeros((n_coef, 1))
    prior_scale = torch.full((n_coef, 1), 10.)
    phi = pyro.sample("phi", dist.Normal(prior_loc, prior_scale))

    # Posts are conditionally independent given phi.
    with pyro.plate("post", n_posts, dim=-1) as idx:
        # Predictor columns for the posts selected by the plate.
        predictors = p_data[:, idx]

        # (1, n_coef) @ (n_coef, n_posts) -> (1, n_posts) predicted means.
        mu = phi.transpose(0, 1) @ predictors

        pyro.sample("obs", dist.Normal(mu, 1000.), obs=y[idx])


In [27]:
# Sample the posterior with NUTS: 1000 warm-up steps, then 2000 retained draws.
nuts_kernel = NUTS(model)
mcmc = MCMC(nuts_kernel, warmup_steps=1000, num_samples=2000)
mcmc.run(p_data, y)

# Convert every site's draws to a NumPy array for the pandas summaries below.
hmc_samples = {}
for site, draws in mcmc.get_samples().items():
    hmc_samples[site] = draws.detach().cpu().numpy()

Sample: 100%|██████████| 3000/3000 [00:45, 66.22it/s, step size=8.63e-01, acc. prob=0.918]


In [28]:
# Utility function to print latent sites' quantile information.
def summary_types(samples):
    """Summarise posterior draws for every sample site.

    Args:
        samples: mapping from site name to an array of draws whose first
            axis indexes the draw. Arrays with more than two dimensions
            (e.g. shape (draws, k, 1)) have their trailing axes flattened
            in row-major order so that each component gets its own row.

    Returns:
        dict mapping each site name to a DataFrame with one row per
        component and the columns mean, std, 5%, 25%, 50%, 75%, 95%.
    """
    percentiles = [.05, 0.25, 0.5, 0.75, 0.95]
    columns = ["mean", "std", "5%", "25%", "50%", "75%", "95%"]

    site_stats = {}
    for site_name, values in samples.items():
        # pd.DataFrame only accepts up to 2-D input; collapse any extra
        # trailing axes. Lower-dimensional input is passed through untouched.
        if getattr(values, "ndim", 0) > 2:
            values = values.reshape(values.shape[0], -1)
        described = pd.DataFrame(values).describe(percentiles=percentiles).transpose()
        site_stats[site_name] = described[columns]
    return site_stats

In [29]:
# Check the shape of the stored draws for the "phi" site.
hmc_samples['phi'].shape

(2000, 2, 1)

In [30]:
# Drop the trailing singleton axis so each coefficient becomes one column,
# then show per-coefficient summary statistics (one row per coefficient).
phi_draws = hmc_samples['phi'][:, :, 0]
m = pd.DataFrame(phi_draws)
m.describe(percentiles=[0.05, 0.25, 0.5, 0.75, 0.95]).T

Unnamed: 0,count,mean,std,min,5%,25%,50%,75%,95%,max
0,2000.0,0.360312,10.261937,-32.355705,-15.955612,-6.793513,0.230579,7.447685,17.548048,37.224907
1,2000.0,11.574994,2.344521,2.988956,7.606329,9.967288,11.586669,13.157027,15.47024,19.021179
