# Single Level Model
In our simplest model, we will just model each post.

Have found this to be a useful resource for a hierarchcal model example: https://github.com/pyro-ppl/pyro/blob/dev/examples/baseball.py
As well as https://pyro.ai/examples/forecasting_iii.html

# To start, we will use dummy data

In [1]:
import numpy as np
import pandas as pd
import torch
import pyro
from pyro.infer import MCMC, NUTS
import pyro.distributions as dist
from pyro.distributions.util import scalar_like
from torch.distributions import constraints
import json

In [2]:
pyro.enable_validation(__debug__)
pyro.set_rng_seed(0)

First, we load the Reddit datasets

In [3]:
comments = dict()
with open('../data/results/Comments.json') as f:
    for line in f:
        post = json.loads(line)
        comments[post['pid']] = post['api_num_comments'], post['comments']

In [4]:
corrections = []
with open('../data/results/CorrectionPairs.json') as f:
    for line in f:
        corrections.append(json.loads(line))

In [5]:
news = []
with open('../data/results/NewsPairs.json') as f:
    for line in f:
        news.append(json.loads(line))

Gather relevant variables

In [7]:
def processData(data, items, comments, offset=0):
    for idx, n in enumerate(items):
        i = idx + offset
        isNews = 'isFakeStory' in n['r']['reviewRating']
        news_id = n['p']['id']
        
        num_cmts, cmts = comments[news_id]
        c_body_lens = []
        c_ups = []
        c_downs = []
        unique_authors = set()
        for c in cmts:
            c_body_lens.append(c['body_len'])
            c_ups.append(c['ups'])
            c_downs.append(c['downs'])
            if c['author']:
                unique_authors.add(c['author'])

            hours = c['delta_seconds'] / 3600.0
            if hours <= 0.25:   # cmts in 15 mins
                data[i, 7] += 1
            if hours <= 0.5:    # cmts in 30 mins
                data[i, 8] += 1
            if hours <= 1:      # cmts in 1 hour
                data[i, 9] += 1
            if hours <= 2:      # cmts in 2 hours
                data[i, 10] += 1
            if hours <= 3:      # cmts in 3 hours
                data[i, 11] += 1
        
        if isNews:
            data[i, 0] = 1 if n['r']['reviewRating']['isFakeStory'] else 0
        else:
            data[i, 0] = 3 if n['r']['reviewRating']['isFakeClaim'] else 2
        data[i, 1] = num_cmts
        data[i, 2] = np.mean(c_body_lens) if c_body_lens else 0.
        data[i, 3] = np.std(c_body_lens) if c_body_lens else 0.
        data[i, 4] = np.mean(c_ups) if c_ups else 0.
        data[i, 5] = np.std(c_ups) if c_ups else 0.
        data[i, 6] = len(unique_authors) if unique_authors else 0.
    return data

In [8]:
num_p_indep = 12
data = torch.zeros((len(news) + len(corrections), num_p_indep))

data = processData(data, news, comments)
data = processData(data, corrections, comments, offset=len(news))

Variables (in order):

0. type
1. num_comments
2. comment_length_avg
3. comment_length_std
4. comment_upvotes_avg
5. comment_upvotes_std
6. num_unique_comment_authors
7. num_comments in first 15 mins
8. num_comments in first 30 mins
9. num_comments in first 1 hour
10. num_comments in first 2 hours
11. num_comments in first 3 hours

In [11]:
data[0,:]

tensor([  0.0000, 132.0000, 262.4167, 371.5136,   1.9394,   3.9882,  35.0000,
          0.0000,   0.0000,   2.0000,   2.0000,   3.0000])

Let's try using non-rectangular data to do the same thing. (Get rid of this type level and make it into a categorical variable instead!)

In [10]:
p_data.T

tensor([[  1.,   0., 131.],
        [  1.,   0.,   0.],
        [  1.,   1.,   1.],
        ...,
        [  1.,   3.,   0.],
        [  1.,   3.,   1.],
        [  1.,   3.,   6.]])

In [17]:
# Post-Level Data
p_data = torch.Tensor([[1, 100], [1, 250], [1, 125], [1, 150],
                       [1, 50],  [1, 100], [1, 150], [1, 125],
                       [1, 20],  [1, 40], [1, 30], [1, 35]])
p_data = p_data.transpose(0,1)
p_data
# dim 0: post-level vars: (bias, commentsFirstHour) 
# dim 1: obs (post)

tensor([[  1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.],
        [100., 250., 125., 150.,  50., 100., 150., 125.,  20.,  40.,  30.,  35.]])

In [18]:
# Post-Level Data
p_data = torch.Tensor([[1, 100], [1, 250], [1, 125], [1, 150],
                       [1, 50],  [1, 100], [1, 150], [1, 125],
                       [1, 20],  [1, 40], [1, 30], [1, 35]])

In [23]:
p_data.T

tensor([[  1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.],
        [100., 250., 125., 150.,  50., 100., 150., 125.,  20.,  40.,  30.,  35.]])

In [7]:
y = torch.Tensor([1000., 3000., 1500., 1500.,  800., 2500., 1600., 1200.,  300.,  500., 1000.,  600.])

In [25]:
p_data.shape

torch.Size([2, 12])

In [9]:
p_data

tensor([[  1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.],
        [100., 250., 125., 150.,  50., 100., 150., 125.,  20.,  40.,  30.,  35.]])

In [10]:
y

tensor([1000., 3000., 1500., 1500.,  800., 2500., 1600., 1200.,  300.,  500.,
        1000.,  600.])

Post-Level Regression

y_p = phi_0 * bias + phi_1 * first_hour_comments + epsilon

In [26]:
# x is a 3D tensor
def model(p_data, y):
    num_p_indeps, num_posts = p_data.shape
        
    # define a prior for our regression variables
    phi_prior = dist.Normal(torch.zeros((num_p_indeps,1)),
                            10. * torch.ones((num_p_indeps,1))) # (num_p_indeps,1)
    
    phi = pyro.sample("phi", phi_prior)
    
    
    # for each post, use the correct set of coefficients to run our post-level regression
    with pyro.plate("post", num_posts, dim=-1) as p:
        
        # indep vars for this post
        indeps = p_data[:,p] 
        
        # calculate the mean
        mu = torch.matmul(phi.transpose(0,1), indeps)  # (num_p_indeps, 1).T  (num_p_indeps, num_posts)
        
        # sample
        pyro.sample("obs", dist.Normal(mu, 1000.), obs=y[p])


In [27]:
nuts_kernel = NUTS(model)

mcmc = MCMC(nuts_kernel, num_samples=2000, warmup_steps=1000)
mcmc.run(p_data, y)

hmc_samples = {k: v.detach().cpu().numpy() for k, v in mcmc.get_samples().items()}

Sample: 100%|██████████| 3000/3000 [00:45, 66.22it/s, step size=8.63e-01, acc. prob=0.918]


In [28]:
# Utility function to print latent sites' quantile information.
def summary_types(samples):
    site_stats = {}
    i = 0
    for site_name, values in samples.items():
        marginal_site = pd.DataFrame(values)
        describe = marginal_site.describe(percentiles=[.05, 0.25, 0.5, 0.75, 0.95]).transpose()
        site_stats[site_name] = describe[["mean", "std", "5%", "25%", "50%", "75%", "95%"]]
        i += 1
    return site_stats

In [29]:
hmc_samples['phi'].shape

(2000, 2, 1)

In [30]:
m = pd.DataFrame(hmc_samples['phi'][:,:,0])
m.describe(percentiles=[.05, 0.25, 0.5, 0.75, 0.95]).transpose()

Unnamed: 0,count,mean,std,min,5%,25%,50%,75%,95%,max
0,2000.0,0.360312,10.261937,-32.355705,-15.955612,-6.793513,0.230579,7.447685,17.548048,37.224907
1,2000.0,11.574994,2.344521,2.988956,7.606329,9.967288,11.586669,13.157027,15.47024,19.021179
