# Single Level Model
In our simplest model, we will just model each post.

Have found this to be a useful resource for a hierarchcal model example: https://github.com/pyro-ppl/pyro/blob/dev/examples/baseball.py
As well as https://pyro.ai/examples/forecasting_iii.html

# To start, we will use dummy data

In [10]:
import json
import numpy as np
import pandas as pd
import torch

In [15]:
# import torch
# import pyro
# from pyro.infer import MCMC, NUTS
# import pyro.distributions as dist
# from pyro.distributions.util import scalar_like
# from torch.distributions import constraints
# import json

In [16]:
# pyro.enable_validation(__debug__)
# pyro.set_rng_seed(0)

First, we load the Reddit datasets

In [3]:
comments = dict()
with open('../data/results/Comments.json') as f:
    for line in f:
        post = json.loads(line)
        comments[post['pid']] = post['api_num_comments'], post['comments']

In [4]:
corrections = []
with open('../data/results/CorrectionPairs.json') as f:
    for line in f:
        corrections.append(json.loads(line))

In [5]:
news = []
with open('../data/results/NewsPairs.json') as f:
    for line in f:
        news.append(json.loads(line))

Gather relevant variables

In [6]:
def processData(data, items, comments, minutes=60, offset=0):
    for idx, n in enumerate(items):
        i = idx + offset
        
        isNews = 'isFakeStory' in n['r']['reviewRating']
        news_id = n['p']['id']
        
        num_cmts, cmts = comments[news_id]
        c_body_lens = []
        c_ups = []
        c_downs = []
        unique_authors = set()
        for c in cmts:
            # skip if comment not created in first _ mins
            if c['delta_seconds'] > minutes * 60:
                continue

            c_minutes = c['delta_seconds'] / 60
            if c_minutes <= minutes:   # cmts in first _ mins
                data[i, 7] += 1
            
            c_body_lens.append(c['body_len'])
            c_ups.append(c['ups'])
            c_downs.append(c['downs'])
            
            if c['author']:
                unique_authors.add(c['author'])

        data[i, 0] = num_cmts
        if isNews:
            data[i, 1] = 1 if n['r']['reviewRating']['isFakeStory'] else 0
        else:
            data[i, 1] = 3 if n['r']['reviewRating']['isFakeClaim'] else 2
        data[i, 2] = np.mean(c_body_lens) if c_body_lens else 0.
        data[i, 3] = np.std(c_body_lens) if c_body_lens else 0.
        data[i, 4] = np.mean(c_ups) if c_ups else 0.
        data[i, 5] = np.std(c_ups) if c_ups else 0.
        data[i, 6] = len(unique_authors) if unique_authors else 0.
    return data

In [None]:
import torch

In [11]:
num_p_indep = 8
data = torch.zeros((len(news) + len(corrections), num_p_indep))

data = processData(data, news, comments)
data = processData(data, corrections, comments, offset=len(news))

Variables (in order):

0. num_comments
1. type
2. comment_length_avg
3. comment_length_std
4. comment_upvotes_avg
5. comment_upvotes_std
6. num_unique_comment_authors
7. num_comments in first _ mins

In [12]:
# select relevant indep vars
p_data = data[:, (2,4,6,7)]  # avg cmt length, avg upvotes, num authors
t_data = data[:,1].reshape(-1,1)

# add bias terms
biases = torch.ones_like(t_data)
p_data = torch.cat((biases, p_data), dim=1)
t_data = torch.cat((biases, t_data), dim=1)

# get dep var
y = data[:,0].reshape(-1,1)

-----

# USING NUMPYRO

Post-Level Regression

y_p = phi_0 * bias + phi_1 * first_hour_comments + epsilon

In [24]:
import numpyro
import jax.numpy as jnp
from numpyro.infer import MCMC, NUTS, Predictive
import numpyro.distributions as dist
from jax import random

In [18]:
p_data = np.array(p_data)
y = np.array(y)

In [26]:
# x is a 3D tensor
def model(p_data, y):
    num_posts, num_p_indeps = p_data.shape

    # define a prior for our regression variables
    phi_prior = dist.Normal(np.zeros((num_p_indeps, 1)),
                            10. * np.ones((num_p_indeps, 1)))  # (num_p_indeps, 1)
    phi = numpyro.sample("phi", phi_prior)  # (num_p_indeps, 1)
    
    # for each post, use the correct set of coefficients to run our post-level regression
    with numpyro.plate("post", num_posts, dim=-1) as p:

        # indep vars for this post
        indeps = p_data[p,:]  # (num_posts, num_p_indeps)
        
        # calculate the mean
#        mu = torch.matmul(indeps, phi)  # (num_posts, num_p_indeps) (num_p_indeps, 1)
        mu = jnp.matmul(indeps, phi)  # (num_posts, num_p_indeps) (num_p_indeps, 1)
        
        # sample
        numpyro.sample("obs", dist.Normal(mu, 1000.), obs=y[p])  # (num_posts, 1)

In [None]:
nuts_kernel = NUTS(model)

mcmc = MCMC(nuts_kernel, num_samples=2000, num_warmup=1000)
rng_key = random.PRNGKey(0)
mcmc.run(rng_key, p_data, y)

warmup:   3%|▎         | 84/3000 [32:08<185:29:10, 229.00s/it, 1023 steps of size 4.66e-09. acc. prob=0.67]

In [None]:
import arviz as az
data = az.from_pyro(posterior)
az.plot_trace(data, compact=True);

# NUMPYRO EXPERIMENT END

----

In [83]:
# nuts_kernel = NUTS(model)
# mcmc = MCMC(nuts_kernel, num_samples=1000, warmup_steps=1000)
# mcmc.run(p_data, y)

# print(mcmc.summary())
# samples = mcmc.get_samples()

Sample: 100%|██████████| 2000/2000 [00:08, 232.45it/s, step size=9.69e-01, acc. prob=0.895]


                mean       std    median      5.0%     95.0%     n_eff     r_hat
  phi[0,0]      6.24     10.17      6.00     -9.96     22.84    921.20      1.00
  phi[1,0]      9.11      0.69      9.13      7.98     10.20    862.47      1.00

Number of divergences: 0
None





In [27]:
# Utility function to print latent sites' quantile information.
def summary_types(samples):
    site_stats = {}
    i = 0
    for site_name, values in samples.items():
        marginal_site = pd.DataFrame(values)
        describe = marginal_site.describe(percentiles=[.05, 0.25, 0.5, 0.75, 0.95]).transpose()
        site_stats[site_name] = describe[["mean", "std", "5%", "25%", "50%", "75%", "95%"]]
        i += 1
    return site_stats

In [29]:
m = pd.DataFrame(posterior_samples['phi'][:,:,0])
m.describe(percentiles=[.05, 0.25, 0.5, 0.75, 0.95]).transpose()

Unnamed: 0,count,mean,std,min,5%,25%,50%,75%,95%,max
0,2000.0,0.047978,9.97055,-30.814848,-16.604683,-6.534851,-0.109931,6.536325,16.897712,34.151043
1,2000.0,11.473513,2.381326,2.492058,7.64638,9.919976,11.44216,13.075683,15.377494,19.758049


In [29]:
m = pd.DataFrame(hmc_samples['phi'][:,:,0])
m.describe(percentiles=[.05, 0.25, 0.5, 0.75, 0.95]).transpose()

Unnamed: 0,count,mean,std,min,5%,25%,50%,75%,95%,max
0,2000.0,0.047978,9.97055,-30.814848,-16.604683,-6.534851,-0.109931,6.536325,16.897712,34.151043
1,2000.0,11.473513,2.381326,2.492058,7.64638,9.919976,11.44216,13.075683,15.377494,19.758049
