# Simple Model
In our simpler model, we will just model each post as posting about a story coming from one of three groups:
- Factual, Disputed Story
- Fake, Disputed Story
- Corrective Story

Have found this to be a useful resource for a hierarchcal model example: https://github.com/pyro-ppl/pyro/blob/dev/examples/baseball.py
As well as https://pyro.ai/examples/forecasting_iii.html

# To start, we will use dummy data

In [56]:
import pandas as pd
import torch
import pyro
from pyro.infer import MCMC, NUTS
import pyro.distributions as dist
from pyro.infer.mcmc.util import initialize_model, summary

In [57]:
pyro.enable_validation(__debug__)
pyro.set_rng_seed(0)

In [58]:
data = pd.DataFrame({"Type": ["Fake", "Fact", "Corrective", "Fake", "Fact", "Corrective", "Fake", "Fact", "Corrective"],
                     "CommentsFirstHour": [100, 50, 20, 250, 100, 40, 125, 150, 30],
                     "Engagement": [1000, 800, 300, 3000, 2500, 500, 1500, 1600, 1000]})
data

Unnamed: 0,Type,CommentsFirstHour,Engagement
0,Fake,100,1000
1,Fact,50,800
2,Corrective,20,300
3,Fake,250,3000
4,Fact,100,2500
5,Corrective,40,500
6,Fake,125,1500
7,Fact,150,1600
8,Corrective,30,1000


In [59]:
data = torch.Tensor([[[100, 1000], [250, 3000], [125, 1500]],
                     [[50,  800],  [100, 2500], [150, 1600]],
                     [[20,  300],  [40,  500],  [30,  1000]]])
# dim 0: Type: (Fake, Fact, Corrective)
# dim 1: post
# dim 2: obs (vars): (commentsFirstHour, Engagement)

In [60]:
data

tensor([[[ 100., 1000.],
         [ 250., 3000.],
         [ 125., 1500.]],

        [[  50.,  800.],
         [ 100., 2500.],
         [ 150., 1600.]],

        [[  20.,  300.],
         [  40.,  500.],
         [  30., 1000.]]])

In [61]:
x = data[:,:,:1]
y = data[:,:,1]

In [62]:
x

tensor([[[100.],
         [250.],
         [125.]],

        [[ 50.],
         [100.],
         [150.]],

        [[ 20.],
         [ 40.],
         [ 30.]]])

In [63]:
y

tensor([[1000., 3000., 1500.],
        [ 800., 2500., 1600.],
        [ 300.,  500., 1000.]])

In [64]:
# x is a 2D tensor of num
def model(x, y):
    num_types, num_posts, num_indeps = x.shape
    
    # construct necessary plates over each level
    type_plate = pyro.plate("type", num_types, dim=-2)
    post_plate = pyro.plate("post", num_posts, dim=-1)
    
    # sample some kind of shared variables here.
#     pyro.sample

    with type_plate:
        type_level = pyro.sample("type_level", dist.Normal(0, 10))
    
    prediction = type_level
    return prediction

In [65]:
nuts_kernel = NUTS(model)

mcmc = MCMC(nuts_kernel, num_samples=2000, warmup_steps=250)
mcmc.run(x, y)

hmc_samples = {k: v.detach().cpu().numpy() for k, v in mcmc.get_samples().items()}

Warmup:   0%|          | 0/2250 [00:00, ?it/s]Warmup:   0%|          | 11/2250 [00:00, 103.30it/s, step size=5.37e+00, acc. prob=0.702]Warmup:   1%|          | 28/2250 [00:00, 115.79it/s, step size=6.46e+00, acc. prob=0.759]Warmup:   2%|▏         | 46/2250 [00:00, 129.63it/s, step size=8.56e+00, acc. prob=0.774]Warmup:   3%|▎         | 65/2250 [00:00, 142.66it/s, step size=3.70e+00, acc. prob=0.773]Warmup:   4%|▎         | 79/2250 [00:00, 138.06it/s, step size=5.20e+00, acc. prob=0.778]Warmup:   4%|▍         | 98/2250 [00:00, 149.64it/s, step size=6.38e+00, acc. prob=0.782]Warmup:   5%|▌         | 113/2250 [00:00, 131.13it/s, step size=1.38e+00, acc. prob=0.780]Warmup:   6%|▌         | 127/2250 [00:00, 106.81it/s, step size=5.83e-01, acc. prob=0.779]Warmup:   6%|▌         | 140/2250 [00:01, 107.68it/s, step size=3.76e-01, acc. prob=0.779]Warmup:   7%|▋         | 153/2250 [00:01, 111.10it/s, step size=4.64e-01, acc. prob=0.781]Warmup:   7%|▋         | 165/2250 [00:01, 112.32

In [66]:
def get_summary_table(posterior, sites, player_names, transforms={}, diagnostics=False, group_by_chain=False):
    """
    Return summarized statistics for each of the ``sites`` in the
    traces corresponding to the approximate posterior.
    """
    site_stats = {}

    for site_name in sites:
        marginal_site = posterior[site_name].cpu()

        if site_name in transforms:
            marginal_site = transforms[site_name](marginal_site)

        site_summary = summary({site_name: marginal_site}, prob=0.5, group_by_chain=group_by_chain)[site_name]
        if site_summary["mean"].shape:
            site_df = pd.DataFrame(site_summary, index=player_names)
        else:
            site_df = pd.DataFrame(site_summary, index=[0])
        if not diagnostics:
            site_df = site_df.drop(["n_eff", "r_hat"], axis=1)
        site_stats[site_name] = site_df.astype(float).round(2)

    return site_stats

In [67]:
# # Utility function to print latent sites' quantile information.
# def summary(samples):
#     site_stats = {}
#     for site_name, values in samples.items():
#         marginal_site = pd.DataFrame(values)
#         describe = marginal_site.describe(percentiles=[.05, 0.25, 0.5, 0.75, 0.95]).transpose()
#         site_stats[site_name] = describe[["mean", "std", "5%", "25%", "50%", "75%", "95%"]]
#     return site_stats

In [68]:
for site, values in summary(hmc_samples).items():
    print("Coefficient: {}".format(site))
    print(values, "\n")

ValueError: Must pass 2-d input

In [69]:
get_summary_table(mcmc.get_samples(group_by_chain=True),
                                   sites=["type_level"],
                                   player_names=["Fake", "Fact", "Corr"],
                                   diagnostics=True,
                                   group_by_chain=True)["type_level"]

TypeError: summary() got an unexpected keyword argument 'prob'