In [1]:
import os
from functools import partial
import torch
import torch.nn as nn

import pyro
from pyro.distributions import Normal, Uniform, Delta
from pyro.infer import SVI, Trace_ELBO
from pyro.optim import Adam
from pyro.distributions.util import logsumexp
from pyro.infer import EmpiricalMarginal, TracePredictive
from pyro.infer.mcmc import MCMC, NUTS
import pyro.optim as optim
import pyro.poutine as poutine

# Flag set when running under continuous integration (the 'CI' environment
# variable is present).
smoke_test = ('CI' in os.environ)
# This notebook was written against the Pyro 0.4.1 API; fail fast otherwise.
assert pyro.__version__.startswith('0.4.1')
# Turn on Pyro's argument/shape validation once, and seed the RNG so the run
# is reproducible.
pyro.enable_validation(True)
pyro.set_rng_seed(1)

In [2]:
"""Loading data"""

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
import torch


def load_tensor_data(fileloc, min_revenue=1000000, min_movies=20):
    """
    Load the one-hot encoded movie/actor table and return training tensors.

    The CSV is read with its first column as the index. It must contain a
    ``revenue`` column; every column from position 2 onwards is treated as a
    per-actor indicator column (this assumes the first two columns are
    ``title_x`` and ``revenue`` -- TODO confirm against the data file).

    Filtering:
      * actor columns are kept only if their column sum, computed over ALL
        rows (before the revenue filter), is at least ``min_movies``;
      * rows are kept only if ``revenue`` is strictly greater than
        ``min_revenue``.

    Parameters
    ----------
    fileloc : str or path-like
        Location of the CSV file.
    min_revenue : float, optional
        Exclusive lower bound on revenue (default 1,000,000).
    min_movies : int, optional
        Minimum column sum for an actor column to be kept (default 20).

    Returns
    -------
    x_train_tensors : torch.Tensor
        float32 tensor of shape (n_rows, n_actors) with the actor columns in
        sorted column-name order.
    y_train_tensors : torch.Tensor
        float32 tensor of shape (n_rows,) holding ``log1p(revenue)``.
    columns : pandas.Index
        Actor column names, aligned with the columns of ``x_train_tensors``.
    """
    data_actors = pd.read_csv(fileloc, index_col=0)

    # Names that must never be treated as actor features.
    non_actor_cols = ['title_x', 'revenue', 'log_revenue']

    # Count appearances per candidate column on the unfiltered table.
    candidate_cols = data_actors.columns[2:]
    appearances = data_actors[candidate_cols].sum(axis=0)
    frequent_cols = appearances.index[appearances >= min_movies]
    # Index.difference also sorts the names, which fixes the feature order.
    actor_cols = frequent_cols.difference(non_actor_cols)

    # Keep only movies above the revenue threshold.
    data_million = data_actors.loc[data_actors["revenue"] > min_revenue]
    x_train = data_million[actor_cols]
    # log1p is computed in float64 and then cast down to float32.
    log_revenue = np.log1p(data_million["revenue"].astype('float64'))

    x_train_tensors = torch.tensor(x_train.to_numpy(dtype='float32'))
    y_train_tensors = torch.tensor(log_revenue.to_numpy(dtype='float32'))

    return x_train_tensors, y_train_tensors, x_train.columns

In [3]:
# Read the one-hot encoded movie table and unpack the feature tensor, the
# log-revenue target tensor and the actor column names.
x_train_tensors, y_train_tensors, actors = load_tensor_data(
    "../data/ohe_movies.csv"
)

In [4]:
# Let's define a new regression model
class RegressionModel(nn.Module):
    """
    Linear regression with ``p`` input features and a single scalar output.

    Parameters
    ----------
    p : int
        Number of input features (width of the design matrix).
    """

    def __init__(self, p):
        super(RegressionModel, self).__init__()
        self.linear = nn.Linear(p, 1)

    def forward(self, x):
        """
        Return one prediction per input row.

        ``self.linear`` yields a trailing dimension of size 1; it is squeezed
        away so that an input of shape (n, p) gives an output of shape (n,)
        for any number of rows, rather than for one fixed dataset size.
        """
        return self.linear(x).squeeze(-1)
    
    
# Number of actor features, taken from the loaded design matrix so it cannot
# drift out of sync with the data.
p = x_train_tensors.shape[1]
regression_model = RegressionModel(p)

In [5]:
def model(x_data, y_data):
    """
    Bayesian linear regression model over the module-level ``regression_model``.

    A noise scale is drawn at the "sigma" site, the linear layer's weight and
    bias are replaced by draws from their priors via ``pyro.random_module``,
    and one "obs" site per row of ``x_data`` is sampled inside the "map"
    plate.

    ``y_data`` is accepted but not read in this function: the "obs" site is
    sampled without an ``obs=`` argument, so observations have to be supplied
    from outside (e.g. by conditioning the "obs" site).

    Returns the per-row prediction mean produced by the sampled regressor.
    """
    n_features = x_data.shape[1]

    # Standard-normal prior on each weight; a very wide normal centred at 8
    # on the bias. to_event(1) declares the last dimension as the event dim.
    weight_prior = Normal(
        torch.zeros(1, n_features), torch.ones(1, n_features)
    ).to_event(1)
    bias_prior = Normal(
        torch.tensor([[8.]]), torch.tensor([[1000.]])
    ).to_event(1)

    # Observation noise scale.
    sigma = pyro.sample("sigma", Uniform(0., 10.))

    # Lift the deterministic module to a distribution over modules and draw
    # one concrete regressor from it.
    sampled_regressor = pyro.random_module(
        "module",
        regression_model,
        {'linear.weight': weight_prior, 'linear.bias': bias_prior},
    )()

    with pyro.plate("map", len(x_data)):
        prediction_mean = sampled_regressor(x_data)
        pyro.sample("obs", Normal(prediction_mean, sigma))

    return prediction_mean

In [6]:
from pyro.infer.autoguide import AutoDiagonalNormal
# initialize the autodiagonal with init_to_feasible instead of init_to_median
from pyro.infer.autoguide import init_to_feasible

In [7]:
# NOTE(review): this rebinds `optim`, which the import cell bound to the
# `pyro.optim` module; the name is kept here in case later cells rely on it.
optim = Adam({"lr": 0.03})
# Condition the "obs" site on the observed log-revenues, flattened to 1-D
# without assuming a particular number of rows.
cond_model = pyro.condition(model, data={"obs": y_train_tensors.reshape(-1)})
# Diagonal-normal variational guide, initialised with init_to_feasible.
guide = AutoDiagonalNormal(cond_model, init_loc_fn=init_to_feasible)
svi = SVI(cond_model, guide, optim, loss=Trace_ELBO(), num_samples=10000)

In [8]:
pyro.set_rng_seed(101)
num_iterations = 1000


def train():
    """
    Run ``num_iterations`` SVI steps on the training tensors.

    Clears the Pyro parameter store first so each call starts from scratch,
    then prints the loss divided by the number of training rows every 100
    iterations.
    """
    pyro.clear_param_store()
    n_rows = len(x_train_tensors)
    # Second positional argument forwarded to the model's `y_data` parameter;
    # reshaped to a single row without hard-coding the dataset size.
    y_row = y_train_tensors.reshape(1, -1)
    for j in range(num_iterations):
        loss = svi.step(x_train_tensors, y_row)
        if j % 100 == 0:
            print("[iteration %04d] loss: %.4f" % (j + 1, loss / n_rows))


train()

[iteration 0001] loss: 5.1506
[iteration 0101] loss: 4.9681
[iteration 0201] loss: 5.1123
[iteration 0301] loss: 4.4013
[iteration 0401] loss: 4.0884
[iteration 0501] loss: 3.8867
[iteration 0601] loss: 3.8461
[iteration 0701] loss: 3.7510
[iteration 0801] loss: 3.6523
[iteration 0901] loss: 3.5138


In [9]:
# Print each entry of the Pyro parameter store: its name, current value and
# shape.
for name in pyro.get_param_store().keys():
    value = pyro.param(name)
    print(name, value, value.shape)

auto_loc tensor([1.6516, 2.4027, 2.2196, 1.7156, 1.7994, 1.8234, 1.8741, 2.0486, 1.8617,
        2.2979, 2.5839, 2.1454, 2.3602, 2.0813, 1.6913, 2.2390, 2.1334, 2.5048,
        1.7655, 2.1267, 1.8169, 1.5912, 3.0452, 2.4081, 1.7902, 1.3663, 1.9339,
        1.7733, 1.6583, 1.9213, 1.7080, 1.6151, 1.7449, 1.9513, 2.3523, 1.5401,
        1.3279, 1.9502, 1.5366, 2.0597, 1.7777, 2.4449, 2.0713, 1.9682, 1.9513,
        1.5997, 1.8520, 1.9464, 2.6937, 1.9294, 1.9967, 1.5112, 1.9563, 1.9197,
        1.9054, 1.7290, 1.7937, 1.5480, 1.7804, 1.3770, 1.6052, 1.7849, 1.9197,
        2.2705, 1.5669, 1.9968, 2.0442, 1.6663, 2.1612, 1.4873, 2.8832, 1.5932,
        1.6935, 2.0370, 1.6395, 1.9602, 1.6455, 1.7792, 1.9753, 1.5651, 1.6396,
        1.5686, 2.2344, 2.2318, 1.3054, 1.6382, 2.3079, 2.7425, 2.0911, 2.1377,
        1.7925, 1.9239, 3.1586, 2.7260, 1.8751, 2.2383, 2.0208, 1.5134, 2.3350,
        1.8010, 1.7736, 1.6835, 3.2053, 1.9118, 1.9059, 1.7612, 2.3296, 2.0273,
        1.8584, 3.0875, 1.9990,

In [11]:
# Write the current contents of Pyro's parameter store to the file
# "linreg_params" (path is relative to the working directory).
pyro.get_param_store().save("linreg_params")

In [None]:
# Save params 