In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import pyro
import pyro.distributions as dist
from pyro.infer import NUTS, MCMC
# Turn on Pyro's runtime validation for models and distributions.
pyro.enable_validation()
dist.enable_validation(True)
# Seed the random number generators once, up front, so that a fresh
# "Restart Kernel -> Run All" reproduces the same sampling results.
pyro.set_rng_seed(0)

In [None]:
# Load the raw survey answers from the comma-separated file.
weather_check_data = pd.read_csv(
    '../data/weather_check/weather-check.csv',
    sep=',',
)

In [None]:
# Replace the long survey-question headers with short names that are easier
# to type, and drop the two columns the rest of the notebook does not use.
short_names = {
    'Do you typically check a daily weather report?': 'daily_check',
    'How do you typically check the weather?': 'how_check',
    ('If you had a smartwatch (like the soon to be released Apple Watch),'
     ' how likely or unlikely would you be to check'
     ' the weather on that device?'): 'smartwatch',
    'Age': 'age',
    'What is your gender?': 'gender',
    ('How much total combined money did all members of your'
     ' HOUSEHOLD earn last year?'): 'earning',
    'US Region': 'us_region',
}
unused_columns = ['RespondentID', 'A specific website or app (please provide the answer)']
weather_check_data = (
    weather_check_data
    .rename(columns=str.strip)       # headers may carry stray whitespace
    .drop(columns=unused_columns)
    .rename(columns=short_names)
)
weather_check_data

In [None]:
# Remove spaces in row values.
# `.str.strip()` is vectorized and, unlike calling `x.strip()` on every cell,
# lets missing (NaN) cells pass through instead of raising AttributeError.
weather_check_data = weather_check_data.apply(lambda column: column.str.strip())

# Remove rows where how_check, age, gender, earning or us_region is missing
# ('-' in the survey export, or NaN for a blank cell), or where the earning
# answer is 'Prefer not to answer'.
# Note: This makes the data biased, but easier for the exercise to handle.
# For real life use cases, you need to do imputation, like:
# http://pyro.ai/numpyro/bayesian_imputation.html
required_columns = ['how_check', 'age', 'gender', 'earning', 'us_region']
required_answers = weather_check_data[required_columns]
unanswered = required_answers.isin(['-']) | required_answers.isna()
incomplete_rows = (
    unanswered.any(axis=1)
    | weather_check_data['earning'].eq('Prefer not to answer')
)
# A single boolean mask replaces the two successive index-based drops.
weather_check_data = weather_check_data[~incomplete_rows]

In [None]:
def _index_categories(column, key=None):
    """Map every distinct value of `column` to its position in sorted order.

    `key` is forwarded to `sorted` for columns whose natural order is not
    the plain string order.
    """
    answers = sorted(weather_check_data[column].unique(), key=key)
    return dict(zip(answers, range(len(answers))))


how_types = _index_categories('how_check')
print(how_types)
smartwatch_types = _index_categories('smartwatch')
print(smartwatch_types)
age_types = _index_categories('age')
print(age_types)
gender_types = _index_categories('gender')
print(gender_types)
# Earning answers are ordered by: contains 'up' (those go last), then string
# length, then the string itself.
earning_types = _index_categories('earning', key=lambda s: ('up' in s, len(s), s))
print(earning_types)
region_types = _index_categories('us_region')
print(region_types)
daily_types = _index_categories('daily_check')
print(daily_types)

In [None]:
# Pin the shuffle seed so the train/test partition, and every number derived
# from it further down, is identical on each run.
train, test = train_test_split(weather_check_data, random_state=0)
test.head()

In [None]:
def encode_data_inp(df):
    """Turn survey rows into tensors.

    Each answer is replaced by its integer code from the module-level
    `*_types` lookup tables.

    Returns a tuple `(combined, region, daily_check)`:
      * `combined` - float tensor, one row per row of `df`: the one-hot
        encoding of `how_check` followed by the integer codes of
        `smartwatch`, `age`, `gender` and `earning`.
      * `region` - integer codes of `us_region`.
      * `daily_check` - integer codes of `daily_check`.
    """
    def codes(column, lookup):
        # Integer code of every answer in `column`, as a 1-D tensor.
        return torch.from_numpy(df[column].map(lookup).values)

    how_one_hot = torch.nn.functional.one_hot(codes('how_check', how_types),
                                              len(how_types))
    # The remaining covariates stay as single integer-coded columns.
    coded_columns = [
        codes(column, lookup).unsqueeze(-1)
        for column, lookup in (('smartwatch', smartwatch_types),
                               ('age', age_types),
                               ('gender', gender_types),
                               ('earning', earning_types))
    ]
    combined = torch.cat([how_one_hot, *coded_columns], dim=-1).float()
    region = codes('us_region', region_types)
    daily_check = codes('daily_check', daily_types)
    return combined, region, daily_check

In [None]:
# Sizes of the model's inputs: the width of the covariate matrix and the
# number of distinct regions.
encoded_full = encode_data_inp(weather_check_data)
num_components = encoded_full[0].shape[-1]
num_regions = len(region_types)
print("num_components: ", num_components, " num_regions:", num_regions)

In [None]:
def model(covariates, region, response=None):
    """Logistic regression for the daily-check answer with a per-region intercept.

    The linear predictor is `covariates @ beta + gamma[region]`, which is the
    same expression the evaluation cell pushes through `torch.sigmoid`, so
    the sample-site names 'beta' and 'gamma' must not be changed.

    Args:
        covariates: float tensor with `num_components` columns, one row per
            respondent (first return value of `encode_data_inp`).
        region: integer region code per respondent, used to index `gamma`.
        response: 0/1 daily-check code per respondent, or None to sample
            responses from the model instead of conditioning on them.

    Returns:
        The observed (or sampled) response tensor.
    """
    # Standard-normal priors on all coefficients: a generic default chosen
    # for this exercise, not tuned to the data.
    beta = pyro.sample(
        'beta', dist.Normal(0., 1.).expand([num_components]).to_event(1))
    gamma = pyro.sample(
        'gamma', dist.Normal(0., 1.).expand([num_regions]).to_event(1))

    logits = covariates @ beta + gamma[region]
    # encode_data_inp returns integer codes; the Bernoulli likelihood is
    # evaluated against a float tensor.
    observed = None if response is None else response.float()
    with pyro.plate('data', covariates.shape[0]):
        return pyro.sample('daily_check', dist.Bernoulli(logits=logits),
                           obs=observed)

In [None]:
# Fit the model on the training rows with the NUTS sampler:
# 100 warm-up steps followed by 1000 retained posterior samples.
tX, tr, ty = encode_data_inp(train)
kernel = NUTS(model)
mcmc = MCMC(kernel, num_samples=1000, warmup_steps=100)
mcmc.run(tX, tr, ty)

In [None]:
# Score held-out rows with 100 posterior draws.
sX, sr, sy = encode_data_inp(test)
samples = mcmc.get_samples(100)
# Linear predictor for every (draw, test row) pair in one shot:
# (draws, features) @ (features, rows) + per-draw region intercepts.
logits = samples['beta'] @ sX.T + samples['gamma'][:, sr]
# 0/1 prediction per draw and row; shape (draws, rows).
res = (torch.sigmoid(logits) >= 0.5) * 1.
# Accuracy of each individual posterior draw on the test set.
accuracy_per_draw = (1. * (res == sy)).mean(dim=-1)
# Mean accuracy, then its spread *across posterior draws*. Taking the variance
# over every single 0/1 hit indicator would only yield p * (1 - p), which
# carries no information beyond the mean.
print(accuracy_per_draw.mean())
print(accuracy_per_draw.var())