# Bayesian Logistic Regression in Stan

In [None]:
from timeit import default_timer as timer
from io import BytesIO

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import pandas as pd
import pickle

import pystan

from utils import count_divergences, compute_metrics, SEED

## 1. Data

Credit Card Fraud Detection [1]

In [4]:
%gcs read --object "gs://thesis-203306/data/creditcard.csv" --variable csv_as_bytes

In [5]:
# Parse the raw CSV bytes (read into `csv_as_bytes` by the cell above) into a DataFrame.
df = pd.read_csv(BytesIO(csv_as_bytes))

# Binary target vector taken from the 'Class' column.
y = np.array(df['Class'])
# Remove the label and the 'Time' column from the feature table. `axis` is passed
# by keyword: the positional form `drop(label, 1)` was deprecated and later
# removed from pandas, where it raises a TypeError.
df = df.drop(['Class', 'Time'], axis=1)
# Standardise 'Amount'. The scaler works on (n, 1) input and returns (n, 1)
# output, so flatten it back to 1-D before assigning to a single column.
df['Amount'] = StandardScaler().fit_transform(df['Amount'].values.reshape(-1, 1)).ravel()

# Feature matrix as a plain NumPy array (copy, independent of the DataFrame).
X = np.array(df.values)

Toy dataset

Adapted from [2]

In [3]:
def build_toy_dataset(N, D=1, noise_std=0.1):
    """Generate a small 1-D binary classification problem.

    Five inputs are spread evenly over [-6, -5] and the remaining ``N - 5``
    over [2, 6]. Labels are obtained by thresholding ``tanh(x)`` plus Gaussian
    noise at 0.5. Inputs are then shifted and scaled as ``(x - 4) / 4``.

    Args:
        N: total number of points (the first five form the left cluster).
        D: number of feature columns in the returned matrix; the reshape
            only succeeds when ``N * D == N``, i.e. ``D == 1``.
        noise_std: standard deviation of the Gaussian label noise.

    Returns:
        Tuple ``(X, y)`` with ``X`` of shape ``(N, D)`` and ``y`` an integer
        array of 0/1 labels of length ``N``.
    """
    left_cluster = np.linspace(-6, -5, num=5)
    right_cluster = np.linspace(2, 6, num=N - 5)
    raw_inputs = np.concatenate([left_cluster, right_cluster])

    # Noisy response; uses the global NumPy RNG, so seed it for reproducibility.
    response = np.tanh(raw_inputs) + np.random.normal(0, noise_std, size=N)
    labels = (response >= 0.5).astype(int)

    features = ((raw_inputs - 4.0) / 4.0).reshape((N, D))
    return features, labels
  
# Toy problem used by the inference cells below: 100 points, one feature.
N1, D1 = 100, 1
X1, y1 = build_toy_dataset(N1, D=D1, noise_std=0.1)

## 2. Model

In [None]:
# Load the precompiled model.
# The context manager closes the file handle as soon as loading finishes
# instead of leaving it open until garbage collection.
# NOTE: unpickling executes arbitrary code -- only load model files you produced yourself.
with open('bay_log_reg.pkl', 'rb') as model_file:
    sm = pickle.load(model_file)

In [None]:
# Alternatively, compile the model

# stan_code = """
# data {
#   int<lower=0> N; 
#   int<lower=0> D; 
#   int<lower=0, upper=1> y[N];
#   matrix[N,D] X; 
# }

# parameters {
#   vector[D] w; 
#   real b; 
# }

# model {  
#   w ~ normal(0, 3);
#   b ~ normal(0, 3);
#   for (n in 1:N)
#       y[n] ~ bernoulli_logit(dot_product(X[n],w) + b);
# }
# """
# sm = pystan.StanModel(model_code=stan_code)

# # Save model to file
# with open('bay_log_reg.pkl', 'wb') as f:
#     pickle.dump(sm, f)

## 3. Inference

### NUTS

In [220]:
def stan_nuts(X,y,filename,seeds=SEED,iters=1000,warmup=500,chains=2):
  """
  Runs Stan NUTS algorithm once per seed and pickles the results.

  For every seed the data is split 80/20 into train/test, the global
  compiled model `sm` is sampled on the training part, and each retained
  draw (averaged over chains) is scored on the test part.

  Args:
    X: feature matrix, one row per observation.
    y: binary labels aligned with the rows of X.
    filename: prefix of the output file; results are written to
      'results/stan/<filename>_<seed>.pkl'.
    seeds: iterable of integer seeds; each seed drives both the
      train/test split and the sampler.
    iters: total iterations per chain, warmup included (default 1000).
    warmup: number of warmup iterations per chain (default 500).
    chains: number of chains (default 2).

  Returns:
    None. One pickle per seed is written with keys 'w', 'b', 'iters',
    'warmup', 'divergences', 'accuracy', 'precision', 'recall', 'F1', 'time'.
  """
  import os

  # Create the output directory up front so a long sampling run is not
  # lost to a missing folder when the results are finally written.
  os.makedirs('results/stan', exist_ok=True)

  for seed in seeds:
    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=seed)
    N,D = X_train.shape
    stan_data = dict(N=N, D=D, X=X_train, y=y_train)

    # Only the sampling call itself is timed.
    start = timer()
    fit = sm.sampling(iter=iters, warmup=warmup, data=stan_data, chains=chains, seed=seed)
    end = timer()

    # Average the unpermuted draws over axis 1 (presumably the chain axis,
    # per the PyStan 2 layout -- confirm), leaving one row per iteration.
    draws = fit.extract(permuted=False).mean(axis=1)
    # The first D columns are the weights `w`, column D is the intercept `b`.
    w = draws[:,:D]
    b = draws[:,D]

    F1 = []
    accuracy = []
    recall = []
    precision = []
    # Score every draw on the held-out split; compute_metrics is unpacked
    # as (accuracy, precision, recall, F1).
    for ww,bb in zip(w,b):
      acc, prec, rec, f1 = compute_metrics(ww,bb,X_test,y_test)
      accuracy.append(acc)
      precision.append(prec)
      recall.append(rec)
      F1.append(f1)

    # 'iters'/'warmup' record the values actually passed to the sampler.
    results = {'w': w, 'b': b, 'iters': iters, 'warmup': warmup, 'divergences': count_divergences(fit),
              'accuracy': accuracy, 'precision': precision, 'recall': recall, 'F1': F1, 'time': end-start}

    with open('results/stan/{}_{}.pkl'.format(filename, seed), 'wb') as out_file:
      pickle.dump(results, out_file)

In [None]:
# toy example
stan_nuts(X1,y1,'toy_example',seeds=SEED)

In [None]:
# credit card
stan_nuts(X,y,'nuts_credit')

### ADVI

In [7]:
def stan_vi(X, y, filename, seeds=SEED, tol=0.0001):
  """
  Runs Stan ADVI algorithm (mean-field) once per seed and iteration budget.

  For every seed the data is split 80/20 into train/test, then the global
  compiled model `sm` is fitted with `vb` for five iteration budgets evenly
  spaced between 500 and 10000. The posterior means are scored on the
  test split after each fit.

  Args:
    X: feature matrix, one row per observation.
    y: binary labels aligned with the rows of X.
    filename: prefix of the output files under 'results/stan/'.
    seeds: iterable of integer seeds; each seed drives both the
      train/test split and the ADVI run.
    tol: relative objective tolerance passed to `vb` as `tol_rel_obj`
      and recorded in the results (default 0.0001).

  Returns:
    None. Per seed, two files are written:
      '<filename>_<seed>_samples.pkl' -- one pickled {'w', 'b'} dict per
        iteration budget, dumped back to back in budget order;
      '<filename>_<seed>.pkl' -- dict with keys 'iters', 'times',
        'accuracy', 'precision', 'recall', 'F1', 'tol'.
  """
  import os

  # Create the output directory up front so fits are not lost at write time.
  os.makedirs('results/stan', exist_ok=True)

  for seed in seeds:
    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=seed)
    N,D = X_train.shape
    stan_data = dict(N=N, D=D, X=X_train, y=y_train)

    F1 = []
    accuracy = []
    recall = []
    precision = []
    times = []

    iters = np.linspace(500, 10000, 5).astype(int)
    samples_path = 'results/stan/{}_{}_samples.pkl'.format(filename, seed)
    # Open the samples file once per seed in write mode. Re-opening it in
    # append mode for every fit would keep stale pickles from earlier runs
    # at the front of the file.
    with open(samples_path, 'wb') as samples_file:
      for it in iters:
        # Only the vb call itself is timed.
        start = timer()
        fit = sm.vb(data=stan_data, algorithm='meanfield', iter=it,
                    tol_rel_obj=tol, seed=seed, output_samples=1000)
        end = timer()

        # The first D entries are the weights, entry D is the intercept.
        w = fit['mean_pars'][:D]
        b = fit['mean_pars'][D]

        # compute_metrics is unpacked as (accuracy, precision, recall, F1).
        acc, prec, rec, f1 = compute_metrics(w,b,X_test,y_test)
        accuracy.append(acc)
        precision.append(prec)
        recall.append(rec)
        F1.append(f1)
        times.append(end-start)

        samples = {'w': np.array(fit['sampler_params'][:D]), 'b': np.array(fit['sampler_params'][D])}
        pickle.dump(samples, samples_file)

    # 'tol' records the tolerance actually passed to vb.
    results = {'iters': iters, 'times': times, 'accuracy': accuracy, 'precision': precision,
               'recall': recall, 'F1': F1, 'tol': tol}
    print('Done for ', seed)
    with open('results/stan/{}_{}.pkl'.format(filename, seed), 'wb') as out_file:
      pickle.dump(results, out_file)

  print('Done')

In [None]:
# toy example
stan_vi(X1, y1, 'vi_toy_example', seeds=SEED)

In [None]:
# credit card
stan_vi(X, y, 'vi_credit', seeds=SEED)

## References

[1] Andrea Dal Pozzolo, Olivier Caelen, Reid A. Johnson and Gianluca Bontempi. Calibrating Probability with Undersampling for Unbalanced Classification. In Symposium on Computational Intelligence and Data Mining (CIDM), IEEE, 2015

[2] Edward [tutorial](http://edwardlib.org/tutorials/supervised-regression)

[3] Scikit-learn: Machine Learning in Python, Pedregosa et al., JMLR 12, pp. 2825-2830, 2011.