# Bayesian Logistic Regression in PyMC3

In [1]:
from io import BytesIO
from timeit import default_timer as timer

from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
import sklearn.datasets as skd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import pandas as pd
import pickle

import pymc3 as pm

from utils import compute_metrics, SEED

  from ._conv import register_converters as _register_converters


## 1. Data 

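We use the Credit Card Fraud Detection dataset [1]. The `Time` column is dropped and the `Amount` column is standardized; the remaining features `V1`–`V28` are used as provided.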

In [56]:
%gcs read --object "gs://thesis-203306/data/creditcard.csv" --variable csv_as_bytes

In [61]:
# Parse the raw CSV bytes that the previous cell read from GCS into `csv_as_bytes`.
df = pd.read_csv(BytesIO(csv_as_bytes))

# Dropping `Time` is optional. The axis is passed by keyword because the
# positional form `drop('Time', 1)` was deprecated in pandas 1.3 and removed
# in pandas 2.0.
df = df.drop('Time', axis=1)     # optional
# Standardize `Amount`; StandardScaler expects a 2-D (n_samples, 1) array.
df['Amount'] = StandardScaler().fit_transform(df['Amount'].values.reshape(-1,1))

## 2. Model

In [62]:
def PyMC3_model(df):
    """
    Build the Bayesian logistic regression model for the credit card data.

    The model is a PyMC3 GLM with a Binomial family that regresses ``Class``
    on the predictors ``V1`` ... ``V28`` and ``Amount``.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data; must contain the columns ``Class``, ``V1`` ... ``V28``
        and ``Amount``.

    Returns
    -------
    pm.Model
        The model with the GLM registered in its context.
    """
    # Same formula as before ('Class ~ V1 + V2 + ... + V28 + Amount'), built
    # programmatically instead of spelled out by hand.
    predictors = ['V{}'.format(i) for i in range(1, 29)] + ['Amount']
    formula = 'Class ~ ' + ' + '.join(predictors)

    model = pm.Model()
    with model:
        # Use the `df` argument: the previous version read a global `df_train`
        # and therefore ignored whatever the caller passed in.
        pm.glm.GLM.from_formula(formula, df,
                                family=pm.glm.families.Binomial())
    return model

## 3. Inference

### NUTS

In [99]:
def pymc3_nuts(df, filename, seeds=SEED):
    """
    Runs PyMC3 NUTS algorithm once per seed and pickles the results.

    For every seed the data is split 80/20 into train and test sets, the model
    returned by ``PyMC3_model`` is sampled on the training set, and the test
    metrics returned by ``compute_metrics`` are evaluated for every retained
    draw (draws are averaged over the chains first).

    Default: 1000 iterations (500 warmup)
    Remark: Using default init (jitter+adapt_diag) can lead to bad initial values,
    so we use only adapt_diag

    Parameters
    ----------
    df : pandas.DataFrame
        Data with the label column ``Class`` and the predictors
        ``V1`` ... ``V28`` and ``Amount``.
    filename : str
        Prefix of the output file ``results/pymc3/<filename>_<seed>_new.pkl``.
    seeds : iterable of int
        Seeds used for both the train/test split and the sampler.
    """
    draws = 500
    tune = 500
    # Predictor names in the column order of the test matrix
    # (assumes df columns are V1..V28, Amount, Class -- confirm against the data).
    params = ['V{}'.format(i) for i in range(1, 29)] + ['Amount']

    for seed in seeds:
        df_train, df_test = train_test_split(df, test_size=0.2, random_state=seed)
        pymc3_model = PyMC3_model(df_train)

        with pymc3_model:
            start = timer()
            # `draws` and `random_seed` are the documented keyword names of
            # pm.sample; the previous `n=` / `seed=` are not sampler arguments,
            # so the seed was not applied as intended.
            trace = pm.sample(draws=draws, tune=tune, chains=2,
                              random_seed=seed, init='adapt_diag')
            end = timer()

        # Compute the scores
        y_test = np.array(df_test.Class.tolist())
        # axis passed by keyword: positional `drop('Class', 1)` fails on pandas >= 2.0
        X_test = np.array(df_test.drop('Class', axis=1).values)

        # combine=False yields one array per chain; averaging over axis 0
        # leaves one value per draw for each coefficient.
        w = np.array([
            np.array(trace.get_values(param, combine=False)).mean(axis=0)
            for param in params
        ])
        w = w.T  # one row of coefficients per draw
        b = np.array(trace.get_values('Intercept', combine=False)).mean(axis=0)

        F1 = []
        accuracy = []
        recall = []
        precision = []
        for ww, bb in zip(w, b):
            a, p, r, f = compute_metrics(ww, bb, X_test, y_test)
            accuracy.append(a)
            precision.append(p)
            recall.append(r)
            F1.append(f)

        results = {'w': w, 'b': b, 'iters': draws + tune, 'warmup': tune,
                   'divergences': int(trace['diverging'].nonzero()[0].size),
                   'accuracy': accuracy, 'precision': precision, 'recall': recall,
                   'F1': F1, 'time': end-start}

        with open('results/pymc3/{}_{}_new.pkl'.format(filename, seed), 'wb') as f:
            pickle.dump(results, f)

    print('Done')

In [None]:
# Credit card data: run NUTS once per seed in SEED; pymc3_nuts pickles each
# run to results/pymc3/nuts_credit_<seed>_new.pkl
pymc3_nuts(df, 'nuts_credit', seeds=SEED)

In [115]:
def pymc3_vi(df, filename, seeds=SEED):
    """
    Runs PyMC3 ADVI algorithm once per seed and pickles the results.

    For every seed the data is split 80/20 into train and test sets. The model
    is then fitted with five iteration budgets evenly spaced between 500 and
    10000; for each budget the fit time and the test metrics of the posterior
    mean coefficients (estimated from 1000 draws of the approximation) are
    recorded.

    Parameters
    ----------
    df : pandas.DataFrame
        Data with the label column ``Class`` and the predictors
        ``V1`` ... ``V28`` and ``Amount``.
    filename : str
        Prefix of the output file ``results/pymc3/<filename>_<seed>.pkl``.
    seeds : iterable of int
        Seeds used for both the train/test split and the optimizer.
    """
    # Single source of truth: passed to the convergence callback and stored
    # in the results.
    tolerance = 0.0001
    params = ['V{}'.format(i) for i in range(1, 29)] + ['Amount']

    for seed in seeds:
        df_train, df_test = train_test_split(df, test_size=0.2, random_state=seed)

        y_test = np.array(df_test.Class.tolist())
        # axis passed by keyword: positional `drop('Class', 1)` fails on pandas >= 2.0
        X_test = np.array(df_test.drop('Class', axis=1).values)

        pymc3_model = PyMC3_model(df_train)

        F1 = []
        accuracy = []
        recall = []
        precision = []
        times = []

        iters = np.linspace(500, 10000, 5).astype(int)
        for n in iters:
            with pymc3_model:
                start = timer()
                # NOTE(review): the convergence callback presumably can stop the
                # fit before `n` iterations -- confirm if `iters` is reported
                # as the actual iteration count.
                advi_fit = pm.fit(
                    n, random_seed=seed,
                    callbacks=[pm.callbacks.CheckParametersConvergence(
                        diff='absolute', tolerance=tolerance)])
                end = timer()

            print('Sampling done')
            times.append(end-start)

            # Point estimate: posterior mean of each coefficient over 1000 draws.
            trace_advi = advi_fit.sample(draws=1000)
            w = [trace_advi[param].mean() for param in params]
            b = trace_advi['Intercept'].mean()

            a, p, r, f = compute_metrics(w, b, X_test, y_test)
            accuracy.append(a)
            precision.append(p)
            recall.append(r)
            F1.append(f)

        results = {'iters': iters, 'tol': tolerance, 'accuracy': accuracy,
                   'precision': precision, 'recall': recall, 'F1': F1,
                   'times': times}

        with open('results/pymc3/{}_{}.pkl'.format(filename, seed), 'wb') as f:
            pickle.dump(results, f)

    # Same completion message as pymc3_nuts.
    print('Done')

In [None]:
# Credit card data: run ADVI once per seed in SEED; pymc3_vi pickles each
# run to results/pymc3/vi_credit_<seed>.pkl
pymc3_vi(df, 'vi_credit', seeds=SEED)

## References

[1] Andrea Dal Pozzolo, Olivier Caelen, Reid A. Johnson and Gianluca Bontempi. Calibrating Probability with Undersampling for Unbalanced Classification. In Symposium on Computational Intelligence and Data Mining (CIDM), IEEE, 2015

[2] Scikit-learn: Machine Learning in Python, Pedregosa et al., JMLR 12, pp. 2825-2830, 2011.