# Logistic Regression

In [2]:
from io import BytesIO

from timeit import default_timer as timer

import numpy as np
import sklearn.datasets as skd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import pandas as pd
import pickle

from utils import SEED

## 2. Dataset examples

Toy dataset 

Adapted from [2].

In [5]:
def build_toy_dataset(N, D = 1, noise_std=0.1, rng=None):
    """Build a one-dimensional binary classification toy dataset.

    Five inputs are spaced evenly on [-6, -5] and the remaining ``N - 5`` on
    [2, 6]. Labels are obtained by thresholding ``tanh(x) + noise`` at 0.5,
    and the inputs are then rescaled with ``(x - 4) / 4``.

    Parameters
    ----------
    N : int
        Total number of samples; must be at least 5.
    D : int, optional
        Number of feature columns. Only ``D == 1`` is supported, because
        exactly ``N`` input values are generated.
    noise_std : float, optional
        Standard deviation of the Gaussian noise added before thresholding.
    rng : numpy.random.RandomState or numpy.random.Generator, optional
        Source of randomness (any object exposing ``normal``). When ``None``
        (the default) the global ``np.random`` state is used.

    Returns
    -------
    X : numpy.ndarray of shape (N, D)
        Rescaled inputs.
    y : numpy.ndarray of shape (N,)
        Integer labels in {0, 1}.

    Raises
    ------
    ValueError
        If ``N < 5`` or ``D != 1``.
    """
    if N < 5:
        raise ValueError("N must be at least 5, got {}".format(N))
    if D != 1:
        raise ValueError("only D == 1 is supported, got {}".format(D))

    # Fall back to the module-level generator so existing callers that rely on
    # np.random.seed(...) keep getting the same draws.
    sampler = np.random if rng is None else rng

    X = np.concatenate([np.linspace(-6, -5, num=5), np.linspace(2, 6, num=N - 5)])
    noisy = np.tanh(X) + sampler.normal(0, noise_std, size=N)
    # Threshold in one vectorised step: 1 where the noisy signal is >= 0.5.
    y = (noisy >= 0.5).astype(int)
    X = ((X - 4.0) / 4.0).reshape((N, D))
    return X, y
  
# Toy problem: 100 samples with a single feature column.
N1, D1 = 100, 1
X1, y1 = build_toy_dataset(N=N1, D=D1, noise_std=0.1)

Simple classification

In [7]:
# Synthetic classification problem: 1000 samples, 20 features
# (15 informative, 5 redundant), one cluster per class.
N2 = 1000
D2 = 20
X2, y2 = skd.make_classification(n_samples=N2, n_features=D2, n_redundant=5,
                                 n_informative=15, random_state=7, n_clusters_per_class=1)
# Hold out 20% for testing. `train_test_split` is the name imported in the
# first cell; no `ms` alias is defined in this notebook.
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=42)

Moons 

In [8]:
# Two-moons dataset: 1000 two-dimensional samples with noise 0.1.
N3 = 1000
D3 = 2
X3, y3 = skd.make_moons(n_samples=N3, noise=0.1, random_state=42)
# Hold out 20% for testing. `train_test_split` is the name imported in the
# first cell; no `ms` alias is defined in this notebook.
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, test_size=0.2, random_state=42)

 Circles

In [None]:
# Circles dataset: 1000 two-dimensional samples, noise 0.05, scale factor 0.3.
N4 = 1000
D4 = 2
X4, y4 = skd.make_circles(n_samples=N4, noise=0.05, factor=0.3, random_state=0)
# Hold out 20% for testing. `train_test_split` is the name imported in the
# first cell; no `ms` alias is defined in this notebook.
X4_train, X4_test, y4_train, y4_test = train_test_split(X4, y4, test_size=0.2, random_state=42)

Credit Card Fraud Detection (dataset described in [1])

In [3]:
# Read the credit-card CSV object from Google Cloud Storage into the Python
# variable `csv_as_bytes`, which the next cell parses with pandas.
# NOTE(review): `%gcs` is not a built-in IPython magic; presumably it comes from
# Google Cloud Datalab. Confirm the extension is available in the kernel.
%gcs read --object "gs://thesis-203306/data/creditcard.csv" --variable csv_as_bytes

In [4]:
# Wrap the downloaded bytes in an in-memory buffer so pandas can parse them
# as CSV, then preview the first five rows.
df = pd.read_csv(filepath_or_buffer=BytesIO(csv_as_bytes))
df.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [5]:
# Target vector: the `Class` column of the raw frame.
y = df['Class'].values

# Drop the label and the `Time` column so only model features remain.
# `columns=` is used because the positional `axis` argument of DataFrame.drop
# was deprecated and later removed from pandas.
df = df.drop(columns=['Class', 'Time'])

# Standardise `Amount`. The scaler returns an (n, 1) array, so flatten it
# before assigning it back to a single column.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split below, so test-set statistics leak into the features - confirm this is
# acceptable for the reported metrics.
df['Amount'] = StandardScaler().fit_transform(df[['Amount']].values).ravel()

# Feature matrix as an independent NumPy copy of the frame's values.
X = np.array(df.values)

## Fit the model

In [15]:
def _binary_metrics(y_true, y_hat):
    """Return ``(accuracy, precision, recall, F1)`` for 0/1 label arrays.

    ``y_true`` and ``y_hat`` are equally long arrays holding only 0s and 1s.
    Precision is ``nan`` when nothing is predicted positive and recall is
    ``nan`` when there are no positive labels (NumPy division semantics).
    """
    n_samples = len(y_true)
    tp = np.sum(y_hat * y_true)
    fp = np.sum(y_hat * (1 - y_true))
    tn = np.sum((1 - y_hat) * (1 - y_true))
    fn = np.sum((1 - y_hat) * y_true)

    acc = (tp + tn) / n_samples
    prec = tp / (tp + fp)
    rec = tp / (tp + fn)
    # F1 is the harmonic mean of precision and recall.
    f1 = 2 / (1 / rec + 1 / prec)
    return acc, prec, rec, f1


# Fit one logistic regression per seed in SEED, each on a fresh 80/20 split,
# and collect the test-set metrics for every run.
lrn = LogisticRegression()
accuracy = []
recall = []
precision = []
F1 = []

for seed in SEED:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
    lrn.fit(X_train, y_train)

    # predict_proba columns follow the order of lrn.classes_, so the
    # probability of classes_[1] is at column position 1. Indexing with the
    # label value itself is only correct when the labels happen to be 0 and 1.
    y_prob = lrn.predict_proba(X_test)[:, 1]
    # Predict the positive class when its probability exceeds 0.5.
    y_pred = (y_prob > 0.5).astype(float)

    a, p, r, f = _binary_metrics(y_test, y_pred)

    accuracy.append(a)
    precision.append(p)
    recall.append(r)
    F1.append(f)

In [16]:
# One line per metric, each listing the per-seed values:
# accuracy, precision, recall, F1.
print(accuracy, precision, recall, F1, sep='\n')

[0.9990695551420246, 0.9991748885221726, 0.9989817773252344]
[0.8289473684210527, 0.859375, 0.8333333333333334]
[0.6116504854368932, 0.5913978494623656, 0.5660377358490566]
[0.7039106145251396, 0.7006369426751593, 0.6741573033707865]


## References

[1] Andrea Dal Pozzolo, Olivier Caelen, Reid A. Johnson and Gianluca Bontempi. Calibrating Probability with Undersampling for Unbalanced Classification. In Symposium on Computational Intelligence and Data Mining (CIDM), IEEE, 2015

[2] Edward [tutorial](http://edwardlib.org/tutorials/supervised-regression)

[3] Scikit-learn: Machine Learning in Python, Pedregosa et al., JMLR 12, pp. 2825-2830, 2011.