# Logistic Regression

In [None]:
from timeit import default_timer as timer

import numpy as np
import sklearn.datasets as skd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import pandas as pd
import pickle

from utils import SEED

## 2. Dataset examples

Toy dataset 

Adapted from [2]

In [None]:
def build_toy_dataset(N, D=1, noise_std=0.1, seed=None):
    """Build a small one-dimensional binary classification toy problem.

    Five inputs are spread evenly over [-6, -5] and the remaining ``N - 5``
    over [2, 6]. Labels come from thresholding ``tanh(x) + noise`` at 0.5,
    where ``noise`` is zero-mean Gaussian. The inputs are then shifted and
    scaled as ``(x - 4) / 4`` before being returned.

    Parameters
    ----------
    N : int
        Total number of samples. Must be at least 5, because five samples
        are always placed on the left-hand segment.
    D : int, default 1
        Number of feature columns. The generator only produces a single
        coordinate per sample, so 1 is the only valid value.
    noise_std : float, default 0.1
        Standard deviation of the Gaussian noise added before thresholding.
    seed : int or None, default None
        If given, the noise is drawn from a private ``RandomState(seed)`` so
        the call is reproducible and leaves the global generator untouched.
        If ``None``, NumPy's global generator is used.

    Returns
    -------
    X : numpy.ndarray of shape (N, D)
        Rescaled inputs.
    y : numpy.ndarray of shape (N,)
        Integer labels, each 0 or 1.

    Raises
    ------
    ValueError
        If ``N < 5`` or ``D != 1``.
    """
    if N < 5:
        raise ValueError("N must be at least 5, got %r" % (N,))
    if D != 1:
        raise ValueError("build_toy_dataset only supports D=1, got %r" % (D,))

    # Fall back to the module-level generator when no seed is supplied so that
    # callers relying on np.random.seed(...) keep working.
    rng = np.random if seed is None else np.random.RandomState(seed)

    X = np.concatenate([np.linspace(-6, -5, num=5), np.linspace(2, 6, num=N - 5)])
    noisy_response = np.tanh(X) + rng.normal(0, noise_std, size=N)
    y = (noisy_response >= 0.5).astype(int)

    X = ((X - 4.0) / 4.0).reshape((N, D))
    return X, y
  
# Toy problem size: N1 samples, D1 feature column(s).
N1 = 100
D1 = 1

# Fix NumPy's global seed so the noisy labels are identical on every
# Restart & Run All; without it each run produces a different dataset.
np.random.seed(42)
X1, y1 = build_toy_dataset(N1, D=D1, noise_std=0.1)

Simple classification

In [None]:
# Synthetic problem with D2 features: 15 informative, 5 redundant,
# one cluster per class.
N2 = 1000
D2 = 20
X2, y2 = skd.make_classification(
    n_samples=N2,
    n_features=D2,
    n_informative=15,
    n_redundant=5,
    n_clusters_per_class=1,
    random_state=7,
)

# 80/20 hold-out split. `train_test_split` is the name imported at the top of
# the notebook; the `ms` alias used here before was never bound, so this cell
# raised NameError on a fresh kernel.
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2, y2, test_size=0.2, random_state=42
)

Moons 

In [None]:
# Two interleaving half-moons in 2-D with Gaussian noise (std 0.1).
N3 = 1000
D3 = 2
X3, y3 = skd.make_moons(n_samples=N3, noise=0.1, random_state=42)

# 80/20 hold-out split. `train_test_split` is the name imported at the top of
# the notebook; the `ms` alias used here before was never bound, so this cell
# raised NameError on a fresh kernel.
X3_train, X3_test, y3_train, y3_test = train_test_split(
    X3, y3, test_size=0.2, random_state=42
)

 Circles

In [None]:
# Two concentric circles in 2-D; the inner circle is 0.3x the outer one.
N4 = 1000
D4 = 2
X4, y4 = skd.make_circles(n_samples=N4, noise=0.05, factor=0.3, random_state=0)

# 80/20 hold-out split. `train_test_split` is the name imported at the top of
# the notebook; the `ms` alias used here before was never bound, so this cell
# raised NameError on a fresh kernel.
X4_train, X4_test, y4_train, y4_test = train_test_split(
    X4, y4, test_size=0.2, random_state=42
)

Credit Card Fraud Detection

In [None]:
# Location of the credit-card CSV, relative to the notebook's working directory.
filename = "data/creditcard.csv"
# Load the full file into a DataFrame; the following cell reads its
# `Class`, `Time` and `Amount` columns.
df = pd.read_csv(filename)

In [None]:
# NOTE: this cell rebinds `df` without its `Class`/`Time` columns, so re-running
# it requires re-running the read_csv cell first.

# Target vector: the `Class` column as a standalone NumPy array.
y = df['Class'].to_numpy(copy=True)

# Remove the label and the `Time` column from the feature frame. The axis is
# given via `columns=` because pandas 2.0 no longer accepts it positionally
# (`df.drop('Class', 1)` raises TypeError there).
df = df.drop(columns=['Class', 'Time'])

# Standardise `Amount` to zero mean / unit variance.
# NOTE(review): the scaler is fitted on every row before any train/test split,
# so held-out rows contribute to the scaling statistics — confirm this is
# acceptable for the evaluation below.
df['Amount'] = StandardScaler().fit_transform(
    df['Amount'].to_numpy().reshape(-1, 1)
).ravel()

# Feature matrix: all remaining columns as a NumPy array.
X = df.to_numpy(copy=True)

## Fit the model

In [None]:
# Probability above which a sample is labelled as the positive class.
DECISION_THRESHOLD = 0.5


def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator`` as a float, or 0.0 if the denominator is 0."""
    return float(numerator) / float(denominator) if denominator else 0.0


lrn = LogisticRegression()

# One entry per seed in SEED, in iteration order.
accuracy = []
recall = []
precision = []
F1 = []

for seed in SEED:
    # New 80/20 split for every seed; the same estimator object is refit on it.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=seed
    )
    lrn.fit(X_train, y_train)

    # predict_proba columns are ordered like lrn.classes_, so the positive
    # class (classes_[1]) is column 1. Indexing with the label value itself
    # only worked because the labels happen to be 0 and 1.
    y_prob = lrn.predict_proba(X_test)[:, 1]
    y_pred = (y_prob > DECISION_THRESHOLD).astype(float)

    # Confusion-matrix counts; the products assume y_test holds 0/1 labels.
    total = y_test.shape[0]
    TP = np.sum(y_pred * y_test)
    FP = np.sum(y_pred * (1 - y_test))
    FN = np.sum((1 - y_pred) * y_test)
    TN = np.sum((1 - y_pred) * (1 - y_test))

    # Ratios fall back to 0.0 instead of nan/inf when a split yields no
    # predicted or no actual positives.
    a = _safe_ratio(TP + TN, total)
    p = _safe_ratio(TP, TP + FP)
    r = _safe_ratio(TP, TP + FN)
    # Harmonic mean of p and r, written in terms of the counts.
    f = _safe_ratio(2 * TP, 2 * TP + FP + FN)

    accuracy.append(a)
    precision.append(p)
    recall.append(r)
    F1.append(f)

## References

[1] Andrea Dal Pozzolo, Olivier Caelen, Reid A. Johnson and Gianluca Bontempi. Calibrating Probability with Undersampling for Unbalanced Classification. In Symposium on Computational Intelligence and Data Mining (CIDM), IEEE, 2015

[2] Edward [tutorial](http://edwardlib.org/tutorials/supervised-regression)

[3] Scikit-learn: Machine Learning in Python, Pedregosa et al., JMLR 12, pp. 2825-2830, 2011.