**Creating custom estimator function in sklearn**


In [301]:
# Import packages
import matplotlib.pyplot as plt
import os
import numpy as np
import pandas as pd
from scipy.special import expit
import scipy.optimize as opt  
import seaborn as sns
%matplotlib inline

# Custom Sklearn function

In [302]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels
from sklearn.metrics import euclidean_distances

In [310]:
class myLogisticRegression(BaseEstimator):

    def __init__(self): #, param1=1, param2=2, theta=3):
        pass
        # self.param1 = param1
        # self.param2 = param2
        # self.theta = theta

    def sigmoid(self, z):
        return (1 / (1 + np.exp(-z)))

    def costFunction(self, theta, X, y):
        # Initialize some useful values
        (m, n) = X.shape
        J = 0
        grad = np.zeros(theta.shape)
        #print("shapes", theta.shape,X.shape, y.shape)
        
        h = self.sigmoid(np.dot(X, theta)).flatten()
        step1 = np.dot(y.T, np.log(h))
        step2 = np.dot((1-y).T, np.log(1-h))
        J = (1/m)*(-step1-step2)
        grad = (1/m)*(np.dot(X.T,(h-y)))
        return (J, grad)

    def fit(self, X, y, theta):

        # Check that X and y have correct shape
        X, y = check_X_y(X, y)
        # Store the classes seen during fit
        self.classes_ = unique_labels(y)

        (m, n) = X.shape

        (cost, grad) = self.costFunction(theta, X, y)
        print('Initial thetas: ', theta);
        print('Cost at initial theta (zeros): ', cost);
        print('Gradient at initial theta: \n', grad);

        # Run gradient descent
        result = opt.fmin_tnc(func=self.costFunction, x0=theta, args=(X, y))

        self.fitted_theta_ = result[0]
        optimal_theta = result[0]
        
        (cost, grad) = self.costFunction(self.fitted_theta_ , X, y)
        print('Gradient at final theta: \n', grad)
        print('Cost at final theta: ', cost);
        print('final thetas: ', optimal_theta);

        self.X_ = X
        self.y_ = y
        self.fitted_theta_ = y

        # Return the classifier
        return self



In [311]:
X = np.random.normal(0, 1, size=60).reshape([20, 3])
# Add intercept term to x and X_test
X = np.concatenate((np.ones((m,1)), X), axis=1)

# input coefficients
betas = [0.1, 0.5, 1, 2]
y_prob = expit(np.dot(X, betas))
y = np.array(y_prob > 0.5).astype(int)

(m, n) = X.shape
initial_theta = np.zeros((n, 1))



In [312]:
my_model = myLogisticRegression()

In [313]:
my_model.fit(X, y, initial_theta)

Initial thetas:  [[0.]
 [0.]
 [0.]
 [0.]]
Cost at initial theta (zeros):  0.6931471805599453
Gradient at initial theta: 
 [ 0.1        -0.14044735 -0.35111288 -0.35554127]
Gradient at final theta: 
 [-0.00141083  0.00077872 -0.00252574 -0.00090012]
Cost at final theta:  0.00826826879578249
final thetas:  [-0.67292536  6.03019816  7.49616138 11.86148422]


  NIT   NF   F                       GTG
    0    1  6.931471805599453E-01   2.79415307E-01
tnc: stepmx = 1000
    1    5  1.399664067356861E-01   6.71166699E-03
    2    8  2.839803150442382E-02   1.41910649E-04
tnc: fscale = 83.9446
    3   11  1.778317789605917E-02   2.55147283E-05
  step2 = np.dot((1-y).T, np.log(1-h))
    4   38  8.268268919439081E-03   9.78643010E-06
tnc: |fn-fn-1] = 1.23657e-10 -> convergence
    5   90  8.268268795782491E-03   9.78642982E-06
tnc: Converged (|f_n-f_(n-1)| ~= 0)


In [314]:
from sklearn.linear_model import LogisticRegression

In [315]:
model = LogisticRegression(penalty='none')
model = model.fit(X, y)

In [316]:
model.coef_[0]

array([ 0.56292802, 14.10190926, 20.84767108, 38.40944439])