]\

### Import Libraries

In [32]:
import numpy as np
import pandas as pd
import warnings
# from sklearn import datasets
# import matplotlib.pyplot as plt

warnings.filterwarnings('ignore')

%matplotlib inline

### Load data from CSV file

In [33]:
churn_df=pd.read_csv("./ChurnData.csv")
churn_df.head()

Unnamed: 0,tenure,age,address,income,ed,employ,equip,callcard,wireless,longmon,...,pager,internet,callwait,confer,ebill,loglong,logtoll,lninc,custcat,churn
0,11.0,33.0,7.0,136.0,5.0,5.0,0.0,1.0,1.0,4.4,...,1.0,0.0,1.0,1.0,0.0,1.482,3.033,4.913,4.0,1.0
1,33.0,33.0,12.0,33.0,2.0,0.0,0.0,0.0,0.0,9.45,...,0.0,0.0,0.0,0.0,0.0,2.246,3.24,3.497,1.0,1.0
2,23.0,30.0,9.0,30.0,1.0,2.0,0.0,0.0,0.0,6.3,...,0.0,0.0,0.0,1.0,0.0,1.841,3.24,3.401,3.0,0.0
3,38.0,35.0,5.0,76.0,2.0,10.0,1.0,1.0,1.0,6.05,...,1.0,1.0,1.0,1.0,1.0,1.8,3.807,4.331,4.0,0.0
4,7.0,35.0,14.0,80.0,2.0,15.0,0.0,1.0,0.0,7.1,...,0.0,0.0,1.0,1.0,0.0,1.96,3.091,4.382,3.0,0.0


### Data pre-processing and selection

In [34]:
churn_df=churn_df[['tenure', 'age', 'address', 'income', 'ed', 'employ', 'equip',   'callcard', 'wireless','churn']]
churn_df['churn']=churn_df['churn'].astype('int')
churn_df.head()

Unnamed: 0,tenure,age,address,income,ed,employ,equip,callcard,wireless,churn
0,11.0,33.0,7.0,136.0,5.0,5.0,0.0,1.0,1.0,1
1,33.0,33.0,12.0,33.0,2.0,0.0,0.0,0.0,0.0,1
2,23.0,30.0,9.0,30.0,1.0,2.0,0.0,0.0,0.0,0
3,38.0,35.0,5.0,76.0,2.0,10.0,1.0,1.0,1.0,0
4,7.0,35.0,14.0,80.0,2.0,15.0,0.0,1.0,0.0,0


In [35]:
X=np.asarray(churn_df[['tenure', 'age', 'address', 'income', 'ed', 'employ', 'equip']])
print(X[0:5])
print(X.shape)

[[ 11.  33.   7. 136.   5.   5.   0.]
 [ 33.  33.  12.  33.   2.   0.   0.]
 [ 23.  30.   9.  30.   1.   2.   0.]
 [ 38.  35.   5.  76.   2.  10.   1.]
 [  7.  35.  14.  80.   2.  15.   0.]]
(200, 7)


In [36]:
y=np.asarray(churn_df['churn'])
y[0:5]

array([1, 1, 0, 0, 0])

### Normalising data

In [37]:
normalized_data=X.copy()
normalized_data = (normalized_data-normalized_data.min()) /(normalized_data.max()-normalized_data.min())
normalized_data[0:5]

array([[0.00659472, 0.01978417, 0.00419664, 0.08153477, 0.0029976 ,
        0.0029976 , 0.        ],
       [0.01978417, 0.01978417, 0.00719424, 0.01978417, 0.00119904,
        0.        , 0.        ],
       [0.01378897, 0.01798561, 0.00539568, 0.01798561, 0.00059952,
        0.00119904, 0.        ],
       [0.02278177, 0.02098321, 0.0029976 , 0.04556355, 0.00119904,
        0.0059952 , 0.00059952],
       [0.00419664, 0.02098321, 0.00839329, 0.04796163, 0.00119904,
        0.00899281, 0.        ]])

#### Theta
To perform a prediction we use a neural network like notaion. We have weights (w), inputs(x) and bias (b). 
$$ 
z= ( \sum_{i=1}^{n} w_{i}x_{i} ) + b 
$$

For simplicity we will represent (w,b) as $ \theta $

#### Sigmoid Function
$$
 \sigma(z)= \frac{1}{1+\exp^{-z}}
$$


#### Loss
We need a loss function that expresses, for an obersvaton x, how close the classifier output $(\hat{y} = \sigma{(w.x+b)}) $ is to the correct output **y**. Here in this example we have used Cross entropy as our loss function. Cross entropy is basically a bernouli eqaution.

Bernouli Equation in probability:
$$ BE(p,q)=p^y(q)^{1-y} $$
$$ L_{CE} (\hat{y},y)= -\frac {1}{m} \sum_{i=1}^{m}y\log({\hat{y}})+(1-\hat{y})\log({1-\hat{y}}) $$

#### Gradient descent and gradients 


Now, to calculate the gradients to optimize the weights using gradient descent, we must calculate the derivative of the loss function. That is partial derivative of the cross entropy formula
$$ \frac {\partial L_{CE} (\hat{y},y)} {\partial w} = \frac {1}{m}(\hat{y}-y)x_{i}^T $$
$$ \frac {\partial L_{CE} (\hat{y},y)} {\partial b} = \frac {1}{m}(\hat{y}-y) $$

Now since we have our gradients, we can use it to update the weights and biases

##### Derivation
$$ \frac{d}{dx}\ln(x)=\frac{1}{x} $$
$$ \frac{d\sigma(z)}{dz}=\sigma(z)(1-\sigma(z)) $$
By the chain rule of derivatives:-
$$ \frac{df}{dx}=\frac{du}{dv}.\frac{dv}{dx} $$

First, we need to calcualte the derivative of the loss function with respect to a single weight $ w_{j} $ (that is we need to compute the derivative for each weight and bias)

$$ \frac{\partial{L_{CE}}}{\partial{w_{j}}}=\frac{\partial{}}{\partial{w_{j}}}-[y\log\sigma(w.x+b)+(1-y)\log(1-\sigma(w.x+b))] $$
$$ 
  = -\left[ \frac{\partial{}}{\partial{w_{j}}}y\log\sigma(w.x+b)+\frac{\partial{}}{\partial{w_{j}}} (1-y)\log[1-\sigma(w.x+b)] \right ]
$$
Now we need to use the chain rule
$$ 
\frac{\partial{L_{CE}}}{\partial{w_{j}}} = -\frac{y}{\sigma(w.x+b)} \frac{\partial{}}{\partial{w_{j}}}\sigma(w.x+b)-\frac{1-y}{1-\sigma(w.x+b)}\frac{\partial{}}{\partial{w_{j}}}(1-\sigma(w.x+b))
$$
Simplifying the above equation we get,
$$ \frac{\partial{L_{CE}}}{\partial{w_{j}}}= -\left [ \frac{y}{\sigma(w.x+b)}-\frac{1-y}{1-\sigma(w.x+b)} \right]\frac{\partial{}}{\partial{w_{j}}}\sigma(w.x+b)
$$

Now substituting the derivative of the sigmoid function we get,
$$ \frac{\partial{L_{CE}}}{\partial{w_{j}}}= -\left [ \frac{y-\sigma(w.x+b)}{\sigma(w.x+b)[1-\sigma(w.x+b)]}  \right ] \sigma(w.x+b)[1-\sigma(w.x+b)] \frac{\partial(w.x+b)}{\partial{w_{j}}} $$
$$ = -\left [ \frac{y-\sigma(w.x+b)}{\sigma(w.x+b)[1-\sigma(w.x+b)]}  \right ] \sigma(w.x+b)[1-\sigma(w.x+b)] x_{j} $$
$$ = -[y-\sigma(w.x+b)]x_{j} $$
$$ = [\sigma(w.x+b)-y]x_{j} $$
$$ = (\hat{y}-y)x_{j} $$



In [38]:
np.concatenate((np.ones((X.shape[0],1)),X),axis=1)
# X


array([[ 1., 11., 33., ...,  5.,  5.,  0.],
       [ 1., 33., 33., ...,  2.,  0.,  0.],
       [ 1., 23., 30., ...,  1.,  2.,  0.],
       ...,
       [ 1.,  6., 32., ...,  1., 10.,  0.],
       [ 1., 24., 30., ...,  4.,  5.,  0.],
       [ 1., 61., 50., ...,  2., 22.,  1.]])

In [39]:
class LogisticRegression:
    def __init__(self, lr, num_iter, fit_intercept=True, verbose=False):
        self.lr = lr #learning rate
        self.num_iter = num_iter #number of iterations
        self.fit_intercept = fit_intercept #intercept
        self.verbose = verbose  # Verbose means that it will output messages which could be useful for debugging and for understanding how the training is doing

    #weights  
    def add_intercept(self, X):
        intercept = np.ones((X.shape[0], 1))
        return np.concatenate((intercept, X), axis=1)

    #sigmoid function    
    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    #cross entropy loss    
    def loss(self, h, y):
        return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()

    #fitting the model    
    def fit(self, X, y):
        if self.fit_intercept: #bias initialization
            X = self.add_intercept(X)

        # weights initialization
        self.theta = np.zeros(X.shape[1]) #bias values

        for i in range(self.num_iter):
            z = np.dot(X, self.theta) #w.x+b
            h = self.sigmoid(z) 
            gradient = np.dot(X.T, (h - y)) / y.size #following the formula above (y.size=m), x.T is calculating the transpose of the matrix for matrix multiplication
            self.theta -= self.lr * gradient #updating the weights by moving to the minimum by a small rate.

            if(self.verbose == True and i % 100000 == 0):
                z = np.dot(X, self.theta)
                h = self.sigmoid(z)
                print(f'loss: {self.loss(h, y)} \t')

    #These two functions will predict the correct labels and show us the probabilty of correct labels predicted.            
    def predict_prob(self, X):
        if self.fit_intercept:
            X = self.add_intercept(X)

        return self.sigmoid(np.dot(X, self.theta))

    def predict(self, X):
        return self.predict_prob(X).round()


In [28]:
model=LogisticRegression(lr=0.01,num_iter=2000000,verbose=True)
%time model.fit(normalized_data,y)

loss: 0.6927042186454124 	
loss: 0.5910096716660708 	
loss: 0.5838561953745031 	
loss: 0.5779887445467037 	
loss: 0.5729061990167699 	
loss: 0.5683919821502402 	
loss: 0.564322571262061 	
loss: 0.5606177537705266 	
loss: 0.5572212812637848 	
loss: 0.5540915323955028 	
loss: 0.5511964079618044 	
loss: 0.5485103102805649 	
loss: 0.5460122548772475 	
loss: 0.5436846402124331 	
loss: 0.5415124191628589 	
loss: 0.5394825252637198 	
loss: 0.5375834654712491 	
loss: 0.5358050245217261 	
loss: 0.5341380456807027 	
loss: 0.5325742647612982 	
Wall time: 25.4 s


In [29]:
model.add_intercept(X)

array([[ 1., 11., 33., ...,  5.,  5.,  0.],
       [ 1., 33., 33., ...,  2.,  0.,  0.],
       [ 1., 23., 30., ...,  1.,  2.,  0.],
       ...,
       [ 1.,  6., 32., ...,  1., 10.,  0.],
       [ 1., 24., 30., ...,  4.,  5.,  0.],
       [ 1., 61., 50., ...,  2., 22.,  1.]])

In [30]:
model.theta

array([  0.28961352, -29.03375129, -12.90644264,  -9.13661946,
        -4.55567855,   1.52032046, -10.9023424 ,   0.70800146])

In [31]:
preds=model.predict(X)
(preds==y).mean()

0.71