# Required Packages

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

**numpy** : Numerical Computation for faster execution  

**sklearn** : Popular ML library available in Python  

**train_test_split** : This is used to split the available data into 2 parts,  
- 1<sup><b>st</b></sup> into Train Set 
- 2<sup><b>nd</b></sup> into Test Set

**sklearn.linear_model** : This package contains all the available linear models. We use the built-in LinearRegression class as a reference to check our from-scratch algorithm against.  

**sklearn.metrics** : This package contains the model evaluation metrics, i.e. measures of how well your model is performing. We will use Mean Squared Error.

What is Mean Squared Error?  
- Take the difference between each actual value and your predicted value, square each difference, and sum the squared terms.  
- Divide the sum by the number of values, i.e. take the mean.  
*Why do we square the difference?*
    - Squaring always gives a non-negative value, so positive and negative errors cannot cancel each other out in the sum. 
    - Squaring emphasizes larger differences — a feature that turns out to be both good and bad (think of the effect outliers have).
- Formula:  
`` SUM((actual - predicted)^2) / COUNT(actual_values)``
- The larger the value, the worse your model, and vice versa.

**NOTE : We are not going to optimize the model performance here.**

# LINEAR REGRESSION
- Simple relation between dependent and independent variable.

- Simple Linear Regression:  
    `y = mx + b`  
        x = Independent variable  
        m = Slope = rise/run  
        b = y-intercept [ value of y where x = 0 ]  
    
- Multiple Linear Regression:  
    `Y = B1X1 + B2X2 + ..... + BnXn`  
        Bi : Beta parameters / weights / coefficients (stored in the variable named `grads` in the code below)    
        Xi : Features/Columns/Independent  
        Y : Dependent Variable  

- How to find the weights/beta parameters?  
    <B>Using the Gradient Descent Method.</B>  
    > What is the Gradient Descent Method?
        Imagine a bowl shaped like a U. The optimal values are at the bottom.
        Now, how do we get there?
        Gradient Descent implementation steps

        Step1: Initialize parameters (weight and bias) with random values or simply zero. Also initialize the learning rate.

        Step2: Calculate Cost function (J)

        Step3: Take Partial Derivatives of the cost function with respect to weights and biases(dW,db).

        Step4: Update Parameter values as:

            Wnew =  W – learning rate * dW
            bnew = b – learning rate * db
        
        Step5: Repeat step 2 to step 4 for n iterations. With each iteration the value of the cost function will progressively decrease and eventually flatten out.
        
*Let's break this down further in terms of code*

In [2]:
class LinearRegression_Scratch:
    """Multiple linear regression fitted with batch gradient descent.

    The model is ``y = X . W`` (with an extra bias weight when ``intercept``
    is true) and the weights ``W`` minimise the mean squared error
    ``J = mean((X . W - y) ** 2)``.

    Parameters
    ----------
    intercept : bool
        When true, a column of ones is prepended to the features so that the
        first weight acts as the bias / y-intercept.
    alpha : float
        Learning rate: the factor applied to the gradient on every update.
    convergence : float
        Training stops once the summed absolute change of all weights in a
        single update falls below this threshold.
    max_iter : int
        Upper bound on the number of updates, so that a learning rate that is
        too small (or too large) cannot make ``fit`` loop forever.
    """

    # Class-level defaults; __init__ and fit overwrite them per instance.
    intercept = None
    learning_rate = None
    convergence = None
    grads = None  # fitted weights, shape (n_features [+ 1], n_targets)
    x = None  # training design matrix (including the ones column, if any)
    y = None  # training targets, shape (n_samples, n_targets)

    def __init__(self, intercept=False, alpha=0.001, convergence=1e-4,
                 max_iter=100000):
        self.intercept = intercept
        self.learning_rate = alpha
        self.convergence = convergence
        self.max_iter = max_iter

    def _design_matrix(self, x):
        """Return ``x`` as a 2-D float array, with a leading ones column if needed."""
        features = np.array(x, dtype=float)
        if features.ndim != 2:
            raise ValueError("x must be 2-dimensional: (n_samples, n_features)")
        if self.intercept:
            # The bias needs a constant input of 1; a column of zeros would
            # make the bias weight irrelevant to every prediction.
            features = np.insert(features, 0, 1.0, axis=1)
        return features

    def fit(self, x, y):
        """Learn the weights from ``x`` (n_samples, n_features) and ``y``.

        ``y`` may be 1-D or (n_samples, n_targets). If either argument is
        ``None`` nothing is trained. Returns ``self`` so calls can be chained.

        Raises
        ------
        ValueError
            If the inputs are empty, have mismatched lengths, or the weights
            diverge (learning rate too large).
        """
        if x is None or y is None:
            return self

        self.x = self._design_matrix(x)
        targets = np.array(y, dtype=float)
        if targets.ndim == 1:
            # Keep targets as a column so residuals never broadcast to (n, n).
            targets = targets.reshape(-1, 1)
        self.y = targets

        n_samples = self.x.shape[0]
        if n_samples == 0:
            raise ValueError("cannot fit on an empty data set")
        if self.y.shape[0] != n_samples:
            raise ValueError("x and y must have the same number of rows")

        # One weight per feature (and per target column), starting from zero.
        self.grads = np.zeros((self.x.shape[1], self.y.shape[1]))

        for _ in range(self.max_iter):
            gradient, _loss = self._gradient_descent(self.grads)
            new_grads = self.grads - self.learning_rate * gradient
            step = np.sum(np.abs(new_grads - self.grads))
            if not np.isfinite(step):
                raise ValueError(
                    "gradient descent diverged; try a smaller learning rate"
                )
            self.grads = new_grads
            if step < self.convergence:
                print("Converged")
                break

        return self

    def _gradient_descent(self, grads):
        """Return ``(dJ/dW, J)`` of the mean squared error at weights ``grads``."""
        n_samples = self.x.shape[0]
        residual = self.x.dot(grads) - self.y
        loss = np.sum(np.power(residual, 2)) / n_samples
        # Derivative of mean(residual ** 2) with respect to the weights.
        gradient = (2.0 / n_samples) * self.x.T.dot(residual)
        return gradient, loss

    def predict(self, X):
        """Return predictions of shape (n_samples, n_targets) for ``X``.

        Raises
        ------
        ValueError
            If the model has not been fitted yet.
        """
        if self.grads is None:
            raise ValueError("fit must be called before predict")
        return self._design_matrix(X).dot(self.grads)

# Breakdown

## Variable Used :  
|  **Variable** 	|                                                                                                    **Meaning**                                                                                                    	| **Values**                          	|
|:-------------:	|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:	|-------------------------------------	|
| intercept     	| Whether we want to fit the intercept or not.                                                                                                                                                                      	| True or False                       	|
| learning_rate 	| Step size that scales how much the weights change <br> on each update                                                                                                                                             	| Generally in range of 0.1 to 0.0001 	|
| convergence   	| Convergence is the point at which Gradient Descent makes only very small changes, <br> which doesn't mean it has reached the optimal result <br> (but it is really quite near, if not on it). <br> The fit stops once the total weight change in one step falls below this threshold 	| Generally 0.001                     	|
| grads         	| Variable to store the weights of the parameters <br> when cost function values are updated.                                                                                                                       	|                                     	|
| x             	| Independent Features                                                                                                                                                                                              	|                                     	|
| y             	| Dependent variable (target)                                                                                                                                                                                       	|                                     	|  

## Functions :
**init** : Initializes the object's values when you call the constructor. We initialize the values of learning_rate, convergence & intercept.  
**fit** : We use it to train the model, i.e. to find the weights/beta parameters for the features.  
**gradient** : Here we have implemented gradient descent, which in turn helps to find the optimal weights for our features.  
**predict** : After finding the optimal parameters, we use those values to predict new outputs.  


# DATA - Preparation
Creating our own data to test
- Creating 10 **features/independent/columns/X** with 10000 **values/rows** along with 1 **target/dependent/Y**
- These are all generated using the numpy.random.randn function of NumPy (samples from the standard normal distribution).

In [3]:
# Synthetic data set: 10 standard-normal feature columns and one
# standard-normal target column, split 67% / 33% into train and test parts.
n_rows, n_features = 10000, 10
x = np.random.randn(n_rows, n_features)
y = np.random.randn(n_rows, 1)
train_x, test_x, train_y, test_y = train_test_split(
    x, y, test_size=0.33, random_state=123
)

# Show the (rows, columns) of the full, train and test sets in turn.
for features, target in ((x, y), (train_x, train_y), (test_x, test_y)):
    print(features.shape, target.shape)

(10000, 10) (10000, 1)
(6700, 10) (6700, 1)
(3300, 10) (3300, 1)


# Gradient Descent CODE Breakdown
        1. grads = np.random.randn(x.shape[0],x.shape[1])
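            Here we have taken random values for the initial weights. This line creates a numpy matrix of size x_rows*x_columns, i.e. a separate row of weights for every data row; a linear model only needs one weight per feature, i.e. a matrix of size 1*x_columns.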
        2. x = np.array(x) # Converting passed values to numpy array
        3. y = np.array(y) # Converting passed values to numpy array
        4. pred = np.sum(x.dot(grads.T),axis=1,keepdims=True)
            Here we are predicting the value y using the randomly initialize weights/gradients/beta values.
            It simply a dot product of the X/independent/features/columns with the gradients/weights/beta value.
            x.dot(grads.T) : This gives us matrix multiplication return for all values at once.
                             Matrix multiplication has one basic rule i.e., no of columns of the first matrix must be equal to no of rows of the second matrix
                             Suppose A = (3,4) then B must = (4,5), resultant matrix will be of size (3,5)
                             In general A = (n,m) and B = (m,p) resultant matrix will be (n,p) size matrix
            axis : 1 means the sum runs across the columns, giving one value per row
            keepdims : By default numpy returns the summed result with shape (n,),
                       dropping the summed dimension; keepdims=True keeps the shape (n,1), which we need for the later computation.
         5. sqPred = np.power((y - pred),2)
                     (ACTUAL - PREDICTED) ** 2
             np.power : This function raises any value to the given power; here we are raising the value to the power of 2
                        Why only to power of 2, not 3 or 4?
                        Power of 1 keeps the sign of the value, power of 2 gets rid of the negative sign.
                        Power of 3 gives a negative value again and power of 4 gets rid of the negative sign but exaggerates large errors even more.
                        Example: (2-3)^1 = -1, |2-3| = 1, (2-3)^2 = 1 -> The benefits of squaring include:
                        - Squaring always gives a positive value, so the sum will not be zero.
                        - Squaring emphasizes larger differences — a feature that turns out to be both good and bad (think                           of the effect outliers have).
          6. loss = np.sum(sqPred)/x.shape[0]
             np.sum : Add all of the values. 
             x.shape[0] : gives you total row/point of the data

In [None]:
def gradient_descent(grads, x, y, verbose=True):
    """Evaluate the mean squared error of a linear model for given weights.

    Despite its name, this helper performs no weight update: it predicts with
    the weights it is given, measures the loss and returns both, printing the
    intermediate values along the way for teaching purposes.

    Parameters
    ----------
    grads : array-like, shape (k, n_features)
        Model weights. Normally ``k == 1`` (one weight per feature); if
        several rows are passed, their predictions are summed per sample.
    x : array-like, shape (n_samples, n_features)
        Feature matrix.
    y : array-like, shape (n_samples,) or (n_samples, 1)
        Target values.
    verbose : bool
        When true (the default), print the weights, shapes, predictions,
        squared errors and loss.

    Returns
    -------
    tuple
        ``(grads, loss)``: the weights exactly as passed in (as an array) and
        the mean squared error they produce.
    """
    # Use the caller's weights; re-drawing random ones here would discard
    # whatever the caller had learned so far.
    grads = np.asarray(grads)
    x = np.array(x)
    y = np.array(y)
    if y.ndim == 1:
        # Keep y as a column so `y - pred` cannot broadcast to (n, n).
        y = y.reshape(-1, 1)

    if verbose:
        print('Grads : \n' , pd.DataFrame(grads))
        print('Init Wi : ' , grads.shape, ' : ', grads.T.shape)
        print('X Shape : ' , x.shape)
        print('Y Shape : ', y.shape)

    # One prediction per row of x; keepdims keeps the (n_samples, 1) shape.
    pred = np.sum(x.dot(grads.T), axis=1, keepdims=True)
    if verbose:
        print('PRED : ' , pred.shape , ' :\n' , pd.DataFrame(pred))

    sqPred = np.power((y - pred), 2)
    if verbose:
        print('SQR Error : \n' , pd.DataFrame(sqPred))

    loss = np.sum(sqPred) / x.shape[0]
    if verbose:
        print('LOSS : ' , loss)

    return grads, loss
        
# One random weight per feature, shape (1, n_features): x.dot(grads.T) then
# yields a single prediction per row. A (n_rows, n_features) weight matrix
# would build a 10000 x 10000 intermediate product for no benefit.
grads = np.random.randn(1, x.shape[1])
gradient_descent(grads, x, y)

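Convergence : As you train your algorithm using the gradient descent approach, the new weights are obtained by multiplying the gradient by the learning rate and subtracting the result from the current weights. Training has converged when the weights change by less than a small threshold between two consecutive steps.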

In [None]:
# Batch gradient descent on the mean squared error of x . W against y.
convergence = 1e-10
learning_rate = 0.001
max_iterations = 100000  # safety cap so the loop always terminates
loss = []
# One weight per feature, shape (1, n_features).
grads = np.random.randn(1, x.shape[1])
iterations = 1
while iterations <= max_iterations:
    residual = x.dot(grads.T) - y  # (n_samples, 1)
    error = np.sum(np.power(residual, 2)) / x.shape[0]
    # Derivative of mean(residual ** 2) with respect to the weights.
    gradient = (2.0 / x.shape[0]) * residual.T.dot(x)  # (1, n_features)
    new_grads = grads - learning_rate * gradient

    # Converged once one update barely moves the weights any more.
    if np.sum(abs(new_grads - grads)) < convergence:
        print("Converged")
        grads = new_grads
        loss.append(error)
        break

    if iterations % 100 == 0:
        print("Iteration: %d - Error: %.4f" %(iterations, error))

    grads = new_grads
    iterations += 1
    loss.append(error)
else:
    print("Stopped after %d iterations without converging" % max_iterations)

In [None]:
# Train the from-scratch model on the training split (fit returns the model
# itself, so construction and training can be chained), then score its
# predictions on the held-out test split.
lr = LinearRegression_Scratch(alpha=0.001, intercept=False).fit(train_x, train_y)
pred = lr.predict(test_x)
mean_squared_error(y_true=test_y, y_pred=pred)

In [None]:

from sklearn.metrics import accuracy_score, mean_squared_error

# Reference model: scikit-learn's LinearRegression trained on the same split.
lr_sk = LinearRegression()
lr_sk.fit(train_x,train_y)
# Predict with the scikit-learn model itself; using `lr` here would score the
# from-scratch model a second time instead of the reference.
pred_sk = lr_sk.predict(test_x)
mean_squared_error(test_y,pred_sk)

In [None]:
# Batch gradient descent on the mean squared error of x . W against y.
convergence = 1e-10
learning_rate = 0.001
max_iterations = 100000  # safety cap so the loop always terminates
loss = []
# One weight per feature, shape (1, n_features).
grads = np.random.randn(1, x.shape[1])
iterations = 1
while iterations <= max_iterations:
    residual = x.dot(grads.T) - y  # (n_samples, 1)
    error = np.sum(np.power(residual, 2)) / x.shape[0]
    # Derivative of mean(residual ** 2) with respect to the weights.
    gradient = (2.0 / x.shape[0]) * residual.T.dot(x)  # (1, n_features)
    new_grads = grads - learning_rate * gradient

    # Converged once one update barely moves the weights any more.
    if np.sum(abs(new_grads - grads)) < convergence:
        print("Converged")
        grads = new_grads
        loss.append(error)
        break

    if iterations % 100 == 0:
        print("Iteration: %d - Error: %.4f" %(iterations, error))

    grads = new_grads
    iterations += 1
    loss.append(error)
else:
    print("Stopped after %d iterations without converging" % max_iterations)

In [None]:
from matplotlib import pyplot as plt

# Learning curve: iteration number on the x-axis, recorded loss on the y-axis.
plt.scatter(x=range(len(loss)), y=loss)
plt.xlabel("Iteration")
plt.ylabel("Loss")

In [None]:
# Number of recorded loss values, index of the largest loss, index of the smallest loss.
len(loss),np.argmax(loss),np.argmin(loss)

In [None]:
def simpleLinearRegression_Fit(x,y):
    """Fit ``y = b0 + b1 * x`` by ordinary least squares (closed form).

    With N samples the coefficients are::

        b1 = (N Σ(xy) − Σx Σy) / (N Σ(x²) − (Σx)²)
        b0 = (Σy − b1 Σx) / N

    Parameters
    ----------
    x, y : array-like
        Sample values of the independent and dependent variable. Both are
        flattened to 1-D, so lists, 1-D arrays and column vectors all work.

    Returns
    -------
    tuple
        ``(b0, b1)``: the intercept and the slope.

    Raises
    ------
    ValueError
        If ``x`` and ``y`` differ in length, or ``x`` has fewer than two
        distinct values (the slope is then undefined).
    """
    # Flatten both inputs so a column-vector x and a 1-D y multiply
    # element-wise instead of broadcasting to an (n, n) matrix.
    x = np.array(x, dtype=float).ravel()
    y = np.array(y, dtype=float).ravel()
    n_samples = x.shape[0]

    if n_samples != y.shape[0]:
        raise ValueError("x and y must contain the same number of values")
    if n_samples < 2 or np.ptp(x) == 0:
        # All x equal (or too few points) makes the denominator zero.
        raise ValueError("x must contain at least two distinct values")

    x_sum = np.sum(x)
    y_sum = np.sum(y)
    x_2_sum = np.sum(np.power(x, 2))
    xy_sum = np.sum(np.multiply(x, y))

    b1 = (n_samples * xy_sum - x_sum * y_sum) / (n_samples * x_2_sum - x_sum ** 2)
    b0 = (y_sum - b1 * x_sum) / n_samples
    return b0,b1

def predict(b0,b1,x):
    """Evaluate the fitted line ``b0 + b1 * x`` at every value in ``x``.

    ``x`` is converted to a numpy array first, so a list of inputs yields an
    array of predictions.
    """
    features = np.array(x)
    slope_term = b1 * features
    return b0 + slope_term

In [None]:
# Small worked example for the closed-form fit. Note that this rebinds the
# notebook-level names x and y to short Python lists.
x, y = [2, 3, 5, 7, 9], [4, 5, 7, 10, 15]
x_test = [3, 4, 12, 12, 23]

# Fit intercept and slope on (x, y), then evaluate the line at the test points.
b0, b1 = simpleLinearRegression_Fit(x, y)
predict(b0, b1, x_test)