In [66]:
import numpy as np
import  pandas as pd
import matplotlib.pyplot as plt

In [67]:
# Load the raw table and preview its first ten rows
d = pd.read_csv('regression_data1.csv')
print(d.head(10))
# Every column except the last is a feature; the last column is the target
x = d.iloc[:, :-1].to_numpy()
y = d.iloc[:, -1].to_numpy()

   R&D Spend  Administration  Marketing Spend       State     Profit
0  165349.20       136897.80        471784.10    New York  192261.83
1  162597.70       151377.59        443898.53  California  191792.06
2  153441.51       101145.55        407934.54     Florida  191050.39
3  144372.41       118671.85        383199.62    New York  182901.99
4  142107.34        91391.77        366168.42     Florida  166187.94
5  131876.90        99814.71        362861.36    New York  156991.12
6  134615.46       147198.87        127716.82  California  156122.51
7  130298.13       145530.06        323876.68     Florida  155752.60
8  120542.52       148718.95        311613.29    New York  152211.77
9  123334.88       108679.17        304981.62  California  149759.96


In [68]:
# One-hot encode the categorical column at index 3 (State in the preview above).
# The encoded indicator columns come first in the result; the remaining
# columns are passed through unchanged after them.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [3])],
    remainder='passthrough',
)
x = np.array(ct.fit_transform(x))
print(x[:5])

[[0.0 0.0 1.0 165349.2 136897.8 471784.1]
 [1.0 0.0 0.0 162597.7 151377.59 443898.53]
 [0.0 1.0 0.0 153441.51 101145.55 407934.54]
 [0.0 0.0 1.0 144372.41 118671.85 383199.62]
 [0.0 1.0 0.0 142107.34 91391.77 366168.42]]


In [69]:
# Hold out one third of the rows for testing; random_state pins the shuffle
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=1/3,
    random_state=0,
)

In [70]:
# Standardise the features: the scaler is fitted on the training rows only,
# then the same fitted transformation is applied to both splits.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler().fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

In [71]:
def getInputs(x,y):
    """Build the design matrix, target column and initial weights.

    Parameters
    ----------
    x : array-like of shape (m, no_of_variable)
        Feature rows. Any array-like (list of lists, object-dtype array, ...)
        is accepted and converted to a float array.
    y : array-like with m values
        Targets; reshaped to a single column of shape (m, 1).

    Returns
    -------
    x : ndarray of shape (m, no_of_variable + 1)
        The features with a leading column of ones (used for theta0).
    y : ndarray of shape (m, 1)
    theta : ndarray of shape (no_of_variable + 1, 1)
        Weights initialised to zero.

    Raises
    ------
    ValueError
        If ``x`` is not 2-D, or ``x`` and ``y`` disagree on the row count.

    The three resulting shapes are printed as a side effect.
    """
    # Coerce up front so lists and object-dtype arrays behave like float arrays
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float).reshape(-1, 1)
    if x.ndim != 2:
        raise ValueError("x must be 2-D with shape (m, no_of_variable)")
    if x.shape[0] != y.shape[0]:
        # Without this check a length-1 y would silently broadcast later on
        raise ValueError("x and y must have the same number of rows")

    # Initialising theta: one weight per feature plus theta0
    theta = np.zeros(shape=(x.shape[1] + 1, 1))
    # Horizontal stacking of x with ones for evaluation of theta0
    ones = np.ones(shape=(x.shape[0], 1))
    x = np.hstack((ones, x))

    print("X Shape : ",x.shape)
    print("Y Shape : ",y.shape)
    print("Theta Shape : ",theta.shape)
    return x, y, theta

In [72]:
#hypothesis function
def h(x,theta):
    """Return the linear hypothesis, i.e. the dot product of ``x`` and ``theta``."""
    predictions = np.dot(x, theta)
    return predictions

In [73]:
#gradient descent function
def gradientDescent(x,y,alpha=0.0,iterations=1000):
    """Fit linear-regression weights with batch gradient descent.

    Parameters
    ----------
    x, y :
        Features and targets, passed to ``getInputs``, which adds the column
        of ones, reshapes ``y`` to a column and creates the zero initial theta.
    alpha : float, default 0.0
        Learning rate. With the default of 0.0 every update step is scaled
        by zero, so pass a positive value to actually train.
    iterations : int, default 1000
        Number of update steps to run (a fixed count; no convergence test).

    Returns
    -------
    theta : ndarray of shape (no_of_variable + 1, 1)
    """
    x, y, theta = getInputs(x, y)
    # Loop-invariant step size: learning rate divided by the number of rows
    step = alpha / x.shape[0]
    for _ in range(iterations):
        residual = h(x, theta) - y
        # x.T . residual is the column-wise sum of residual * x, computed
        # without materialising the (m, no_of_variable + 1) product
        theta = theta - step * np.dot(x.T, residual)
    return theta

In [74]:
# Fit the weights on the scaled training split with a learning rate of 0.02
theta = gradientDescent(x_train, y_train, alpha=0.02)
print(theta)

X Shape :  (33, 7)
Y Shape :  (33, 1)
Theta Shape :  (7, 1)
[[108167.60496947]
 [  -536.21702607]
 [  1630.16902164]
 [  -649.78542513]
 [ 39711.69674944]
 [  1408.12655764]
 [  3316.37053057]]


In [75]:
def predict(x,theta):
    """Return hypothesis values for each row of ``x``.

    Parameters
    ----------
    x : array-like of shape (m, no_of_variable)
        Feature rows without the column of ones; converted to a float array.
    theta : array-like of shape (no_of_variable + 1, 1)
        Weights, with theta0 first.

    Returns
    -------
    ndarray
        ``h`` applied to the features after a leading column of ones has
        been stacked on.
    """
    x = np.asarray(x, dtype=float)
    # Horizontal stacking of x with ones for evaluation of theta0
    ones = np.ones(shape=(x.shape[0], 1))
    design = np.hstack((ones, x))
    return h(design, theta)

In [76]:
# Hypothesis values for the scaled test rows (displayed as the cell result)
predict(x_test, theta)

array([[111416.70662755],
       [132466.33757397],
       [139625.94483241],
       [ 75912.47690705],
       [185821.7888661 ],
       [113383.06259343],
       [ 63243.90549304],
       [ 99112.64858715],
       [119275.46571795],
       [174947.86618649],
       [100802.21507717],
       [ 85935.9729772 ],
       [117298.96909273],
       [ 90658.67291799],
       [133142.32836248],
       [167189.05553124],
       [157619.25578866]])