## Multivariable Linear Regression using Gradient Descent

In [72]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [73]:
df = pd.read_table('http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data', names = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model year', 'origin', 'car name'], delim_whitespace=True)
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino


In [74]:
df.drop(labels = 'car name', axis = 1, inplace=True)
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,1
2,18.0,8,318.0,150.0,3436.0,11.0,70,1
3,16.0,8,304.0,150.0,3433.0,12.0,70,1
4,17.0,8,302.0,140.0,3449.0,10.5,70,1


In [75]:
df.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower       object
weight          float64
acceleration    float64
model year        int64
origin            int64
dtype: object

In [76]:
# for col in df.columns:
#     if df[col] == '?':
#         print(df[col])
df.horsepower.replace('?', '150.0', inplace=True)
df.horsepower = df.horsepower.astype(float)

In [77]:
Y = df.mpg
X = df.iloc[: ,1: ]
X.head()

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model year,origin
0,8,307.0,130.0,3504.0,12.0,70,1
1,8,350.0,165.0,3693.0,11.5,70,1
2,8,318.0,150.0,3436.0,11.0,70,1
3,8,304.0,150.0,3433.0,12.0,70,1
4,8,302.0,140.0,3449.0,10.5,70,1


In [78]:
X.shape

(398, 7)

## First we will use Scikit learn

In [79]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y)
Y_test = np.array(Y_test).reshape(-1, 1)
Y_train = np.array(Y_train).reshape(-1, 1)
print(X_train.shape)
print(X_test.shape)
print(Y_test.shape)
print(Y_train.shape)

(298, 7)
(100, 7)
(100, 1)
(298, 1)


In [80]:
from sklearn.linear_model import LinearRegression
algo = LinearRegression()
algo.fit(X_train, Y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [135]:
Y_predict = algo.predict(X_test)
from sklearn.metrics import mean_squared_error
print(mean_squared_error(Y_test, Y_predict))
print(algo.coef_)

12.098872440063541
[[-0.60322502  0.02132261  0.0018818  -0.00722254  0.22364982  0.73075729
   1.16192903]]


In [82]:
print(X_train.shape)

(298, 7)


## Now we will create our own algo

### Batch Gradient Descent

In [151]:
def gradient_descent(X, Y):
    epochs = 1000
    learning_rate = 0.01
    m = np.zeros((1,8), dtype=float)
    for i in range(epochs):
        m = step_gd(m, learning_rate, X, Y)
        if i % 100 == 0:
            print(i // 100, 'Cost:', cost(m, X, Y))
    return m

In [152]:
def step_gd(m, lrate, x, y):
    N = len(x)
    J = len(x[0]) ## no of features
    m_slope = np.zeros(m.shape)
    for i in range(N):
        Y = y[i]
        X = x[i,0:J]
        mx = np.sum(m * X)
        m_slope += lrate *(-2/N) * (Y - mx) * X
    new_m = m - (m_slope)
    return new_m    

In [153]:
def cost(m, x, y):
    y_hat = m * x
    N = y.shape[0]
    total_cost = 0.0
    for i in range(N):
        total_cost += (1/N) * ((y[i] - np.sum(y_hat[i])) ** 2)
    return total_cost

In [154]:
def run():
    X = np.array(X_train)
    row = X.shape[0]
    col = X.shape[1]
    for i in range(row):
        for j in range(col):
            X[i][j] = (X[i][j] - np.mean(X[:, j])) / np.std(X[:, j]) 
    x = np.insert(X,7, 1.0, axis=1) ##inserting 1 at 7th column and every row 
    y = Y_train
    m = gradient_descent(x, y)
    return m

In [155]:
m = run()
print(m)

0 Cost: [449.72999451]
1 Cost: [57.28617814]
2 Cost: [33.74908407]
3 Cost: [25.57114929]
4 Cost: [22.08550239]
5 Cost: [20.44853563]
6 Cost: [19.59718788]
7 Cost: [19.10218743]
8 Cost: [18.78318347]
9 Cost: [18.5604951]
[[-2.36692722  0.61267826 -1.18995059 -4.4486421  -0.40777163  6.08678018
   1.58179141 23.30646987]]


In [156]:
x = np.insert(np.array(X_test),7, 1.0, axis=1)
mx = m * x
y_predict2 = np.ones(Y_test.shape)
for i in range(len(mx)):
    y_predict2[i] = np.sum(mx[i])

In [157]:
mean_squared_error(y_predict2, Y_test)

183157042.3441153

#### This model overfits the training data, after Cleaning the Data error will decrease