# Linear Regression in Python

Let us first do the regression in a semi-manual way, by evaluating the normal equations $\hat{\beta} = (X^T X)^{-1} X^T Y$ step by step.

In [1]:
from numpy import array, dot, newaxis
from scipy import linalg

In [6]:
# Design matrix: first column is all ones (intercept term), second column
# holds the predictor values 1..4.
X = array([
    [1, 1],
    [1, 2],
    [1, 3],
    [1, 4],
])
# Response as a (4, 1) column vector; it equals the predictor exactly,
# so the fitted intercept should be ~0 and the slope ~1.
Y = array([1, 2, 3, 4])[:, newaxis]

In [7]:
# Normal equations evaluated piece by piece: coef_ = (X'X)^-1 (X'Y).
n = linalg.inv(X.T.dot(X))  # inverse of the Gram matrix X'X
k = X.T.dot(Y)              # cross-product X'Y
coef_ = n.dot(k)            # estimated coefficients, one row per column of X

In [9]:
# Function-call form of print: valid on both Python 2 and Python 3, and
# consistent with the print(...) calls used later in this notebook.
print(coef_)

[[ -3.55271368e-15]
 [  1.00000000e+00]]


## Let us create our first machine learning implementation

In [11]:
def myregression(input, response):
    """Estimate least-squares coefficients from the normal equations.

    Solves ``(input.T @ input) @ beta = input.T @ response`` for ``beta``.

    Parameters
    ----------
    input : ndarray, shape (n_samples, n_features)
        Design matrix. Include a column of ones if an intercept is wanted.
    response : ndarray, shape (n_samples,) or (n_samples, n_targets)
        Observed values to regress on ``input``.

    Returns
    -------
    ndarray
        Coefficient array with ``n_features`` rows; it is 1-D when
        ``response`` is 1-D and 2-D otherwise.

    Raises
    ------
    scipy.linalg.LinAlgError
        If ``input.T @ input`` is singular (e.g. collinear columns).
    """
    gram = dot(input.T, input)
    moment = dot(input.T, response)
    # Solve the linear system directly instead of forming the explicit
    # inverse: same result, but cheaper and with less round-off error.
    return linalg.solve(gram, moment)

In [12]:
# Fit the toy data; the result should match the step-by-step coefficients above.
myregression(input=X, response=Y)

array([[ -3.55271368e-15],
       [  1.00000000e+00]])

## Ordinary Least Squares (OLS) Estimation

We are going to use a package called StatsModels, which fits the model and reports the usual regression diagnostics.

In [39]:
from __future__ import division, print_function
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
# Show NumPy floats with 3 digits of precision in printed output.
np.set_printoptions(precision=3)

Let us create some synthetic data: a quadratic function of `x` with known coefficients plus Gaussian noise.

In [23]:
nsample = 100
# Evenly spaced predictor values on [0, 10]. Sized from nsample (not a
# repeated literal) so x always has the same length as the noise vector e.
x = np.linspace(0, 10, nsample)

# Two regressor columns: the linear term and the quadratic term.
X = np.column_stack((x, x**2))
# True coefficients: three entries, one for the constant column that is
# added to X in a later cell plus one per column of X above.
beta = np.array([1, 0.1, 10])
# Seeded, local generator so the notebook reproduces the same noise on every
# run without touching NumPy's global random state.
rng = np.random.RandomState(0)
e = rng.normal(size=nsample)

Let us now add a column of 1s to `X` so that the model includes an intercept.

In [24]:
# Give the design matrix a constant column so the model has an intercept.
X = sm.add_constant(X)
# Simulated response: the true linear predictor plus the noise vector.
y = X.dot(beta) + e

In [35]:
# Build (but do not yet fit) the OLS model: y is the dependent variable,
# X the design matrix including the constant column.
model = sm.OLS(endog=y, exog=X)
model

<statsmodels.regression.linear_model.OLS at 0x10b5be590>

In [36]:
# Estimate the model; the returned results object is summarised in the next cell.
result = model.fit()

In [38]:
# Print the plain-text regression summary table for the simulated data.
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 3.438e+06
Date:                Thu, 21 May 2015   Prob (F-statistic):          5.61e-236
Time:                        20:31:50   Log-Likelihood:                -154.36
No. Observations:                 100   AIC:                             314.7
Df Residuals:                      97   BIC:                             322.5
Df Model:                           2                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          0.9759      0.338      2.885      0.005         0.305     1.647
x1             0.1409      0.156      0.901      0.3

In [62]:
# Load the mammals dataset from a CSV in the current working directory.
# Later cells read its 'body' and 'brain' columns.
mammals = pd.read_csv("mammals.csv")





In [64]:
# Copy the 'body' column into an (n_rows, 1) column vector.
# A vectorised reshape replaces the original list comprehension, whose loop
# variable `x` leaked on Python 2 and overwrote the global `x` array defined
# in the synthetic-data section.
body = np.array(mammals['body'].values).reshape(-1, 1)

In [65]:
# Check that body is a 2-D column vector (rows, 1).
body.shape

(62, 1)

In [66]:
# Same column vector, built by adding an axis and casting to float.
# Index the underlying NumPy array (.values) rather than the Series itself:
# multi-dimensional indexing on a Series is deprecated and raises in newer
# pandas versions.
body = mammals['body'].values[:, np.newaxis].astype(float)

In [67]:
# Response variable: the 'brain' column as a 1-D NumPy array.
brain = mammals['brain'].values

In [72]:
# Check that the response is 1-D with one entry per row of body.
brain.shape

(62,)

In [73]:
# Add a constant column so the fit includes an intercept
# (it appears as `const` in the summary table).
# NOTE: this rebinds `body`; from here on it has two columns.
body = sm.add_constant(body)

In [74]:
# Regress brain on body (design matrix already carries the constant column),
# then fit by ordinary least squares.
regression1 = sm.OLS(endog=brain, exog=body)
results1 = regression1.fit()

In [75]:
# Print the plain-text regression summary table for the mammals fit.
print(results1.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.873
Model:                            OLS   Adj. R-squared:                  0.871
Method:                 Least Squares   F-statistic:                     411.2
Date:                Thu, 21 May 2015   Prob (F-statistic):           1.54e-28
Time:                        20:56:29   Log-Likelihood:                -447.38
No. Observations:                  62   AIC:                             898.8
Df Residuals:                      60   BIC:                             903.0
Df Model:                           1                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const         91.0044     43.553      2.090      0.041         3.886   178.123
x1             0.9665      0.048     20.278      0.0