# Multiple linear regression with sklearn

In [1]:
#from sklearn.linear_model import LinearRegression
#from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
#import random
import pandas as pd



## Read the Computer Hardware Data Set

In [2]:
# The raw UCI file has no header row, so read it header-less and then
# attach the ten column names by hand.
data = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/cpu-performance/machine.data",
    header=None,
)
data.columns = [
    "vendor", "Model", "MYCT", "MMIN", "MMAX",
    "CACH", "CHMIN", "CHMAX", "PRP", "ERP",
]
# Preview the first five rows.
data.head()

Unnamed: 0,vendor,Model,MYCT,MMIN,MMAX,CACH,CHMIN,CHMAX,PRP,ERP
0,adviser,32/60,125,256,6000,256,16,128,198,199
1,amdahl,470v/7,29,8000,32000,32,8,32,269,253
2,amdahl,470v/7a,29,8000,32000,32,8,32,220,253
3,amdahl,470v/7b,29,8000,32000,32,8,32,172,253
4,amdahl,470v/7c,29,8000,16000,32,8,16,132,132


## Extract X and Y data and convert to arrays

In [3]:
# Features: the six numeric hardware columns; target: the PRP column.
X = np.array(data.loc[:, ["MYCT", "MMIN", "MMAX", "CACH", "CHMIN", "CHMAX"]])
Y = np.array(data.loc[:, "PRP"])
# Number of observations (rows of X).
ndata = X.shape[0]
print(ndata)

209


In [4]:
# Peek at the first three rows of the feature array.
X[:3]

array([[  125,   256,  6000,   256,    16,   128],
       [   29,  8000, 32000,    32,     8,    32],
       [   29,  8000, 32000,    32,     8,    32]])

## Divide into test and training sets

In [5]:
import random

# Fixed seed so the train/test partition -- and every coefficient and score
# computed from it -- is the same on every Restart & Run All.
# Change SPLIT_SEED to draw a different split.
SPLIT_SEED = 42

indices = list(range(ndata))
# A private Random instance keeps the shuffle reproducible without touching
# the global `random` state used elsewhere.
random.Random(SPLIT_SEED).shuffle(indices)

# 75% of the rows go to training, the remainder to testing.
ntrain = round(.75*ndata)
training_indices = indices[:ntrain]
test_indices = indices[ntrain:]

In [6]:
# Select rows by the shuffled index lists; targets are reshaped into
# (n, 1) column vectors.
XTRAIN, XTEST = (X[rows] for rows in (training_indices, test_indices))
YTRAIN, YTEST = (
    Y[rows].reshape(-1, 1) for rows in (training_indices, test_indices)
)

## Display some of the data

In [7]:
# Show the first three training feature rows and their matching targets.
print(XTRAIN[:3])
print(YTRAIN[:3])

[[  480   512  8000    32     0     0]
 [   23 16000 32000    64    16    32]
 [   26 16000 32000    64    16    24]]
[[ 67]
 [489]
 [465]]


## Multilinear regression with scikit learn

In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [9]:
# Fit a linear regression (default settings) on the training split.
r=LinearRegression().fit(XTRAIN,YTRAIN)

In [10]:
# Display the fitted estimator and its settings.
r

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [11]:
# Intercept of the fitted model.
print(r.intercept_)

[-55.79490706]


In [12]:
# Fitted coefficients, one per feature column of XTRAIN.
print(r.coef_)

[[ 0.04860612  0.0125405   0.00631709  0.70457547 -1.61978327  1.69010283]]


## Make a prediction with sklearn

In [13]:
# Predict targets for the held-out test rows.
P=r.predict(XTEST)

In [14]:
# Score the sklearn predictions against the observed test targets:
# R^2 (coefficient of determination) and mean squared error.
R2=r2_score(YTEST,P)
MSE = mean_squared_error(YTEST,P)
print(R2,MSE)

0.7975113911332077 5975.775754556711


## Define functions to calculate the Normal matrix

In [15]:
def Normal_Matrix(xdata, ydata):
    """Build the normal-equation system for least squares with an intercept.

    A column of ones is prepended to ``xdata`` to form the design matrix
    ``A``; the two sides of ``(A^T A) b = A^T y`` are returned.

    Parameters
    ----------
    xdata : array-like, shape (nrows, ncols)
        Feature matrix, one observation per row.
    ydata : array-like, shape (nrows,) or (nrows, 1)
        Observed target values.

    Returns
    -------
    (N, ATy) : tuple of numpy.ndarray
        ``N = A^T A`` with shape (ncols + 1, ncols + 1) and ``ATy = A^T y``.
        If ``xdata`` is not 2-D, a message is printed and a single float NaN
        is returned instead of the tuple (kept for backward compatibility).
    """
    # Accept nested lists as well as ndarrays.
    xdata = np.asarray(xdata)
    ydata = np.asarray(ydata)
    if xdata.ndim != 2:
        print("Not a 2-D array")
        return(float("NAN"))

    nrows = xdata.shape[0]
    # Leading column of ones models the intercept term.
    A = np.hstack((np.ones((nrows, 1)), xdata))
    AT = A.T
    return (AT.dot(A), AT.dot(ydata))
# Build the normal-equation matrix and right-hand side from the training split.
N, ATY=Normal_Matrix(XTRAIN,YTRAIN)

In [16]:
import numpy.linalg

In [17]:
# Solve N b = A^T y for the coefficient vector b (intercept first).
sol = np.linalg.solve(N, ATY)
sol

array([[-5.57949071e+01],
       [ 4.86061221e-02],
       [ 1.25405020e-02],
       [ 6.31709086e-03],
       [ 7.04575470e-01],
       [-1.61978327e+00],
       [ 1.69010283e+00]])

In [18]:
# Same solution rounded to three decimals for easier reading.
s = np.linalg.solve(N, ATY).round(3)
s

array([[-5.5795e+01],
       [ 4.9000e-02],
       [ 1.3000e-02],
       [ 6.0000e-03],
       [ 7.0500e-01],
       [-1.6200e+00],
       [ 1.6900e+00]])

In [19]:
def predicted_values(coefficients, xvalues):
    """Evaluate a fitted linear model on each row of ``xvalues``.

    Parameters
    ----------
    coefficients : array-like, length nfeatures + 1
        Model coefficients with the intercept first; may be 1-D or an
        (nfeatures + 1, 1) column.
    xvalues : array-like, shape (nvectors, nfeatures)
        Feature rows to predict for.

    Returns
    -------
    numpy.ndarray, shape (nvectors, 1)
        ``intercept + row . slopes`` for every row. If the number of
        coefficients is not ``nfeatures + 1``, a message is printed and a
        single float NaN is returned instead.
    """
    xvalues = np.asarray(xvalues)
    n = len(coefficients)
    nvectors, nfeatures = xvalues.shape
    if n != nfeatures + 1:
        print("The number of coefficients is",n,"the vectors have length",nfeatures)
        return(float("NAN"))

    # Flatten so (n,) and (n, 1) coefficient arrays behave the same and the
    # product below is a plain 1-D vector (no float() on 1-element arrays,
    # which NumPy deprecated).
    beta = np.asarray(coefficients, dtype=float).reshape(-1)
    # Prepend the constant 1 that multiplies the intercept, then do all rows
    # in one matrix-vector product instead of a Python loop.
    design = np.hstack((np.ones((nvectors, 1)), xvalues))
    return design.dot(beta).reshape(-1, 1)
# Predictions on the test split using the normal-equation solution.
PRE=predicted_values(sol,XTEST)

In [20]:
def MSS(YO, YP):
    """Mean of the squared differences between observed and predicted values.

    Parameters
    ----------
    YO : array-like
        Observed values, 1-D or an (n, 1) column.
    YP : array-like
        Predicted values, 1-D or an (n, 1) column.

    Returns
    -------
    float
        ``sum((YO - YP)**2) / len(YP)``.

    Raises
    ------
    ZeroDivisionError
        If ``YP`` is empty.
    """
    n = len(YP)
    # Flatten both sides so a 1-D vector and an (n, 1) column line up
    # element-wise instead of broadcasting to an (n, n) grid.
    residuals = np.asarray(YO, dtype=float).ravel() - np.asarray(YP, dtype=float).ravel()
    # np.sum collapses to a scalar, so float() never receives a 1-element
    # array (deprecated in NumPy); dividing the Python float keeps
    # ZeroDivisionError for empty input.
    return float(np.sum(residuals**2)) / n
def TSS(YOBS):
    """Total sum of squares: squared deviations of ``YOBS`` from its mean.

    Parameters
    ----------
    YOBS : array-like
        Observed values, 1-D or an (n, 1) column.

    Returns
    -------
    float
        ``sum((mean(YOBS) - YOBS)**2)``.
    """
    values = np.asarray(YOBS, dtype=float)
    # np.sum reduces over every element and yields a scalar, avoiding
    # float() on a 1-element array (deprecated in NumPy).
    return float(np.sum((values.mean() - values)**2))

In [21]:
# Mean squared error of the normal-equation predictions on the test split.
MSS(YTEST, PRE)

5975.775754556709

In [22]:
# Total sum of squares of the observed test targets.
TSS(YTEST)

1534606.5192307695

In [23]:
def MYR2(YOBS,YPRE):
    """Coefficient of determination, 1 - RSS / TSS.

    RSS is recovered from the mean squared residual returned by ``MSS``
    multiplied by the number of observations; TSS comes from ``TSS``.
    """
    count = len(YOBS)
    residual_ss = count * MSS(YOBS, YPRE)
    total_ss = TSS(YOBS)
    unexplained_fraction = residual_ss / total_ss
    return 1 - unexplained_fraction
# R^2 of the normal-equation predictions on the test split.
MYR2(YTEST,PRE)

0.7975113911332077

In [24]:
# Inline cross-check of R^2: 1 - (mean squared residual) / (mean squared
# deviation of the targets from their mean).
n=len(YTEST)
numerator=sum((YTEST-PRE)**2)/n
ybar=np.mean(YTEST)
denom=sum((YTEST-ybar)**2)/n
# Built-in sum over the (n, 1) column arrays leaves a 1-element array, so
# the result displays as an array rather than a bare float.
1-numerator/denom

array([0.79751139])

In [25]:
# Number of rows in the test split.
len(YTEST)

52