In [130]:
import pandas as pd
import numpy as np
from sklearn import (linear_model,
                     metrics,
                     preprocessing,
                     model_selection)
import matplotlib.pyplot as plt
import seaborn as sns

In [131]:
# Column labels for the housing file, which is read without a header row.
column_names = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV',
]
# Fields are separated by runs of whitespace; names are supplied explicitly
# because header=None tells pandas the first line is data.
boston_data = pd.read_csv('data/housing.csv', header=None, delimiter=r"\s+", names=column_names)
# Work on a copy so the frame as loaded stays available unchanged.
df = boston_data.copy()
# Only the last expression of a cell is displayed, so a single preview suffices.
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [132]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
df.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


In [133]:
# ones = np.ones((boston_data.shape[0], 1))
# df = pd.concat([pd.DataFrame(ones), boston_data], axis=1).rename(columns={0: 'ONES'})
# df.head()

In [134]:
# Separate the response column from the predictor columns.
target_column = 'MEDV'
y = df[target_column]
X = df.drop(columns=[target_column])

In [135]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=13,
)

In [136]:
# Fit ordinary least squares on the training split and predict on both splits.
lin_mod = linear_model.LinearRegression()
lin_mod.fit(X_train, y_train)
y_train_pred = lin_mod.predict(X_train)
y_test_pred = lin_mod.predict(X_test)

# (format template, value) pairs, listed in the order they are printed.
# MAPE is scaled by 100 so it is reported as a percentage.
report = [
    ('MAE test : {:.3f}', metrics.mean_absolute_error(y_test, y_test_pred)),
    ('MAE train : {:.3f}', metrics.mean_absolute_error(y_train, y_train_pred)),
    ('MAPE test : {:.3f}', metrics.mean_absolute_percentage_error(y_test, y_test_pred) * 100),
    ('MAPE train : {:.3f}', metrics.mean_absolute_percentage_error(y_train, y_train_pred) * 100),
    ('MSE test: {:.3f}', metrics.mean_squared_error(y_test, y_test_pred)),
    ('MSE train: {:.3f}', metrics.mean_squared_error(y_train, y_train_pred)),
    ('R2 test: {:.3f}', metrics.r2_score(y_test, y_test_pred)),
    ('R2 train: {:.3f}', metrics.r2_score(y_train, y_train_pred)),
]
for template, value in report:
    print(template.format(value))

MAE test : 3.627
MAE train : 3.224
MAPE test : 20.681
MAPE train : 15.533
MSE test: 24.318
MSE train: 21.556
R2 test: 0.732
R2 train: 0.739


In [137]:
# Display the fitted model's predictions for the held-out test rows.
y_test_pred

array([11.23728132, 19.65693542, 20.77279453, 30.01783844, 23.34685884,
       22.27047398, 21.84486005, 20.61260546, 19.29666529,  0.36793865,
       18.92138754, 18.959503  , 36.31015707, 10.7659897 , 15.33471911,
       18.80208983, 17.08008653, 20.87257606, 30.52648473, 21.66712266,
       36.16920572, 30.49725218, 22.87941979, 14.33087637, 18.34617256,
        6.67852144, 34.83033225, 16.82964023, 14.40608322, 19.66712403,
       29.10945203,  5.01376207, -5.40629161, 20.61394839, 20.68508527,
       14.9745791 , 19.50390664, 16.88251618, 34.65145054, 16.60728312,
       16.37781386, 20.78072572, 36.34448399, 12.53464158, 15.65838997,
       35.71307371, 14.72150456,  3.23413117, 21.60588655, 20.53810441,
       35.59894025, 10.52440061, 39.7987333 , 17.13435228, 22.09022132,
       20.00130664, 20.78652117, 32.47699491, 32.93750073, 23.42028846,
       21.89455379, 13.06687124, 21.17908961, 20.21627944, 22.28007322,
       30.16539802, 27.71540332, 16.52474502, 17.85753808, 20.41

In [138]:
# Fitted slope coefficients rounded to two decimals, then the intercept.
rounded_coefs = np.round(lin_mod.coef_, 2)
print(list(rounded_coefs))
print(lin_mod.intercept_)

[-0.15, 0.06, -0.01, 2.49, -14.96, 3.54, 0.0, -1.59, 0.37, -0.01, -0.89, 0.01, -0.56]
37.64291482674966


In [139]:
def l_m(X, y):
    """Fit a least-squares linear model with an intercept term.

    A column of ones is prepended to ``X`` and the least-squares problem
    ``min ||[1, X] w - y||`` is solved for ``w``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features) or (n_samples,)
        Predictor values; a 1-D input is treated as a single feature.
    y : array-like of shape (n_samples,)
        Response values.

    Returns
    -------
    numpy.ndarray
        Weights of length ``n_features + 1``; element 0 is the intercept and
        the remaining elements are the per-feature coefficients.

    Raises
    ------
    ValueError
        If ``X`` has no rows or ``X`` and ``y`` differ in length.
    """
    features = np.asarray(X, dtype=float)
    if features.ndim == 1:
        features = features.reshape(-1, 1)
    response = np.asarray(y, dtype=float)

    if features.shape[0] == 0:
        raise ValueError("X must contain at least one sample")
    if features.shape[0] != response.shape[0]:
        raise ValueError("X and y must have the same number of samples")

    design = np.column_stack([np.ones(features.shape[0]), features])
    # lstsq avoids forming an explicit inverse and tolerates rank deficiency.
    weights, *_ = np.linalg.lstsq(design, response, rcond=None)
    return weights

In [140]:
# Build the design matrix under its own name. Overwriting X_train would add a
# further ones column every time this cell is re-run, and would change the
# input of any earlier cell that is executed again afterwards.
X_train_design = np.column_stack([np.ones(X_train.shape[0]), X_train])
# Solve the normal equations (X^T X) w = X^T y directly rather than forming an
# explicit inverse, which costs more and is less accurate numerically.
gram = X_train_design.T @ X_train_design
# w[0] is the intercept (weight of the ones column); w[1:] are the feature weights.
w = np.linalg.solve(gram, X_train_design.T @ np.asarray(y_train, dtype=float))

In [141]:
# Feature weights only (the leading intercept entry is skipped), rounded to two decimals.
feature_weights = np.round(w[1:], 2)
print(list(feature_weights))

[-0.15, 0.06, -0.01, 2.49, -14.96, 3.54, 0.0, -1.59, 0.37, -0.01, -0.89, 0.01, -0.56]


In [142]:
# Manual prediction on the test rows: feature weights applied to X_test, plus the intercept.
X_test @ w[1:] + w[0]

477    11.237281
451    19.656935
29     20.772795
0      30.017838
312    23.346859
         ...    
376    17.322696
90     27.076506
78     20.429818
44     22.573933
350    20.039099
Length: 102, dtype: float64