In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold
from numpy.linalg import inv
from sklearn.metrics import mean_squared_error

In [2]:
dataset_file = 'BostonDataset.csv'

In [3]:
## load the dataset into a pandas dataframe
data_frame = pd.read_csv(dataset_file)  

In [4]:
data_frame.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,LSTAT,MEDV
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,11.36,21.2
75%,3.677082,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,37.97,50.0


In [5]:
print(data_frame.columns)

Index(['CRIM', ' ZN ', 'INDUS ', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'LSTAT', 'MEDV'],
      dtype='object')


In [6]:
# convert dataframe to np array
data_array = data_frame.to_numpy()
print(data_array.shape)

(506, 13)


In [7]:
## define linear regression
def linear_regression_fit(X,Y):
    return inv(X.T@X)@X.T@Y

In [8]:
def linear_regression_with_bias_fit(X,Y):
    return (inv(X.T@X)@X.T@Y).T

In [9]:
# number of runs for each experiment (defined on the assignement paper)
N = 20
features_number = 13
test_split = round(((len(data_array)/3)/len(data_array)*100)/100,2)
print("Using " + str(test_split*100) + "% of the dataset dor testing and the rest " + str(round((1 - test_split),2)*100) + "% for the training")

Using 33.0% of the dataset dor testing and the rest 67.0% for the training


In [10]:
# (a) Predicting with the mean y-value on the training set
# question 1.2.a
for i in range(0,N):
    
    # split data on train and test set
    # suffle = True by default in this sklearn function
    train_data, test_data = train_test_split(data_array,test_size=test_split)

    if(i==0):
        ## initialize on the first round
        print("Initialization!")
        ## initialize x vectors with ones - fit the data with constant function
        x_train = np.asmatrix(np.ones(len(train_data))).T
        x_test = np.asmatrix(np.ones(len(test_data))).T
        y_train = train_data[:,(features_number-1)]
        y_test = test_data[:,(features_number-1)]
        mse_train = np.zeros(N)
        mse_test = np.zeros(N)
    
    w_train = linear_regression_fit(x_train,y_train)
    w_test = linear_regression_fit(x_test,y_test)
    
    mse_train[i]= mean_squared_error(x_train*w_train, y_train)
    mse_test[i]= mean_squared_error(x_test*w_train, y_test)
    
print("Average MSE for train and test set are: " + str(np.mean(mse_train)) + " and " + str(np.mean(mse_test)) + " respectively")

Initialization!
Average MSE for train and test set are: 77.03490275928681 and 99.71760814810197 respectively


In [11]:
## (b) Predicting with a single attribute and a bias term.
mse_train = np.zeros((N,(features_number-1)))
mse_test = np.zeros((N,(features_number-1)))

for i in range(0,N):
    
    train_data, test_data = train_test_split(data_array,test_size=test_split)
        
    for j in range(0,(features_number-1)):
        
      
        y_train = train_data[:,(features_number-1)]
        y_test = test_data[:,(features_number-1)]
        
        ## select one feature on every loop
        ## and one extra dimension with 1 values on the x
        x_train = np.asmatrix(np.vstack((train_data[:,j],np.ones(len(train_data))))).T
        x_test = np.asmatrix(np.vstack((test_data [:,j],np.ones(len(test_data))))).T
        
        w_train = linear_regression_with_bias_fit(x_train,y_train)
        w_test = linear_regression_with_bias_fit(x_test,y_test)
       
        mse_train[i,j]= mean_squared_error(x_train@w_train, y_train)
        mse_test[i,j]= mean_squared_error(x_test@w_test, y_test)
        
print("Linear Regression with single Features - Results")   
for i in range(len(mse_train.T)):
    print("Linear regression using attribute: " + str(data_frame.columns[i]) + " MSE train: " + str(np.mean((np.sum(mse_train,axis = 0)/N).T[i])) +" MSE test:" + str(np.mean((np.sum(mse_test,axis = 0 )/N).T[i])))

Linear Regression with single Features - Results
Linear regression using attribute: CRIM MSE train: 73.28153218461637 MSE test:65.65195838885228
Linear regression using attribute:  ZN  MSE train: 75.22689431904232 MSE test:68.96448136724034
Linear regression using attribute: INDUS  MSE train: 67.12326214391811 MSE test:59.01029698892465
Linear regression using attribute: CHAS MSE train: 84.00658810550773 MSE test:76.21965921795861
Linear regression using attribute: NOX MSE train: 71.74644738326911 MSE test:62.632150499587
Linear regression using attribute: RM MSE train: 44.91671265470207 MSE test:39.72961122113274
Linear regression using attribute: AGE MSE train: 75.4777590707787 MSE test:65.28953503879058
Linear regression using attribute: DIS MSE train: 82.05386912259577 MSE test:72.33358761183362
Linear regression using attribute: RAD MSE train: 74.34580712493519 MSE test:66.81485613779624
Linear regression using attribute: TAX MSE train: 68.27657730932647 MSE test:60.1512311019966


In [12]:
## Predicting with all the attributes
for i in range(N):
    
    mse_train = np.zeros(N)
    mse_test  = np.zeros(N)
    
    train_data, test_data = train_test_split(data_array,test_size=test_split)
     
    ## use all the features
    y_train = train_data[:,(features_number-1)]
    y_test = test_data[:,(features_number-1)]
    
    ## add one dimension with one to x data
    x_train = np.asmatrix(np.c_[(train_data[:,range(features_number)],np.ones(len(train_data)))])
    x_test = np.asmatrix(np.c_[(test_data[:,range(features_number)] ,np.ones(len(test_data )))])
        
    w_train = linear_regression_with_bias_fit(x_train,y_train)
    w_test = linear_regression_with_bias_fit(x_test,y_test)
        
    mse_train[i]= mean_squared_error(x_train@w_train, y_train)
    mse_test [i]= mean_squared_error(x_test@w_test, y_test)
        
print("Linear regression using all the features with MSE train: " + str(np.sum(mse_train)/N) + " MSE test: " +  str(np.sum(mse_test )/N))

Linear regression using all the features with MSE train: 1.4084329525873831e-24 MSE test: 5.171647149926117e-25
