In [1]:
# Standard includes
%matplotlib inline
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
# Routines for linear regression
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
# Default tick-label size (both axes) for plots drawn after this cell
matplotlib.rcParams.update({'xtick.labelsize': 14, 'ytick.labelsize': 14})

In [2]:
# Names for the ten predictor columns (presumably in file order -- confirm).
features = [
    'age', 'sex', 'body mass index', 'blood pressure',
    'serum1', 'serum2', 'serum3', 'serum4', 'serum5', 'serum6',
]
# Parse the comma-separated file into a 2-D array.
data = np.genfromtxt('diabetes-data.csv', delimiter=',')
x = data[:, :len(features)]  # predictors: the first ten columns
y = data[:, len(features)]   # response variable: the column right after them


In [3]:
def split_data(X, Y, tr_no, seed=None):
    """Randomly partition paired arrays into a training set and a test set.

    One shared random permutation is applied to both ``X`` and ``Y``, so
    every predictor row stays paired with its response.

    Parameters
    ----------
    X : ndarray
        Predictors, one sample per entry along the first axis.
    Y : ndarray
        Responses; must have the same length as ``X``.
    tr_no : int
        Number of samples placed in the training set. The remaining
        ``len(X) - tr_no`` samples form the test set.
    seed : int, optional
        When given, the permutation is drawn from
        ``np.random.default_rng(seed)`` so the split is reproducible.
        When omitted, NumPy's global random state is used (the original
        behaviour).

    Returns
    -------
    train_X, train_Y, test_X, test_Y
        The training predictors/responses followed by the test
        predictors/responses.

    Raises
    ------
    ValueError
        If ``X`` and ``Y`` do not have the same length.
    """
    n_samples = len(X)
    if len(Y) != n_samples:
        # A length mismatch would otherwise either fail later with an opaque
        # IndexError or silently ignore the trailing entries of Y.
        raise ValueError(
            "X and Y must have the same length, got %d and %d"
            % (n_samples, len(Y))
        )

    if seed is None:
        order = np.random.permutation(n_samples)
    else:
        order = np.random.default_rng(seed).permutation(n_samples)

    train_idx, test_idx = order[:tr_no], order[tr_no:]
    return X[train_idx], Y[train_idx], X[test_idx], Y[test_idx]
    





In [9]:
# Hold out everything beyond the first 400 shuffled samples as the test set.
train_X, train_Y, test_X, test_Y = split_data(X=x, Y=y, tr_no=400)

In [10]:
# Inspect the held-out responses produced by the split.
test_Y

array([220.,  75., 177., 275., 141.,  88., 248., 146., 233., 281., 155.,
       109., 220.,  63.,  90.,  70.,  25., 341.,  52.,  61.,  52., 113.,
       150., 225., 259.,  91., 303., 219.,  80., 122., 288.,  64., 257.,
       164.,  72., 122., 174., 160., 248., 185.,  64., 241.])

In [5]:
# Ordinary least-squares fit of train_Y on train_X. Run this after the split
# cell so theta is fit on the same split that is evaluated afterwards.
# lstsq replaces the explicit inverse of X^T X: same solution (up to
# floating-point rounding) when train_X has full column rank, but numerically
# more stable, and it does not fail on a singular normal matrix.
# NOTE(review): no intercept column is added, so the fit passes through the
# origin -- confirm this is intended.
theta = np.linalg.lstsq(train_X, train_Y, rcond=None)[0]

In [6]:
# Fitted coefficients, one per column of x.
theta

array([ 2.22964299e-02, -2.60727886e+01,  5.35372592e+00,  1.01779705e+00,
        1.26358591e+00, -1.28493621e+00, -3.06827817e+00, -5.50804168e+00,
        5.50338146e+00,  1.23385180e-01])

In [11]:
# Held-out responses again, for comparison with the predictions shown below.
test_Y

array([220.,  75., 177., 275., 141.,  88., 248., 146., 233., 281., 155.,
       109., 220.,  63.,  90.,  70.,  25., 341.,  52.,  61.,  52., 113.,
       150., 225., 259.,  91., 303., 219.,  80., 122., 288.,  64., 257.,
       164.,  72., 122., 174., 160., 248., 185.,  64., 241.])

In [13]:
# Linear predictions for the held-out rows, rounded to whole numbers.
# NOTE(review): the rounded values feed the MSE below -- confirm rounding
# before scoring is intended.
pred = (test_X @ theta).round()

In [14]:
# Rounded predictions for the held-out rows.
pred

array([192.,  76., 122., 199., 173.,  97., 200., 150., 245., 233., 215.,
       207., 209.,  66.,  77., 184., 149., 241.,  75., 118., 204., 109.,
       141., 222., 152., 166., 267., 136.,  96., 169., 213.,  97., 178.,
       186.,  96., 184., 173., 121., 210., 154., 105., 211.])

In [15]:
# Mean squared error of the (rounded) predictions on the held-out responses.
mse = np.square(test_Y - pred).sum() / test_Y.shape[0]

In [16]:
# Test-set mean squared error for the single split above.
mse

3486.0238095238096

In [22]:
# Training-set sizes to try: 50, 100, ..., 350 (the stop value 400 is excluded).
n_train = np.arange(50, 400, step=50)

In [24]:


# For each training-set size: draw a fresh random split, refit, and print the
# test MSE (one value per entry of n_train, in order).
# Note: this overwrites the notebook-level train_X/train_Y/test_X/test_Y,
# theta, pred and mse left by the earlier cells.
for n in n_train:
    train_X, train_Y, test_X, test_Y = split_data(x, y, n)
    # lstsq instead of an explicit inverse of X^T X: same solution (up to
    # floating-point rounding) when train_X has full column rank, but
    # numerically more stable, and it does not fail when X^T X is singular.
    theta = np.linalg.lstsq(train_X, train_Y, rcond=None)[0]
    pred = np.around(test_X @ theta, decimals = 0)
    mse =  np.sum((test_Y - pred)**2)/len(test_Y)
    
    print(mse)
    

4342.752551020408
3529.751461988304
3201.9794520547944
3579.061983471074
3066.515625
2602.5704225352115
2905.2934782608695


In [23]:
# Training-set sizes used by the loop over n_train.
n_train

array([ 50, 100, 150, 200, 250, 300, 350])