In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold

In [16]:
from sklearn.preprocessing import StandardScaler

abalone = pd.read_csv("../resource/asnlib/publicdata/abalone.csv")

# Ordinary least squares on the numeric measurements only: `Sex` is left out
# of the predictors and `Rings` is the regression target.
features = ['Length','Diameter','Height','Whole weight','Shucked weight','Viscera weight','Shell weight']

# turning table into matrices
X = abalone[features].to_numpy()
Y = abalone[['Rings']].to_numpy()  # double brackets keep Y as a 2-D single-column array

# standardizing features
# NOTE(review): the scaler is fitted on every row before the folds are split,
# so held-out rows contribute to the scaling statistics -- confirm this is
# what the assignment intends.
scaler = StandardScaler()
X_stand = scaler.fit_transform(X)

# adding constant: the weight on this all-ones column acts as the intercept
ones = np.ones((X_stand.shape[0], 1))
X_stand = np.hstack([X_stand, ones])

# perform linear regression with cross-validation
kf = KFold(n_splits=5, shuffle=False)
sq_err_total = 0.0  # squared error summed over every held-out row
n_scored = 0        # number of held-out rows scored so far
for train_index, test_index in kf.split(X_stand):
    # split X_stand and Y into testing and training data
    X_train, X_test = X_stand[train_index], X_stand[test_index]
    y_train, y_test = Y[train_index], Y[test_index]

    # train the model: solve the normal equations (X'X) w = X'y directly
    # instead of forming the explicit inverse of X'X
    w = np.linalg.solve(X_train.T @ X_train, X_train.T @ y_train)

    # generate predictions for the test set
    yhat = X_test @ w

    # accumulate squared error row by row; pooling rows (rather than averaging
    # per-fold means) keeps the score exact when the folds differ in size
    error = y_test - yhat
    sq_err_total += float((error * error).sum())
    n_scored += error.size

# RMSE over all held-out points
ols_error = np.sqrt(sq_err_total / n_scored)

# PRINTING RMSE
print(ols_error)

2.331168516746062


In [17]:
# StandardScaler is imported here as well so this cell does not depend on an
# earlier cell having been run first.
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# one hot encoding for Sex: indicator columns for 'I' and 'F', then the raw
# `Sex` column is dropped
abalone = (
    pd.read_csv("../resource/asnlib/publicdata/abalone.csv")
    .assign(
        Sex_I=lambda df: (df['Sex'] == 'I').astype(int),
        Sex_F=lambda df: (df['Sex'] == 'F').astype(int),
    )
    .drop(columns=['Sex'])
)

# choosing features
numeric_cols = ['Length','Diameter','Height','Whole weight','Shucked weight','Viscera weight','Shell weight']

# turning features into matrices
X = abalone[numeric_cols].to_numpy()
Y = abalone[['Rings']].to_numpy()  # double brackets keep Y as a 2-D single-column array
# one-hot encoding without Sex_M: rows with both indicators at 0 are the
# remaining level(s) of Sex, which the constant column then absorbs
onehot = abalone[['Sex_F', 'Sex_I']].to_numpy()

# standardizing features (excluding Sex)
# NOTE(review): the scaler is fitted on every row before the folds are split,
# so held-out rows contribute to the scaling statistics -- confirm this is
# what the assignment intends.
scaler = StandardScaler()
X_stand = scaler.fit_transform(X)

# add one hot encoding for Sex to X_stand matrix
X_stand = np.hstack([X_stand, onehot])

# adding constant: the weight on this all-ones column acts as the intercept
ones = np.ones((X_stand.shape[0], 1))
X_stand = np.hstack([X_stand, ones])

# performing linear regression with cross validation
kf = KFold(n_splits=5, shuffle=False)
sq_err_total = 0.0  # squared error summed over every held-out row
n_scored = 0        # number of held-out rows scored so far
for train_index, test_index in kf.split(X_stand):
    # split X_stand and Y into testing and training data
    X_train, X_test = X_stand[train_index], X_stand[test_index]
    y_train, y_test = Y[train_index], Y[test_index]

    # train the model: solve the normal equations (X'X) w = X'y directly
    # instead of forming the explicit inverse of X'X
    w = np.linalg.solve(X_train.T @ X_train, X_train.T @ y_train)

    # generate predictions for the test set
    yhat = X_test @ w

    # accumulate squared error row by row; pooling rows (rather than averaging
    # per-fold means) keeps the score exact when the folds differ in size
    error = y_test - yhat
    sq_err_total += float((error * error).sum())
    n_scored += error.size

# RMSE over all held-out points
onehot_error = np.sqrt(sq_err_total / n_scored)

# PRINTING RMSE
print(onehot_error)


2.295563430253222


In [6]:
# we evaulate `onehot_error` here

###
### AUTOGRADER TEST - DO NOT REMOVE
###


In [18]:
# PART D. in this cell, your RMSE over all points should be stored in a variable called `ridge_error`

lambda_weight = 1e2

###
### YOUR CODE HERE
###

# imported here so this cell does not depend on an earlier cell having been run
from sklearn.preprocessing import StandardScaler

# one hot encoding for Sex: one indicator column per level ('M', 'F', 'I'),
# then the raw `Sex` column is dropped
abalone = (
    pd.read_csv("../resource/asnlib/publicdata/abalone.csv")
    .assign(
        Sex_M=lambda df: (df['Sex'] == 'M').astype(int),
        Sex_F=lambda df: (df['Sex'] == 'F').astype(int),
        Sex_I=lambda df: (df['Sex'] == 'I').astype(int),
    )
    .drop(columns=['Sex'])
)

# choosing features
numeric_cols = ['Length','Diameter','Height','Whole weight','Shucked weight','Viscera weight','Shell weight']

# turning features into matrices
X = abalone[numeric_cols].to_numpy()
Y = abalone[['Rings']].to_numpy()  # double brackets keep Y as a 2-D single-column array
# one-hot encoding for Sex (all three indicator columns are kept here)
onehot = abalone[['Sex_M', 'Sex_F', 'Sex_I']].to_numpy()

# standardizing features (excluding Sex)
# NOTE(review): the scaler is fitted on every row before the folds are split,
# so held-out rows contribute to the scaling statistics -- confirm this is
# what the assignment intends.
scaler = StandardScaler()
X_stand = scaler.fit_transform(X)

# add one hot encoding for Sex to X_stand matrix
X_stand = np.hstack([X_stand, onehot])

# adding constant: the weight on this all-ones column acts as the intercept
ones = np.ones((X_stand.shape[0], 1))
X_stand = np.hstack([X_stand, ones])

# preparing identity matrix I for ridge regression, sized from the design
# matrix (7 numeric + 3 indicator + 1 constant = 11 columns) rather than
# hard-coded. Because I is the full identity, the constant column's weight is
# penalized along with all the others.
I = np.identity(X_stand.shape[1])

# performing ridge regression with cross validation
kf = KFold(n_splits=5, shuffle=False)
sq_err_total = 0.0  # squared error summed over every held-out row
n_scored = 0        # number of held-out rows scored so far
for train_index, test_index in kf.split(X_stand):
    # split X_stand and Y into testing and training data
    X_train, X_test = X_stand[train_index], X_stand[test_index]
    y_train, y_test = Y[train_index], Y[test_index]

    # train the model: solve (X'X + lambda*I) w = X'y directly instead of
    # forming the explicit inverse of the regularized Gram matrix
    gram_reg = X_train.T @ X_train + lambda_weight * I
    w = np.linalg.solve(gram_reg, X_train.T @ y_train)

    # generate predictions for the test set
    yhat = X_test @ w

    # accumulate squared error row by row; pooling rows (rather than averaging
    # per-fold means) keeps the score exact when the folds differ in size
    error = y_test - yhat
    sq_err_total += float((error * error).sum())
    n_scored += error.size

# RMSE over all held-out points
ridge_error = np.sqrt(sq_err_total / n_scored)

# PRINTING RMSE
print(ridge_error)

2.3597584332401618


In [53]:
# we evaluate `ridge_error` here

###
### AUTOGRADER TEST - DO NOT REMOVE
###
