In [129]:
import pandas as pd
import numpy as np

In [130]:
from ucimlrepo import fetch_ucirepo 
abalone = fetch_ucirepo('abalone') 
  
X = abalone.data.features 
y = abalone.data.targets['Rings']

In [131]:
print(abalone.variables) 

             name     role         type demographic  \
0             Sex  Feature  Categorical        None   
1          Length  Feature   Continuous        None   
2        Diameter  Feature   Continuous        None   
3          Height  Feature   Continuous        None   
4    Whole_weight  Feature   Continuous        None   
5  Shucked_weight  Feature   Continuous        None   
6  Viscera_weight  Feature   Continuous        None   
7    Shell_weight  Feature   Continuous        None   
8           Rings   Target      Integer        None   

                   description  units missing_values  
0         M, F, and I (infant)   None             no  
1    Longest shell measurement     mm             no  
2      perpendicular to length     mm             no  
3           with meat in shell     mm             no  
4                whole abalone  grams             no  
5               weight of meat  grams             no  
6  gut weight (after bleeding)  grams             no  
7        

In [132]:
X = pd.get_dummies(X, columns=['Sex'], dtype=int)

In [133]:
X.corrwith(y).sort_values(ascending=False)

Shell_weight      0.627574
Diameter          0.574660
Height            0.557467
Length            0.556720
Whole_weight      0.540390
Viscera_weight    0.503819
Shucked_weight    0.420884
Sex_F             0.250279
Sex_M             0.181831
Sex_I            -0.436063
dtype: float64

In [134]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.6, random_state=42)

In [135]:
X_transpose = X_train.T

beta = np.linalg.inv(X_transpose.dot(X_train)).dot(X_transpose).dot(y_train)
beta

array([  0.47373389,   9.02758549,   6.5192326 ,   9.30706093,
       -21.20708955,  -8.11540232,   9.90946874,   4.42792244,
         3.62973126,   4.40715424])

In [136]:
def calculate_rmse(beta, X_test, y_test):
    total_error = 0
    
    for val in range(len(X_test)):
        predicted = np.dot(beta, X_test.iloc[val])
        difference = y_test.iloc[val]-predicted
        total_error+= difference*difference

    
    return (np.sqrt(total_error/len(X_test)))

In [137]:
calculate_rmse(beta, X_test=X_test, y_test=y_test)

2.237161211882194

In [139]:
from sklearn.decomposition import PCA

cumulative = []

for pc in range(len(X.columns)):
    pca = PCA(n_components=pc)
    pca.fit_transform(X)
    cumulative.append(np.sum(pca.explained_variance_ratio_))

cumulative = np.array(cumulative)

In [123]:
no_components = np.where(cumulative>=0.95)[0][0]
pca = PCA(n_components=no_components)
pca_fit = pca.fit_transform(X)
X_pca = pd.DataFrame(pca_fit)

In [124]:
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.6, random_state=42)

In [125]:
X_transpose = X_train.T

beta = np.linalg.inv(X_transpose.dot(X_train)).dot(X_transpose).dot(y_train)
beta

array([2.83742893, 0.31653123, 1.11039295])

In [126]:
calculate_rmse(beta, X_test=X_test, y_test=y_test)

10.269666646572894