In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures

## Data

In [17]:
fish_data = pd.read_csv('./data/Fish.csv')

fish_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Species  159 non-null    object 
 1   Weight   159 non-null    float64
 2   Length1  159 non-null    float64
 3   Length2  159 non-null    float64
 4   Length3  159 non-null    float64
 5   Height   159 non-null    float64
 6   Width    159 non-null    float64
dtypes: float64(6), object(1)
memory usage: 8.8+ KB


In [18]:
index = fish_data[ fish_data['Weight'] == 0.0].index
fish_data.drop(index, inplace = True)

fish_data.describe()

Unnamed: 0,Weight,Length1,Length2,Length3,Height,Width
count,158.0,158.0,158.0,158.0,158.0,158.0
mean,400.847468,26.293038,28.465823,31.28038,8.98679,4.424232
std,357.697796,10.011427,10.731707,11.627605,4.295191,1.68901
min,5.9,7.5,8.4,8.8,1.7284,1.0476
25%,121.25,19.15,21.0,23.2,5.9406,3.39865
50%,281.5,25.3,27.4,29.7,7.789,4.27705
75%,650.0,32.7,35.75,39.675,12.37185,5.58675
max,1650.0,59.0,63.4,68.0,18.957,8.142


In [19]:
# Separate Target from features
y_fish = fish_data.iloc[:, 1]
X_fish = fish_data.drop(columns=['Weight'])
X_fish = X_fish.drop(columns=['Species'])

# Convert into numpy arrays
X_fish = X_fish.values
y_fish = y_fish.values

# Split into Training and Test Groups
X_fish_train, X_fish_test, y_fish_train, y_fish_test = train_test_split(X_fish, y_fish, test_size = 0.2, random_state = 0)

## Training

In [20]:
regressor = make_pipeline(PolynomialFeatures(3), Ridge(alpha = 0))

regressor.fit(X_fish_train, y_fish_train)

## Evaluation

In [21]:
y_fish_pol_pred = regressor.predict(X_fish_test)

pred_pol_compare = pd.DataFrame()
pred_pol_compare['Prediction'] = y_fish_pol_pred.tolist()
pred_pol_compare['Actual'] = y_fish_test.tolist()

print(pred_pol_compare)

     Prediction  Actual
0    352.133789   390.0
1    180.712891   160.0
2    727.618408   700.0
3    904.970459  1015.0
4    115.989014   120.0
5   1025.142334  1100.0
6    873.562256   820.0
7   1043.728760   950.0
8    613.486572   556.0
9    136.389160   145.0
10   695.691406   700.0
11  1546.969238  1600.0
12   719.538330   720.0
13    32.837891    55.0
14    60.150146    85.0
15   195.705566   188.0
16   274.623779   300.0
17   182.480957   180.0
18  1546.969238  1550.0
19   314.674316   306.0
20   138.439697   140.0
21   986.180664   975.0
22   890.231201  1000.0
23   473.557373   450.0
24   109.229980   110.0
25    67.157471    78.0
26   322.981445   300.0
27   550.262451   650.0
28     6.771973     6.7
29   457.012451   514.0
30   247.938232   290.0
31   253.839600   270.0


In [22]:
mse_pol = mean_squared_error(y_fish_test, y_fish_pol_pred)
r2s_pol = r2_score(y_fish_test, y_fish_pol_pred)

print("MSE: %s | R2S: %s" % (mse_pol, r2s_pol))

MSE: 2147.555587427765 | R2S: 0.9880593007953454
