In [58]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [122]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate

from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import SCORERS
from sklearn.metrics import r2_score

In [76]:
df = pd.read_csv("abalone.csv")

In [77]:
df.head(2)

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7


In [78]:
df.Rings.unique()

array([15,  7,  9, 10,  8, 20, 16, 19, 14, 11, 12, 18, 13,  5,  4,  6, 21,
       17, 22,  1,  3, 26, 23, 29,  2, 27, 25, 24])

In [79]:
X = df.drop(columns = 'Rings')

In [80]:
df['age'] = df['Rings']+1.5
y = df['age']

In [81]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

In [None]:
A=πab

In [111]:
import math

class AreaTransformer(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_ = X.copy()
        X_["area"] = math.pi * X_['Length'] * X_['Diameter']
        return X_[["area"]]

In [112]:
trans = ColumnTransformer(
    [('ohe', OneHotEncoder(drop = 'first'), ['Sex']),
    ('area', AreaTransformer(), ['Length', 'Diameter'])],
    remainder = 'passthrough')

In [113]:
preproc = Pipeline([
    ('encoder', trans),
    ('scaler', StandardScaler())
])

In [114]:
lin_reg = Pipeline([
    ('preproc', preproc),
    ('model', LinearRegression())
])

In [115]:
gbr = Pipeline([
    ('preproc', preproc),
    ('gbr', GradientBoostingRegressor(n_estimators=500, 
                               learning_rate=0.2,
                               max_depth=1))
])

In [116]:
rf = Pipeline([
    ('preproc', preproc),
    ('rf', RandomForestRegressor(n_estimators = 400))  
])


In [118]:
res = cross_validate(lin_reg, 
               X_train, 
               y_train, 
               cv = 5, 
               scoring = ['neg_mean_absolute_error','neg_mean_absolute_percentage_error', 'r2'])

print('abs error: ', res['test_neg_mean_absolute_error'].mean())
print('% error: ', res['test_neg_mean_absolute_percentage_error'].mean())
print('r2: ', res['test_r2'].mean())

abs error:  -1.6110005746171052
% error:  -0.14085498307289604
r2:  0.49347782903178816


In [120]:
res = cross_validate(gbr, 
               X_train, 
               y_train, 
               cv = 5, 
               scoring = ['neg_mean_absolute_error','neg_mean_absolute_percentage_error', 'r2'])

print('abs error: ', res['test_neg_mean_absolute_error'].mean())
print('% error: ', res['test_neg_mean_absolute_percentage_error'].mean())
print('r2: ', res['test_r2'].mean())

abs error:  -1.5911960828471206
% error:  -0.1364910742325986
r2:  0.530506721386295


In [121]:
res = cross_validate(rf, 
               X_train, 
               y_train, 
               cv = 5, 
               scoring = ['neg_mean_absolute_error','neg_mean_absolute_percentage_error', 'r2'])

print('abs error: ', res['test_neg_mean_absolute_error'].mean())
print('% error: ', res['test_neg_mean_absolute_percentage_error'].mean())
print('r2: ', res['test_r2'].mean())

abs error:  -1.5603701220583068
% error:  -0.13366143834958985
r2:  0.5365793019802574


In [98]:
rf.get_params()

{'memory': None,
 'steps': [('preproc', Pipeline(steps=[('encoder',
                    ColumnTransformer(remainder='passthrough',
                                      transformers=[('ohe',
                                                     OneHotEncoder(drop='first'),
                                                     ['Sex'])])),
                   ('scaler', StandardScaler())])),
  ('rf', RandomForestRegressor(n_estimators=400))],
 'verbose': False,
 'preproc': Pipeline(steps=[('encoder',
                  ColumnTransformer(remainder='passthrough',
                                    transformers=[('ohe',
                                                   OneHotEncoder(drop='first'),
                                                   ['Sex'])])),
                 ('scaler', StandardScaler())]),
 'rf': RandomForestRegressor(n_estimators=400),
 'preproc__memory': None,
 'preproc__steps': [('encoder',
   ColumnTransformer(remainder='passthrough',
                     transformers=

In [140]:
params = {'rf__max_depth': [1, 2, 3, 4, 5, 6, 7, 8],
         'rf__n_estimators': [200, 400]}

In [141]:
grid = GridSearchCV(rf, params, verbose = 3)

In [138]:
gs_res = grid.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END rf__max_depth=1, rf__min_samples_split=2, rf__n_estimators=200; total time=   0.4s
[CV] END rf__max_depth=1, rf__min_samples_split=2, rf__n_estimators=200; total time=   0.4s
[CV] END rf__max_depth=1, rf__min_samples_split=2, rf__n_estimators=200; total time=   0.4s
[CV] END rf__max_depth=1, rf__min_samples_split=2, rf__n_estimators=200; total time=   0.4s
[CV] END rf__max_depth=1, rf__min_samples_split=2, rf__n_estimators=200; total time=   0.4s
[CV] END rf__max_depth=1, rf__min_samples_split=5, rf__n_estimators=200; total time=   0.4s
[CV] END rf__max_depth=1, rf__min_samples_split=5, rf__n_estimators=200; total time=   0.4s
[CV] END rf__max_depth=1, rf__min_samples_split=5, rf__n_estimators=200; total time=   0.4s
[CV] END rf__max_depth=1, rf__min_samples_split=5, rf__n_estimators=200; total time=   0.4s
[CV] END rf__max_depth=1, rf__min_samples_split=5, rf__n_estimators=200; total time=   0.4s
[CV] END rf__max_de

In [139]:
gs_res.best_estimator_

Pipeline(steps=[('preproc',
                 Pipeline(steps=[('encoder',
                                  ColumnTransformer(remainder='passthrough',
                                                    transformers=[('ohe',
                                                                   OneHotEncoder(drop='first'),
                                                                   ['Sex']),
                                                                  ('area',
                                                                   AreaTransformer(),
                                                                   ['Length',
                                                                    'Diameter'])])),
                                 ('scaler', StandardScaler())])),
                ('rf', RandomForestRegressor(max_depth=4, n_estimators=200))])

In [57]:
preproc.fit_transform(X_train)

array([[1.        , 0.        , 0.6       , ..., 0.21222222, 0.20210665,
        0.20129547],
       [0.        , 1.        , 0.75172414, ..., 0.33555556, 0.25806452,
        0.3472845 ],
       [1.        , 0.        , 0.54482759, ..., 0.12888889, 0.11191573,
        0.13303438],
       ...,
       [1.        , 0.        , 0.71724138, ..., 0.35962963, 0.24160632,
        0.27304434],
       [0.        , 1.        , 0.89655172, ..., 0.51592593, 0.62146149,
        0.57648231],
       [0.        , 1.        , 0.68275862, ..., 0.35296296, 0.30414747,
        0.26756353]])