In [1]:
import numpy as np
import pandas as pd

import warnings
# Silence every warning for the rest of the session. This keeps cell output
# clean, but it also hides deprecation notices from the libraries used below.
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt

import seaborn as sns
# Apply seaborn's default theme to matplotlib figures.
sns.set()

# Render inline matplotlib figures as SVG.
%config InlineBackend.figure_format = 'svg'

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly
import plotly.graph_objs as go

# Enable plotly's offline notebook rendering so iplot() can draw in output cells.
# NOTE(review): connected=True presumably loads plotly.js from the CDN instead
# of embedding it in the notebook - confirm against the plotly version in use.
init_notebook_mode(connected=True)

In [2]:
# Load the abalone dataset from the gzip-compressed CSV into a DataFrame.
abalone_path = '../data/abalone.csv.gz'
df = pd.read_csv(abalone_path, compression='gzip')

In [4]:
# Peek at the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,Sex,Length,Diameter,Height,WholeWeight,ShuckedWeight,VisceraWeight,ShellWeight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [5]:
# Encode the categorical Sex column as numbers: male -> 1, infant -> 0, female -> -1.
SEX_CODES = {'M': 1, 'I': 0, 'F': -1}

# Values that are not keys of SEX_CODES (in particular the numeric codes
# themselves) are passed through unchanged, so re-running this cell is a no-op
# instead of turning the whole column into NaN.
df['Sex'] = df['Sex'].map(lambda label: SEX_CODES.get(label, label))

In [6]:
# Check the first five rows again: Sex should now hold the numeric codes.
df.head(n=5)

Unnamed: 0,Sex,Length,Diameter,Height,WholeWeight,ShuckedWeight,VisceraWeight,ShellWeight,Rings
0,1,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,-1,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,1,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,0,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [7]:
# Display the frame's column labels.
df.columns

Index(['Sex', 'Length', 'Diameter', 'Height', 'WholeWeight', 'ShuckedWeight',
       'VisceraWeight', 'ShellWeight', 'Rings'],
      dtype='object')

In [8]:
# Names of the feature columns; 'Rings' is deliberately left out of this list.
data_columns = [
    'Sex',
    'Length',
    'Diameter',
    'Height',
    'WholeWeight',
    'ShuckedWeight',
    'VisceraWeight',
    'ShellWeight',
]

In [9]:
# Split the frame into the feature matrix (X) and the 'Rings' target column (y).
target_column = 'Rings'
X, y = df[data_columns], df[target_column]

In [10]:
from sklearn.ensemble import RandomForestRegressor

In [19]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer

In [36]:
def get_tree_number(X, y, N: int=50, threshold_value: float=0.52):
    """Plot cross-validated R^2 of a random forest against its number of trees.

    For every forest size from 1 to ``N`` a ``RandomForestRegressor`` is scored
    with shuffled 5-fold cross-validation (``scoring='r2'``) and the mean of
    the fold scores is recorded. The resulting curve is drawn with plotly next
    to a horizontal reference line at ``threshold_value``.

    Parameters
    ----------
    X
        Feature matrix, forwarded to ``cross_val_score``.
    y
        Target values, forwarded to ``cross_val_score``.
    N : int, default 50
        Largest number of estimators to evaluate (inclusive).
    threshold_value : float, default 0.52
        R^2 level drawn as a reference line and used to choose the return value.

    Returns
    -------
    int or None
        The smallest number of estimators whose mean CV R^2 is at least
        ``threshold_value``, or ``None`` if no evaluated size reaches it.
    """
    # The splitter does not depend on the forest size, so build it once. Its
    # fixed integer seed yields the same folds for every forest size.
    cv_folder = KFold(n_splits=5, shuffle=True, random_state=1)

    tree_counts = list(range(1, N + 1))
    scores = []
    for n_trees in tree_counts:
        rfr = RandomForestRegressor(random_state=1, n_estimators=n_trees, n_jobs=-1)
        fold_scores = cross_val_score(rfr, X, y, cv=cv_folder, scoring='r2')
        scores.append(np.mean(fold_scores))

    trace = go.Scatter(x=tree_counts, y=scores, name='CV r2 score')
    threshold = go.Scatter(
        x=tree_counts,
        y=[threshold_value] * len(tree_counts),
        name='Threshold',
    )
    layout = {'title': 'How number of estimators affect the model quality'}

    fig = go.Figure(data=[trace, threshold], layout=layout)
    iplot(fig, show_link=False)

    # Report the first forest size that reaches the threshold so the answer
    # does not have to be read off the plot by hand.
    return next(
        (n_trees for n_trees, score in zip(tree_counts, scores) if score >= threshold_value),
        None,
    )

In [37]:
# Evaluate forest sizes 1..50 and plot the CV r2 curve against the 0.52 threshold.
get_tree_number(X, y, N=50, threshold_value=0.52)

In [33]:
# Summary of the experiment above.
# NOTE(review): N and the r2 value are hard-coded in this literal, so they must
# be updated by hand whenever the data or the model settings change.
summary = 'When N = 21 r2 on CV is equal 0.5205...'
print(summary)

When N = 21 r2 on CV is equal 0.5205...
