# Reading in Polars and Data

In [1]:
import polars as pl
pl.Config.with_columns_kwargs = True

In [2]:
batting = pl.read_csv('Data/lahman baseball data/Batting.csv')
pitching = pl.read_csv('Data/lahman baseball data/Pitching.csv')
fielding = pl.read_csv('Data/lahman baseball data/Fielding.csv')
awards = pl.read_csv('Data/lahman baseball data/AwardsPlayers.csv')
salaries = pl.read_csv('Data/lahman baseball data/Salaries.csv')
salaries

yearID,teamID,lgID,playerID,salary
i64,str,str,str,i64
1985,"""ATL""","""NL""","""barkele01""",870000
1985,"""ATL""","""NL""","""bedrost01""",550000
1985,"""ATL""","""NL""","""benedbr01""",545000
1985,"""ATL""","""NL""","""campri01""",633333
1985,"""ATL""","""NL""","""ceronri01""",625000
…,…,…,…,…
2016,"""WAS""","""NL""","""strasst01""",10400000
2016,"""WAS""","""NL""","""taylomi02""",524000
2016,"""WAS""","""NL""","""treinbl01""",524900
2016,"""WAS""","""NL""","""werthja01""",21733615


# Gold Glove Data Set

In [7]:
'case is player, year, pos, league'

'case is player, year, pos, league'

In [3]:
fielding_awards = (fielding
.filter((pl.col('yearID') >= 2013))
.join(
     awards.rename({"notes": "POS"}) 
           .select(['playerID', 'yearID', 'awardID', 'lgID', 'POS'])  
           .filter((pl.col('awardID') == 'Gold Glove') & (pl.col('yearID') >= 2013)),
      on=['playerID', 'yearID', 'lgID'],
      how='left'
     )
.group_by(['playerID','yearID', 'lgID', 'POS', 'awardID'])
.agg(
     pl.col('InnOuts').sum().alias('InnOuts'),
     pl.col('PO').sum().alias('PO'),
     pl.col('A').sum().alias('A'),
     pl.col('E').sum().alias('E'),
     pl.col('DP').sum().alias('DP'),
     pl.col('PB').cast(int).sum().alias('PB'),
     pl.col('WP').cast(int).sum().alias('WP'),
     pl.col('SB').cast(int).sum().alias('SB'),
     pl.col('CS').cast(int).sum().alias('CS'),
     pl.col('ZR').cast(int).sum().alias('ZR') 
    )    
.with_columns(
        pl.when(pl.col('awardID').is_null())
          .then(pl.lit('No'))
          .otherwise(pl.lit('Yes'))
          .alias('Gold Glove?'),
         pl.when(pl.col('yearID') == 2023)
          .then(pl.lit('Validation'))
          .otherwise(pl.lit('Training'))
          .alias('Training-Validation')
              )
.drop(pl.col('awardID'))
                  )
# fielding_awards = fielding_awards.filter(pl.col('Gold Glove?') == 'Yes', pl.col('POS') == 'C', pl.col('yearID') == 2017)
fielding_awards.write_csv('data/fielding_awards.csv')

fielding_awards

playerID,yearID,lgID,POS,InnOuts,PO,A,E,DP,PB,WP,SB,CS,ZR,Gold Glove?,Training-Validation
str,i64,str,str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,str,str
"""smolija01""",2015,"""AL""","""OF""",1307,89,5,0,1,0,0,0,0,0,"""No""","""Training"""
"""cahiltr01""",2013,"""NL""","""P""",440,4,36,0,3,0,0,0,0,0,"""No""","""Training"""
"""bassan01""",2019,"""AL""","""P""",144,5,6,0,1,0,0,0,0,0,"""No""","""Training"""
"""brantro01""",2017,"""AL""","""C""",126,23,1,0,0,0,0,4,1,0,"""No""","""Training"""
"""mossbr01""",2017,"""AL""","""OF""",129,5,0,0,0,0,0,0,0,0,"""No""","""Training"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""blanche01""",2013,"""AL""","""C""",1104,331,17,3,4,10,0,13,6,0,"""No""","""Training"""
"""herreke01""",2017,"""AL""","""P""",178,2,10,0,0,0,0,0,0,0,"""No""","""Training"""
"""guduare01""",2018,"""AL""","""P""",10,0,0,0,0,0,0,0,0,0,"""No""","""Training"""
"""difowi01""",2016,"""NL""","""2B""",195,6,19,1,1,0,0,0,0,0,"""No""","""Training"""


# Salary Dataset

In [4]:
'case is player year' 'salary, batting, fielding, pitching'

'case is player yearsalary, batting, fielding, pitching'

In [5]:
salaries_final = (salaries
.filter(pl.col('yearID') >= 2006)
.join(fielding
        .filter(pl.col('yearID') >= 2006),
        on=['playerID', 'yearID', 'lgID', 'teamID'],
        how='left'
     )
.join(batting
        .rename({"CS": "CS_batting", "SB": "SB_Batting"})
        .filter(pl.col('yearID') >= 2006),
        on=['playerID', 'yearID', 'teamID', 'stint', 'lgID', 'G'],
        how='left',
     )
.join(pitching
        .rename({
            "GIDP": "GIDP_pitching", "H": "H_pitching", "HR": "HR_pitching", "BB": "BB_pitching",
            "SO": "SO_pitching", "IBB": "IBB_pitching", "WP": "WP_pitching", "HBP": "HBP_pitching",
            "SH": "SH_pitching", "SF": "SF_pitching", "R": "R_pitching", "GIDP": "GIDP_pitching"
                })
        .filter(pl.col('yearID') >= 2006),
        on=['playerID', 'yearID', 'teamID', 'stint', 'lgID', 'G', 'GS'],
        how='left'
     )
.group_by(['playerID','yearID'])
.agg(
     pl.col('salary').sum().alias('salary'),
     pl.col('InnOuts').sum().alias('InnOuts'),
     pl.col('PO').sum().alias('PO'),
     pl.col('A').sum().alias('A'),
     pl.col('E').sum().alias('E'),
     pl.col('DP').sum().alias('DP'),
     pl.col('PB').cast(int).sum().alias('PB'),
     pl.col('WP').cast(int).sum().alias('WP'),
     pl.col('SB').cast(int).sum().alias('SB'),
     pl.col('CS').cast(int).sum().alias('CS'),
     pl.col('ZR').cast(int).sum().alias('ZR'),
     pl.col('G').sum().alias('G'),
     pl.col('AB').sum().alias('AB'),
     pl.col('R').sum().alias('R'),
     pl.col('H').sum().alias('H'),
     pl.col('2B').sum().alias('2B'),
     pl.col('3B').sum().alias('3B'),
     pl.col('HR').sum().alias('HR'),
     pl.col('SB_Batting').sum().alias('SB_batting'),
     pl.col('CS_batting').sum().alias('CS_batting'),
     pl.col('BB').sum().alias('BB_batting'),
     pl.col('SO').sum().alias('SO_batting'),
     pl.col('IBB').sum().alias('IBB_batting'),
     pl.col('HBP').sum().alias('HBP_batting'),
     pl.col('SH').sum().alias('Sacrifice Hits'),
     pl.col('SF').sum().alias('Sacrifice Flies'),
     pl.col('GIDP').sum().alias('GIDP'),
     pl.col('GS').sum().alias('GS'),
     pl.col('CG').sum().alias('CG'),
     pl.col('SHO').sum().alias('SHO'),
     pl.col('SV').sum().alias('SV'),
     pl.col('IPouts').sum().alias('IPOuts'),
     pl.col('H_pitching').sum().alias('H_pitching'),
     pl.col('ER').sum().alias('ER_pitching'),
     pl.col('HR_pitching').sum().alias('HR_pitching'),
     pl.col('BB_pitching').sum().alias('BB_pitching'),
     pl.col('SO_pitching').sum().alias('SO_pitching'),
     pl.col('BAOpp').sum().alias('BAOpp'),
     pl.col('ERA').sum().alias('ERA_pitching'),
     pl.col('IBB_pitching').sum().alias('IBB_pitching'),
     pl.col('WP_pitching').sum().alias('WP_pitching'),
     pl.col('HBP_pitching').sum().alias('HBP_pitching'),
     pl.col('BK').sum().alias('Balk'),
     pl.col('BFP').sum().alias('Batters Faced by Pitcher'),
     pl.col('GF').sum().alias('Games Finished'),
     pl.col('SH').sum().alias('SH_pitching'),
     pl.col('SF').sum().alias('SF_pitching'),
     pl.col('GIDP_pitching').sum().alias('GIDP_pitching')
    )   
.with_columns(
         pl.when(pl.col('yearID') == 2016)
          .then(pl.lit('Validation'))
          .otherwise(pl.lit('Training'))
          .alias('Training-Validation')
               )
)
salaries_final.write_csv('data/salaries_final.csv')
salaries_final

playerID,yearID,salary,InnOuts,PO,A,E,DP,PB,WP,SB,CS,ZR,G,AB,R,H,2B,3B,HR,SB_batting,CS_batting,BB_batting,SO_batting,IBB_batting,HBP_batting,Sacrifice Hits,Sacrifice Flies,GIDP,GS,CG,SHO,SV,IPOuts,H_pitching,ER_pitching,HR_pitching,BB_pitching,SO_pitching,BAOpp,ERA_pitching,IBB_pitching,WP_pitching,HBP_pitching,Balk,Batters Faced by Pitcher,Games Finished,SH_pitching,SF_pitching,GIDP_pitching,Training-Validation
str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,str
"""sloweke01""",2010,470000,467,11,12,2,0,0,0,0,0,0,30,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,28,0,0,0,467,172,77,21,29,116,0.28,4.45,0,3,4,0,662,0,0,0,7,"""Training"""
"""callaal01""",2009,1246500,4023,245,392,17,98,0,0,0,0,0,161,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,153,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,"""Training"""
"""perezra01""",2010,795000,183,3,10,1,0,0,0,0,0,0,70,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,183,72,22,3,25,36,0.3,3.25,4,6,0,0,272,13,0,0,11,"""Training"""
"""pettian01""",2009,5500000,584,3,27,2,1,0,0,0,0,0,32,5,1,1,1,0,0,0,0,0,2,0,0,1,0,0,32,0,0,0,584,193,90,20,76,148,0.259,4.16,1,3,4,0,834,0,1,0,16,"""Training"""
"""braunry02""",2009,1032500,4092,304,8,2,2,0,0,0,0,0,158,635,113,203,39,6,32,20,6,57,121,1,13,0,3,6,156,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,0,3,0,"""Training"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""wilheto01""",2012,482900,238,4,8,2,0,0,0,0,0,0,73,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,29,238,59,22,5,29,87,0.202,2.5,3,3,2,0,326,48,0,0,5,"""Training"""
"""moseldu01""",2007,380500,276,4,4,1,1,0,0,0,0,0,46,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,0,0,0,276,97,45,7,27,50,0.277,4.4,3,6,3,1,383,13,0,0,13,"""Training"""
"""byrdma01""",2012,6500000,315,35,0,1,0,0,0,0,0,0,13,43,1,3,0,0,0,0,1,3,10,1,1,0,0,2,13,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,"""Training"""
"""floreje02""",2008,400000,2019,474,29,5,6,7,0,47,17,0,82,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,78,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,"""Training"""


# Classification Problem

## Setting Up

In [6]:
import sklearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [7]:
from sklearn.metrics import confusion_matrix

def get_metrics(y_true, y_pred, labels=None):
    """ Compute various measures from a confusion matrix

    Args:
        y_true: a column of the true labels.
        y_pred: a column of the predicted labels.
    Returns:
        A dictionary of metrics.
    """
    conf_mat = confusion_matrix(y_true, y_pred, labels=labels)
    tn, fp, fn, tp = conf_mat.ravel()

    return {'accuracy'          :  (tp + tn)/(tp + tn + fp + fn),
            'balanced_accuracy' :  0.5*tn / (tn+fp) + 0.5*tp / (tp+fn),
            'specificity'       :  tn / (tn+fp),
            'sensitivity'       :  tp / (tp+fn),
             'ppv'              :  tp / (tp + fp),
             'npv'              :  tn / (tn + fn)}


In [23]:
fielding_awards

playerID,yearID,lgID,POS,InnOuts,PO,A,E,DP,PB,WP,SB,CS,ZR,Gold Glove?,Training-Validation
str,i64,str,str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,str,str
"""topaju01""",2022,"""NL""","""P""",22,1,4,0,0,0,0,0,0,0,"""No""","""Training"""
"""whitemi03""",2022,"""NL""","""P""",168,5,4,0,0,0,0,0,0,0,"""No""","""Training"""
"""andergr01""",2023,"""AL""","""P""",107,3,4,0,1,0,0,0,0,0,"""No""","""Validation"""
"""madrini01""",2022,"""NL""","""2B""",1462,106,144,1,46,0,0,0,0,0,"""No""","""Training"""
"""doteloc01""",2013,"""AL""","""P""",14,0,1,0,0,0,0,0,0,0,"""No""","""Training"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""stallja01""",2023,"""NL""","""P""",24,0,0,0,0,0,0,0,0,0,"""No""","""Validation"""
"""anderti01""",2023,"""AL""","""2B""",54,3,2,1,0,0,0,0,0,0,"""No""","""Validation"""
"""castrja01""",2020,"""AL""","""C""",383,131,7,0,0,1,0,7,4,0,"""No""","""Training"""
"""pearcst01""",2016,"""AL""","""1B""",834,244,22,3,26,0,0,0,0,0,"""No""","""Training"""


In [25]:
fielding_awards.columns

['playerID',
 'yearID',
 'lgID',
 'POS',
 'InnOuts',
 'PO',
 'A',
 'E',
 'DP',
 'PB',
 'WP',
 'SB',
 'CS',
 'ZR',
 'Gold Glove?',
 'Training-Validation']

## Preparing Data

In [28]:
fielding_training = fielding_awards.filter(pl.col('Training-Validation') == 'Training').drop(['Gold Glove?', 'playerID', 'yearID', 'lgID', 'POS', 'Training-Validation'])
gold_glove_train = fielding_awards.filter(pl.col('Training-Validation') == 'Training').drop(['playerID', 'yearID', 'lgID', 'POS', 'Training-Validation']).select('Gold Glove?')

fielding_validation = fielding_awards.filter(pl.col('Training-Validation') == 'Validation').drop(['Gold Glove?', 'playerID', 'yearID', 'lgID', 'POS', 'Training-Validation'])
gold_glove_validation = fielding_awards.filter(pl.col('Training-Validation') == 'Validation').drop(['playerID', 'yearID', 'lgID', 'POS', 'Training-Validation']).select('Gold Glove?')

fielding_stats_train = fielding_training.to_pandas()
gold_glove_train = gold_glove_train.to_pandas()

fielding_stats_validation = fielding_validation.to_pandas()
gold_glove_validation = gold_glove_validation.to_pandas()

## Grid Search

### Tree Grid Search

In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
import numpy as np

In [32]:
param_grid = {'max_depth': [None, 1,2,3,4,5],
              'min_samples_split' : [1, 5, 10],
               'min_samples_leaf' : [1, 5, 10],
              'class_weight':[None, "balanced"]
              }


In [9]:
from sklearn.model_selection import StratifiedKFold

In [34]:
state = 458

cv_obj = StratifiedKFold(n_splits=10, shuffle=True, random_state=state)

grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid, verbose=3, cv=cv_obj)

grid_search.fit(fielding_training, gold_glove_train)

Fitting 10 folds for each of 108 candidates, totalling 1080 fits
[CV 1/10] END class_weight=None, max_depth=None, min_samples_leaf=1, min_samples_split=1;, score=nan total time=   0.0s
[CV 2/10] END class_weight=None, max_depth=None, min_samples_leaf=1, min_samples_split=1;, score=nan total time=   0.0s
[CV 3/10] END class_weight=None, max_depth=None, min_samples_leaf=1, min_samples_split=1;, score=nan total time=   0.0s
[CV 4/10] END class_weight=None, max_depth=None, min_samples_leaf=1, min_samples_split=1;, score=nan total time=   0.0s
[CV 5/10] END class_weight=None, max_depth=None, min_samples_leaf=1, min_samples_split=1;, score=nan total time=   0.0s
[CV 6/10] END class_weight=None, max_depth=None, min_samples_leaf=1, min_samples_split=1;, score=nan total time=   0.0s
[CV 7/10] END class_weight=None, max_depth=None, min_samples_leaf=1, min_samples_split=1;, score=nan total time=   0.0s
[CV 8/10] END class_weight=None, max_depth=None, min_samples_leaf=1, min_samples_split=1;, scor

360 fits failed out of a total of 1080.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
360 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\ben8h\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\ben8h\anaconda3\Lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "C:\Users\ben8h\anaconda3\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\ben8h\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParam

In [38]:
grid_search.best_score_

0.9881267356293654

In [40]:
grid_search.best_params_

{'class_weight': None,
 'max_depth': 1,
 'min_samples_leaf': 1,
 'min_samples_split': 5}

In [42]:
best_model_tree = grid_search.best_estimator_
best_model_tree

### Random Forest Grid Search

In [45]:
rf_param_grid = {'n_estimators': 10 ** np.arange(1, 3),
              'max_depth': [None, 1,2,3,4,5],
              'min_samples_split' : [1, 5, 10],
               'min_samples_leaf' : [1, 5, 10],
              'class_weight':[None, "balanced"]
              }

rf_param_grid

{'n_estimators': array([ 10, 100], dtype=int32),
 'max_depth': [None, 1, 2, 3, 4, 5],
 'min_samples_split': [1, 5, 10],
 'min_samples_leaf': [1, 5, 10],
 'class_weight': [None, 'balanced']}

In [10]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf_grid_search = GridSearchCV(RandomForestClassifier(), rf_param_grid, verbose=3, cv=cv_obj)

rf_grid_search.fit(fielding_training, gold_glove_train)

In [None]:
rf_grid_search.best_score_

In [None]:
rf_grid_search.best_params_

In [None]:
best_model_rf = rf_grid_search.best_estimator_
best_model_rf

### Comparing Scores

In [None]:
rf_grid_search.best_score_ > grid_search.best_score_

### Measuring Performance

In [None]:
gold_glove_validation_pred_rf = rf_grid_search.predict(fielding_validation)

In [None]:
get_metrics(gold_glove_validation, gold_glove_validation_pred_rf)

## Fitting the Best Model

In [None]:
forest = RandomForestClassifier(min_samples_leaf=5, min_samples_split=10, n_estimators=10)
forest.fit(fielding_training, gold_glove_train)

## ROC Curve

In [None]:
from sklearn.metrics import RocCurveDisplay

plt.rcParams["figure.figsize"] = (8,6)

RocCurveDisplay.from_estimator(forest, fielding_training, gold_glove_train)

plt.show()

## AUC

In [11]:
def get_class_prob_df(clf, X):
    """ Return a dataframe of class probabilities.

    Arg:
        clf:  A fitted model object
        X:    The matrix of (test) features.
    Returns: A dataframe of class probabilities
    """
    return pd.DataFrame(clf.predict_proba(X), columns=clf.classes_)

In [None]:
y_test_probs = get_class_prob_df(forest, fielding_validation)

y_test_probs

In [None]:
from sklearn.metrics import roc_auc_score

roc_auc_score(gold_glove_validation, y_test_probs['Yes'])

## Balanced Success

In [None]:
forest = RandomForestClassifier(min_samples_leaf=5, min_samples_split=10, n_estimators=10)
forest.fit(fielding_training, gold_glove_train)

y_test_pred = forest.predict(fielding_validation)
y_test_prob = get_class_prob_df(forest, fielding_validation)

In [None]:
forest2 = RandomForestClassifier()
forest2.fit(fielding_training, gold_glove_train)

y_test_pred2 = forest.predict(fielding_validation)
y_test_prob2 = get_class_prob_df(forest2, fielding_validation)

In [None]:
get_metrics(gold_glove_validation, y_test_pred)

In [None]:
get_metrics(gold_glove_validation, y_test_pred2)

In [None]:
ax = plt.gca()

RocCurveDisplay.from_estimator(forest, fielding_validation, gold_glove_validation, ax = ax, name = "Random Forest")
RocCurveDisplay.from_estimator(forest2, fielding_validation, gold_glove_validation, ax = ax, name = "Default Random Forest")

plt.show()

In [None]:
roc_auc_score(gold_glove_validation, y_test_probs['Yes'])

In [None]:
roc_auc_score(gold_glove_validation, y_test_prob2['Yes'])

# Regression Problem

## Preparing Data

In [None]:
salaries.columns

In [12]:
salaries_training = salaries_final.filter(pl.col('Training-Validation') == 'Training').drop('playerID','yearID','salary', 'Training-Validation')
salary_train = salaries_final.filter(pl.col('Training-Validation') == 'Training').drop('playerID','yearID','Training-Validation').select('salary')

salaries_validation = salaries_final.filter(pl.col('Training-Validation') == 'Validation').drop('playerID','yearID','salary', 'Training-Validation')
salary_validation = salaries_final.filter(pl.col('Training-Validation') == 'Validation').drop('playerID','yearID','Training-Validation').select('salary')

salaries_training = salaries_training.to_pandas()
salary_train = salary_train.to_pandas()

salaries_validation = salaries_validation.to_pandas()
salary_validation = salary_validation.to_pandas()


In [13]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

## Tree Grid Search

In [14]:
tree_param_grid = {
    'max_depth': [None, 3, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 5, 10]
}

from sklearn.model_selection import StratifiedKFold

state = 458

cv_obj = StratifiedKFold(n_splits=10, shuffle=True, random_state=state)

grid_search_tree_regressor = GridSearchCV(DecisionTreeRegressor(), tree_param_grid, verbose=3, cv=cv_obj)

grid_search_tree_regressor.fit(salaries_training, salary_train)

Fitting 10 folds for each of 36 candidates, totalling 360 fits




[CV 1/10] END max_depth=None, min_samples_leaf=1, min_samples_split=2;, score=-0.526 total time=   0.3s
[CV 2/10] END max_depth=None, min_samples_leaf=1, min_samples_split=2;, score=-1.146 total time=   0.3s
[CV 3/10] END max_depth=None, min_samples_leaf=1, min_samples_split=2;, score=-0.938 total time=   0.4s
[CV 4/10] END max_depth=None, min_samples_leaf=1, min_samples_split=2;, score=-0.875 total time=   0.4s
[CV 5/10] END max_depth=None, min_samples_leaf=1, min_samples_split=2;, score=-0.504 total time=   0.3s
[CV 6/10] END max_depth=None, min_samples_leaf=1, min_samples_split=2;, score=-0.402 total time=   0.3s
[CV 7/10] END max_depth=None, min_samples_leaf=1, min_samples_split=2;, score=-1.104 total time=   0.4s
[CV 8/10] END max_depth=None, min_samples_leaf=1, min_samples_split=2;, score=-0.523 total time=   0.4s
[CV 9/10] END max_depth=None, min_samples_leaf=1, min_samples_split=2;, score=-0.731 total time=   0.4s
[CV 10/10] END max_depth=None, min_samples_leaf=1, min_samples_s

In [15]:
grid_search_tree_regressor.best_score_

np.float64(0.11097351422272048)

In [16]:
grid_search_tree_regressor.best_params_

{'max_depth': 5, 'min_samples_leaf': 10, 'min_samples_split': 2}

In [17]:
best_model_tree_regressor = grid_search_tree_regressor.best_estimator_
best_model_tree_regressor

## Forest Grid Search

In [None]:
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 5, 10]
}

from sklearn.model_selection import StratifiedKFold

state = 458

cv_obj = StratifiedKFold(n_splits=10, shuffle=True, random_state=state)

grid_search_forest_r = GridSearchCV(RandomForestRegressor(), rf_param_grid, verbose=3, cv=cv_obj)

grid_search_forest_r.fit(salaries_training, salary_train)

In [None]:
grid_search_forest_r.best_score_

In [None]:
grid_search_forest_r.best_params_

In [None]:
best_model_forest_r = grid_search_forest_r.best_estimator_
best_model_forest_r

## Comparing Scores

In [None]:
grid_search_forest_r.best_score_ > grid_search_tree_regressor.best_score_

## Measuring Performance

In [None]:
salaries_validation_pred_rf = grid_search_forest_r.predict(salaries_validation)

In [55]:
get_metrics(salary_validation, salaries_validation_pred_rf)

  'ppv'              :  tp / (tp + fp),


{'accuracy': 0.9864668922184631,
 'balanced_accuracy': 0.5,
 'specificity': 1.0,
 'sensitivity': 0.0,
 'ppv': nan,
 'npv': 0.9864668922184631}

## Fitting the Best Model

In [None]:
forest = RandomForestClassifier(min_samples_leaf=5, min_samples_split=10, n_estimators=10)
forest.fit(salaries_training, salary_train)

## Root MSE & $R^2$

In [None]:
from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error

In [None]:
y_test_pred = forest.predict(X_test)

y_test_pred

In [None]:
{'R^2': explained_variance_score(y_test, y_test_pred),
 'MSE': mean_squared_error(y_test, y_test_pred)
}