## Fitting Random Forest Model - Primary

In [1]:
import pandas as pd

In [2]:
# Load the hits table from a CSV given by relative path; the file is decoded
# as Latin-1 instead of pandas' default UTF-8.
df = pd.read_csv(
    '500hits.csv',
    encoding='latin-1',
)

In [3]:
# Preview the first five rows to check that the columns loaded as expected.
df.head(n=5)

Unnamed: 0,PLAYER,YRS,G,AB,R,H,2B,3B,HR,RBI,BB,SO,SB,CS,BA,HOF
0,Ty Cobb,24,3035,11434,2246,4189,724,295,117,726,1249,357,892,178,0.366,1
1,Stan Musial,22,3026,10972,1949,3630,725,177,475,1951,1599,696,78,31,0.331,1
2,Tris Speaker,22,2789,10195,1882,3514,792,222,117,724,1381,220,432,129,0.345,1
3,Derek Jeter,20,2747,11195,1923,3465,544,66,260,1311,1082,1840,358,97,0.31,1
4,Honus Wagner,21,2792,10430,1736,3430,640,252,101,0,963,327,722,15,0.329,1


In [4]:
# Remove the PLAYER and CS columns and rebind `df` to the trimmed frame.
# NOTE: because `df` is overwritten, running this cell a second time on the
# same kernel raises KeyError (the columns are already gone).
df = df.drop(['PLAYER', 'CS'], axis='columns')

In [5]:
# Feature matrix: every column of the trimmed frame except the HOF label.
# Selecting by name (rather than the positional slice 0:13) keeps the label
# out of X even if columns are added to or removed from the frame upstream.
X = df.drop(columns=['HOF'])

In [6]:
# Target vector: the HOF column. Selecting it by name (rather than by
# position 13) keeps the target correct if the column layout ever changes.
y = df['HOF']

In [9]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=17,
)

In [10]:
from sklearn.ensemble import RandomForestClassifier

In [11]:
# First forest, using scikit-learn's default hyperparameters. The seed is
# pinned so that the fitted trees -- and therefore the score, classification
# report and feature importances derived from `rf` -- are repeatable after a
# kernel restart instead of changing on every run.
rf = RandomForestClassifier(random_state=42)

In [12]:
# Train the first forest on the training split.
rf.fit(X=X_train, y=y_train)

In [13]:
# Predicted class labels for the held-out rows.
y_pred = rf.predict(X=X_test)

In [14]:
# Mean accuracy of the first forest on the test split.
rf.score(X=X_test, y=y_test)

0.8172043010752689

In [15]:
from sklearn.metrics import classification_report

In [16]:
# Per-class precision, recall and F1 for the first forest's predictions.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.83      0.90      0.87        61
           1       0.78      0.66      0.71        32

    accuracy                           0.82        93
   macro avg       0.81      0.78      0.79        93
weighted avg       0.81      0.82      0.81        93



## Feature Importance

In [17]:
# One row per column of X, holding the importance value the fitted forest
# `rf` reports for that column.
features = pd.DataFrame(
    data=rf.feature_importances_,
    index=X.columns,
)

In [18]:
# Display the importance table: indexed by feature name, with a single
# value column that carries the default label 0.
features

Unnamed: 0,0
YRS,0.026651
G,0.081594
AB,0.089247
R,0.126648
H,0.141692
2B,0.060808
3B,0.045748
HR,0.049477
RBI,0.095918
BB,0.037734


## Using Custom Hyperparameters

In [21]:
# Second forest, configured with explicit settings instead of the defaults.
rf2 = RandomForestClassifier(
    n_estimators=1000,      # number of trees in the forest
    criterion='entropy',    # split-quality measure
    max_depth=14,           # upper bound on the depth of each tree
    min_samples_split=10,   # minimum samples a node needs before it may split
    random_state=42,        # fixed seed so the fit is repeatable
)

In [22]:
# Train the second forest on the same training split.
rf2.fit(X=X_train, y=y_train)

In [23]:
# Mean accuracy of the second forest on the test split.
rf2.score(X=X_test, y=y_test)

0.8494623655913979

In [24]:
# Predicted class labels from the second forest for the held-out rows.
y_pred_2 = rf2.predict(X=X_test)

In [26]:
# Per-class precision, recall and F1 for the second forest's predictions.
print(classification_report(y_true=y_test, y_pred=y_pred_2))

              precision    recall  f1-score   support

           0       0.85      0.93      0.89        61
           1       0.85      0.69      0.76        32

    accuracy                           0.85        93
   macro avg       0.85      0.81      0.82        93
weighted avg       0.85      0.85      0.85        93

