In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report



In [3]:
# Load the two input tables; in both files the first CSV column becomes the index.
projected_standings_awards = pd.read_csv(
    'team_records/nextyear_awards-playoffs.csv',
    index_col=[0],
)
projected_stats = pd.read_csv(
    'team_records/projected_stats.csv',
    index_col=[0],
)

In [4]:
# Discard every row that has a missing value in any column, then print the
# frame summary (row count, columns, dtypes, memory usage).
projected_stats = projected_stats.dropna()
projected_stats.info()

<class 'pandas.core.frame.DataFrame'>
Index: 663 entries, 1 to 701
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Tm                  663 non-null    object 
 1   Year                663 non-null    int64  
 2   ProjectedQBOverall  663 non-null    float64
 3   LastYrQBOverall     663 non-null    float64
 4   ProjectedWROverall  663 non-null    float64
 5   LastYrWROverall     663 non-null    float64
 6   ProjectedRBOverall  663 non-null    float64
 7   LastYrRBOverall     663 non-null    float64
 8   D_Overall           663 non-null    float64
dtypes: float64(7), int64(1), object(1)
memory usage: 51.8+ KB


In [5]:
# Outer-join the two tables on the `Tm` and `Year` keys, then discard incomplete
# rows. A row present in only one table carries NaNs after the join, so the
# dropna() removes it together with any row that had gaps of its own.
data = (
    projected_stats
    .merge(projected_standings_awards, how='outer', on=['Tm', 'Year'])
    .dropna()
)

In [7]:
# Label column, plus the key columns that are excluded from the predictors.
target = 'MakePlayoffsNextYear'
non_feature_cols = {target, 'Tm', 'Year'}
features = [name for name in data.columns if name not in non_feature_cols]

y = data[target]
X = data[features]

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply it to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [11]:
# Hyper-parameter grids, one per model family. Each dict maps an estimator
# argument to the list of candidate values to try.

# Random forest: 3 * 3 * 3 * 3 = 81 combinations.
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Gradient boosting: 3 * 3 * 3 = 27 combinations.
gb_param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
}

# XGBoost: 3 * 3 * 3 * 2 * 2 = 108 combinations.
xgb_param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
}

# Base estimators, all seeded identically so repeated runs are comparable.
rf_model = RandomForestClassifier(random_state=42)
gb_model = GradientBoostingClassifier(random_state=42)
# `use_label_encoder` is deliberately not passed: the installed XGBoost ignores
# it and only emits 'Parameters: { "use_label_encoder" } are not used.' on fit.
xgb_model = xgb.XGBClassifier(random_state=42, eval_metric='logloss')


In [12]:
# 3-fold cross-validated grid search over the random-forest grid, ranked by
# accuracy and run on all available cores.
rf_grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=rf_param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
)
rf_grid_search.fit(X_train_scaled, y_train)

# Keep the winning parameter combination and the estimator fitted with it.
rf_best_params = rf_grid_search.best_params_
rf_best_model = rf_grid_search.best_estimator_



In [13]:
# Score the tuned random forest on the held-out test split.
rf_best_pred = rf_best_model.predict(X_test_scaled)
rf_best_accuracy = accuracy_score(y_true=y_test, y_pred=rf_best_pred)

print("Random Forest Best Parameters:", rf_best_params)
print("Random Forest Accuracy:", rf_best_accuracy)

Random Forest Best Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Random Forest Accuracy: 0.5714285714285714


In [14]:
# Per-class precision, recall and F1 of the tuned random forest on the test split.
rf_report = classification_report(y_true=y_test, y_pred=rf_best_pred)
print(rf_report)

              precision    recall  f1-score   support

         0.0       0.62      0.78      0.69        82
         1.0       0.40      0.24      0.30        51

    accuracy                           0.57       133
   macro avg       0.51      0.51      0.49       133
weighted avg       0.54      0.57      0.54       133



In [15]:
# 3-fold cross-validated grid search over the gradient-boosting grid, ranked by
# accuracy and run on all available cores.
gb_grid_search = GridSearchCV(
    estimator=gb_model,
    param_grid=gb_param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
)
gb_grid_search.fit(X_train_scaled, y_train)

# Keep the winning parameter combination and the estimator fitted with it.
gb_best_params = gb_grid_search.best_params_
gb_best_model = gb_grid_search.best_estimator_

In [16]:
# Score the tuned gradient-boosting model on the held-out test split.
gb_best_pred = gb_best_model.predict(X_test_scaled)
gb_best_accuracy = accuracy_score(y_true=y_test, y_pred=gb_best_pred)

print("Gradient Boosting Best Parameters:", gb_best_params)
print("Gradient Boosting Accuracy:", gb_best_accuracy)

Gradient Boosting Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}
Gradient Boosting Accuracy: 0.5639097744360902


In [17]:
# Per-class precision, recall and F1 of the tuned gradient-boosting model on the test split.
gb_report = classification_report(y_true=y_test, y_pred=gb_best_pred)
print(gb_report)

              precision    recall  f1-score   support

         0.0       0.62      0.77      0.68        82
         1.0       0.39      0.24      0.29        51

    accuracy                           0.56       133
   macro avg       0.50      0.50      0.49       133
weighted avg       0.53      0.56      0.53       133



In [18]:
# 3-fold cross-validated grid search over the XGBoost grid, ranked by accuracy
# and run on all available cores.
xgb_grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=xgb_param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
)
xgb_grid_search.fit(X_train_scaled, y_train)

# Keep the winning parameter combination and the estimator fitted with it.
xgb_best_params = xgb_grid_search.best_params_
xgb_best_model = xgb_grid_search.best_estimator_


Parameters: { "use_label_encoder" } are not used.



In [19]:
# Score the tuned XGBoost model on the held-out test split.
xgb_best_pred = xgb_best_model.predict(X_test_scaled)
xgb_best_accuracy = accuracy_score(y_true=y_test, y_pred=xgb_best_pred)

print("XGBoost Best Parameters:", xgb_best_params)
print("XGBoost Accuracy:", xgb_best_accuracy)

XGBoost Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200, 'subsample': 1.0}
XGBoost Accuracy: 0.5864661654135338


In [20]:
# Per-class precision, recall and F1 of the tuned XGBoost model on the test split.
xgb_report = classification_report(y_true=y_test, y_pred=xgb_best_pred)
print(xgb_report)

              precision    recall  f1-score   support

         0.0       0.62      0.84      0.72        82
         1.0       0.41      0.18      0.25        51

    accuracy                           0.59       133
   macro avg       0.52      0.51      0.48       133
weighted avg       0.54      0.59      0.54       133

