In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier  # For classification
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler

# Read the input table into a DataFrame. The path is relative, so it is
# resolved against the notebook's working directory.
DATA_PATH = 'pbp_home_final'
data_rf = pd.read_csv(DATA_PATH)

In [2]:
# Print a summary of the loaded frame: index range, column names,
# non-null counts and dtypes.
data_rf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1121 entries, 0 to 1120
Data columns (total 75 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   old_game_id       1121 non-null   int64  
 1   shotgun           1121 non-null   float64
 2   total_home_score  1121 non-null   float64
 3   air_yards         1121 non-null   float64
 4   yards_gained      1121 non-null   float64
 5   game_date         1121 non-null   object 
 6   AirYards_Avg      1121 non-null   float64
 7   shotgun_Avg       1121 non-null   float64
 8   yards_gained_Avg  1121 non-null   float64
 9   home_total_Avg    1121 non-null   float64
 10  year              1121 non-null   int64  
 11  home_team_ARI     1121 non-null   bool   
 12  home_team_ATL     1121 non-null   bool   
 13  home_team_BAL     1121 non-null   bool   
 14  home_team_BUF     1121 non-null   bool   
 15  home_team_CAR     1121 non-null   bool   
 16  home_team_CHI     1121 non-null   bool   


In [7]:
# Build the feature matrix X and the target vector y.
# 'total_home_score' is the target, so it must not appear among the features.
# 'game_date' is an object (string) column and cannot be fed to StandardScaler.
# 'shotgun', 'air_yards' and 'yards_gained' are excluded as well — presumably
# because their '*_Avg' counterparts are used instead; TODO confirm.
# NOTE(review): 'old_game_id' stays in X; it looks like an identifier rather
# than a predictive signal — confirm it is meant to be a feature.
TARGET_COLUMN = 'total_home_score'
EXCLUDED_COLUMNS = ['shotgun', 'air_yards', 'yards_gained', 'game_date']

X = data_rf.drop(columns=[TARGET_COLUMN, *EXCLUDED_COLUMNS])
y = data_rf[TARGET_COLUMN]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply the same statistics to
# the test rows so no information from the test split influences the scaling.
# (StandardScaler is imported with the other dependencies in the first cell.)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [8]:
# Baseline model: a 100-tree random forest fitted on the scaled training
# features. fit() returns the estimator itself, so rf_model is the fitted
# classifier; random_state pins the result.
# NOTE(review): y_train is the float64 'total_home_score' column, so every
# distinct score becomes its own class — consider whether a regressor would
# suit this target better.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_scaled, y_train
)

# Predicted labels for the held-out rows.
y_pred = rf_model.predict(X_test_scaled)

In [9]:
# Fraction of test rows whose predicted label equals the true label.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Per-class precision / recall / F1. Labels that are never predicted have an
# undefined precision; zero_division=0 reports those as 0.0 (the same value
# the default produces) without emitting a warning for each affected metric.
print(classification_report(y_test, y_pred, zero_division=0))

Accuracy: 0.06222222222222222
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         1
         3.0       0.00      0.00      0.00         3
         6.0       0.00      0.00      0.00         2
         7.0       0.00      0.00      0.00         5
         9.0       0.00      0.00      0.00         4
        10.0       0.00      0.00      0.00        11
        12.0       0.00      0.00      0.00         2
        13.0       0.00      0.00      0.00         7
        14.0       0.00      0.00      0.00         5
        15.0       0.00      0.00      0.00         1
        16.0       0.09      0.20      0.13         5
        17.0       0.00      0.00      0.00        17
        18.0       0.00      0.00      0.00         1
        19.0       0.00      0.00      0.00         6
        20.0       0.06      0.12      0.08        17
        21.0       0.12      0.14      0.13         7
        22.0       0.17      0.25      0.20        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
# Hyper-parameter search: 3 forest sizes x 3 depth limits, each configuration
# scored with 5-fold cross-validation on the training split.
# A fresh base estimator is built here so the cell does not depend on the
# state of any previously fitted model; the seed matches the baseline.
param_grid = {'n_estimators': [100, 200, 300], 'max_depth': [None, 5, 10]}
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5)
grid_search.fit(X_train_scaled, y_train)
best_params = grid_search.best_params_

# GridSearchCV refits the winning configuration on the whole training split
# (refit=True is the default), so reuse that fitted model instead of training
# an identical forest a second time.
rf_model_bestparam = grid_search.best_estimator_

# Make predictions on the test set with the tuned model.
# Note: this overwrites the baseline's y_pred / accuracy.
y_pred = rf_model_bestparam.predict(X_test_scaled)

# Fraction of test rows whose predicted label equals the true label.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Per-class report; zero_division=0 reports undefined precision as 0.0
# without emitting a warning for every never-predicted label.
print(classification_report(y_test, y_pred, zero_division=0))



Accuracy: 0.08888888888888889
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         1
         3.0       0.00      0.00      0.00         3
         6.0       0.00      0.00      0.00         2
         7.0       0.00      0.00      0.00         5
         9.0       0.00      0.00      0.00         4
        10.0       0.00      0.00      0.00        11
        12.0       0.00      0.00      0.00         2
        13.0       0.00      0.00      0.00         7
        14.0       0.00      0.00      0.00         5
        15.0       0.00      0.00      0.00         1
        16.0       0.00      0.00      0.00         5
        17.0       0.12      0.12      0.12        17
        18.0       0.00      0.00      0.00         1
        19.0       0.00      0.00      0.00         6
        20.0       0.08      0.71      0.15        17
        21.0       0.00      0.00      0.00         7
        22.0       0.00      0.00      0.00        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
