In [2]:
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd 
from mypackage import cleaning_functions
from mypackage import scraping_functions as sf

In [31]:
game_url = 'https://www.pro-football-reference.com/boxscores/202309070kan.htm'
df = sf.scrape_game_data(game_url)

# seconds_left function
def seconds_left(plays):
    seconds_left = abs(plays['play_start_time'] - 60) * 60
    return seconds_left


def score_diff(plays):
    return plays.iloc[:,5].astype(float) - plays.iloc[:,6].astype(float)


# make adjusted score
#adjusted score = score / (seconds_left + 1) ^ gamma
def adjusted_score_calc(plays):
    gamma = .5

    adjusted_score = plays['score_diff'] / ((plays['seconds_left']) + 1) ** gamma

    return adjusted_score


def win(plays):
    #plays = pd.DataFrame(plays)
    winning_team = None

    # Get the column names dynamically
    home_team = plays.columns[6]
    vis_team = plays.columns[5]

    Y = []

    # Compare who is the winning team
    if plays.iloc[-1, 5] >= plays.iloc[-1, 6]:
        winning_team = vis_team
    else:
        winning_team = home_team
        

    # Figure out who is winning at the end
    for _, play in plays.iterrows():
        if play['possession'] == winning_team:
            Y.append(1)
        else:
            Y.append(0)
        
    
    return Y

Chiefs vs. Lions 
 total plays: 132


Down The current down (1st, 2nd, 3rd, or 4th)
Score Difference in score between the two teams
Seconds Number of seconds remaining in the game
AdjustedScore Score/ 1 Seconds+
Spread Las Vegas pre-game point spread
TOTp Total points scored
Yardline Yards from own goal line
YTG Yards to go for a first down

In [22]:
df['seconds_left'] = df.apply(seconds_left,axis =1)
df['score_diff'] = df.iloc[:,5].astype(float) - df.iloc[:,6].astype(float)
df['adjusted_score'] = df.apply(adjusted_score_calc,axis=1)
df['win_team'] = df.apply(lambda row: win(pd.DataFrame(row).transpose()), axis=1)
# Define the column indices to drop
columns_to_drop = [5, 6]

# Drop the specified columns
new_data = df.drop(columns=df.columns[columns_to_drop])

# Optionally, you can also drop additional columns
additional_columns_to_drop = ['Quarter', 'Detail', 'Location', 'Time', 'win_team', 'play_start_time']
new_data = new_data.drop(columns=additional_columns_to_drop)
new_data['field_side'] = pd.Categorical(new_data['field_side'])
new_data['Play_Type'] = pd.Categorical(new_data['Play_Type'])
new_data['possession'] = pd.Categorical(new_data['possession'])
new_data['Down'] = pd.Categorical(new_data['Down'])
new_data['ToGo'] = pd.to_numeric(new_data['ToGo'], errors='coerce')
new_data['ToGo'] = new_data['ToGo'].astype(float)
new_data['EPB'] = new_data['EPB'].astype(float)
new_data['EPA'] = new_data['EPA'].astype(float)
new_data = pd.get_dummies(new_data, columns=['Play_Type', 'Down', 'field_side', 'possession'])
y = df['win_team']
y = y.apply(lambda x:float( x[0]))
new_data = new_data.fillna(0)

In [23]:
new_data.dtypes

ToGo                       float64
EPB                        float64
EPA                        float64
yardline                     int32
Yardage                    float64
seconds_left               float64
score_diff                 float64
adjusted_score             float64
Play_Type_Pass               uint8
Play_Type_Run                uint8
Play_Type_Special Teams      uint8
Down_                        uint8
Down_1                       uint8
Down_2                       uint8
Down_3                       uint8
Down_4                       uint8
field_side_DET               uint8
field_side_KAN               uint8
possession_DET               uint8
possession_KAN               uint8
dtype: object

In [24]:
new_data

Unnamed: 0,ToGo,EPB,EPA,yardline,Yardage,seconds_left,score_diff,adjusted_score,Play_Type_Pass,Play_Type_Run,Play_Type_Special Teams,Down_,Down_1,Down_2,Down_3,Down_4,field_side_DET,field_side_KAN,possession_DET,possession_KAN
1,0.0,0.00,0.61,35,0.0,3600.0,0.0,0.000000,0,0,1,1,0,0,0,0,0,1,1,0
2,10.0,0.61,1.01,25,7.0,3600.0,0.0,0.000000,0,1,0,0,1,0,0,0,1,0,1,0
3,3.0,1.01,0.30,32,0.0,3569.0,0.0,0.000000,1,0,0,0,0,1,0,0,1,0,1,0
4,3.0,0.30,-1.24,32,0.0,3565.0,0.0,0.000000,1,0,0,0,0,0,1,0,1,0,1,0
5,3.0,-1.24,0.32,32,0.0,3559.0,0.0,0.000000,0,0,1,0,0,0,0,1,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135,5.0,3.71,3.40,25,3.0,113.0,1.0,0.093659,0,1,0,0,0,1,0,0,0,1,1,0
137,2.0,3.40,4.24,22,2.0,107.0,1.0,0.096225,0,1,0,0,0,0,1,0,0,1,1,0
139,10.0,4.24,3.56,20,-1.0,102.0,1.0,0.098533,0,1,0,0,1,0,0,0,0,1,1,0
140,11.0,3.56,2.74,21,-1.0,65.0,1.0,0.123091,0,1,0,0,0,1,0,0,0,1,1,0


In [25]:
X_train, X_test, y_train, y_test = train_test_split(new_data, y, test_size=.2, random_state=42)

In [26]:
rf_classifier = RandomForestClassifier(n_estimators=400, random_state=42)
rf_classifier.fit(X_train, y_train)

In [27]:
y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy



0.9629629629629629

array([0., 1., 1., 0., 0., 1., 0., 1., 0., 0., 1., 0., 0., 1., 0., 1., 1.,
       1., 1., 1., 1., 0., 1., 1., 0., 0., 1.])

In [29]:
df = new_data
df['winner'] = y

In [39]:
X_train

Unnamed: 0,ToGo,EPB,EPA,yardline,Yardage,seconds_left,score_diff,adjusted_score,Play_Type_Pass,Play_Type_Run,Play_Type_Special Teams,Down_,Down_1,Down_2,Down_3,Down_4,field_side_DET,field_side_KAN,possession_DET,possession_KAN
102,22.0,-1.64,0.43,13,20.0,1021.0,-3.0,-0.093842,1,0,0,0,0,1,0,0,1,0,1,0
103,2.0,0.43,-1.18,33,0.0,981.0,-3.0,-0.095734,1,0,0,0,0,0,1,0,1,0,1,0
1,0.0,0.00,0.61,35,0.0,3600.0,0.0,0.000000,0,0,1,1,0,0,0,0,0,1,1,0
13,7.0,-1.07,-2.32,12,5.0,3306.0,0.0,0.000000,1,0,0,0,0,0,1,0,1,0,1,0
133,25.0,-1.37,-3.58,30,5.0,129.0,1.0,0.087706,1,0,0,0,0,0,0,1,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78,10.0,-1.42,-2.49,7,4.0,1653.0,-7.0,-0.172120,1,0,0,0,0,0,1,0,1,0,1,0
113,7.0,-0.23,-0.89,28,-5.0,647.0,-6.0,-0.235702,0,1,0,0,0,0,1,0,1,0,1,0
15,10.0,0.28,-0.13,20,1.0,3220.0,0.0,0.000000,0,1,0,0,1,0,0,0,1,0,1,0
99,2.0,3.85,2.44,17,8.0,1038.0,0.0,0.000000,1,0,0,0,0,0,1,0,1,0,0,1
