In [8]:
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd 
from mypackage import cleaning_functions
from mypackage import scraping_functions as sf

In [9]:
game_url = 'https://www.pro-football-reference.com/boxscores/202309070kan.htm'
df = sf.scrape_game_data(game_url)

Chiefs vs. Lions 
 total plays: 132


In [19]:
df.head()

Unnamed: 0,Quarter,Time,Down,ToGo,Location,DET,KAN,Detail,EPB,EPA,field_side,yardline,play_start_time,Play_Type,possession,Yardage
1,1,15:00,,,KAN 35,0,0,"Harrison Butker kicks off 65 yards, touchback.",0.0,0.61,KAN,35,0.0,Special Teams,DET,0.0
2,1,15:00,1.0,10.0,DET 25,0,0,David Montgomery up the middle for 7 yards (ta...,0.61,1.01,DET,25,0.0,Run,DET,7.0
3,1,14:29,2.0,3.0,DET 32,0,0,Jared Goff pass incomplete short right,1.01,0.3,DET,32,0.516667,Pass,DET,0.0
4,1,14:25,3.0,3.0,DET 32,0,0,Jared Goff pass incomplete deep right intended...,0.3,-1.24,DET,32,0.583333,Pass,DET,0.0
5,1,14:19,4.0,3.0,DET 32,0,0,"Jack Fox punts 61 yards, returned by Richie Ja...",-1.24,0.32,DET,32,0.683333,Special Teams,DET,0.0


In [17]:
# seconds_left function
def seconds_left(plays):
    seconds_left = abs(plays['play_start_time'] - 60) * 60
    return seconds_left

In [26]:
def score_diff(plays):
    return plays.iloc[:,5].astype(float) - plays.iloc[:,6].astype(float)

In [36]:
# make adjusted score
#adjusted score = score / (seconds_left + 1) ^ gamma
def adjusted_score_calc(plays):
    gamma = .5

    adjusted_score = plays['score_diff'] / ((plays['seconds_left']) + 1) ** gamma

    return adjusted_score

In [34]:
df['score_diff'] = df.iloc[:,5].astype(float) - df.iloc[:,6].astype(float)

In [37]:
test_column = df.apply(adjusted_score_calc,axis =1)

In [13]:
df.tail()

Unnamed: 0,Quarter,Time,Down,ToGo,Location,DET,KAN,Detail,EPB,EPA,field_side,yardline,play_start_time,Play_Type,possession,Yardage
135,4,1:53,2,5,KAN 25,21,20,David Montgomery right guard for 3 yards (tack...,3.71,3.4,KAN,25,58.116667,Run,DET,3.0
137,4,1:47,3,2,KAN 22,21,20,David Montgomery up the middle for 2 yards (ta...,3.4,4.24,KAN,22,58.216667,Run,DET,2.0
139,4,1:42,1,10,KAN 20,21,20,Jared Goff kneels for -1 yards,4.24,3.56,KAN,20,58.3,Run,DET,-1.0
140,4,1:05,2,11,KAN 21,21,20,Jared Goff kneels for -1 yards,3.56,2.74,KAN,21,58.916667,Run,DET,-1.0
141,4,0:39,3,12,KAN 22,21,20,Jared Goff kneels for -1 yards,2.74,1.93,KAN,22,59.35,Run,DET,


In [137]:
def win(plays):
    #plays = pd.DataFrame(plays)
    winning_team = None

    # Get the column names dynamically
    home_team = plays.columns[6]
    vis_team = plays.columns[5]

    Y = []

    # Compare who is the winning team
    if plays.iloc[-1, 5] >= plays.iloc[-1, 6]:
        winning_team = vis_team
    else:
        winning_team = home_team
        

    # Figure out who is winning at the end
    for _, play in plays.iterrows():
        if play['possession'] == winning_team:
            Y.append(1)
        else:
            Y.append(0)
        
    
    return Y


'21'

In [146]:
test_column1 = df.apply(lambda row: win(pd.DataFrame(row).transpose()), axis=1)
test_column1 = test_column1.apply(lambda x:float( x[0]))


In [147]:
test_column1.head(10)

1     1.0
2     1.0
3     1.0
4     1.0
5     1.0
6     0.0
7     0.0
8     0.0
9     0.0
10    0.0
dtype: float64

Down The current down (1st, 2nd, 3rd, or 4th)
Score Difference in score between the two teams
Seconds Number of seconds remaining in the game
AdjustedScore Score/ 1 Seconds+
Spread Las Vegas pre-game point spread
TOTp Total points scored
Yardline Yards from own goal line
YTG Yards to go for a first down

In [149]:
df['seconds_left'] = df.apply(seconds_left,axis =1)
df['score_diff'] = df.iloc[:,5].astype(float) - df.iloc[:,6].astype(float)
df['adjusted_score'] = df.apply(adjusted_score_calc,axis=1)
df['win_team'] = df.apply(lambda row: win(pd.DataFrame(row).transpose()), axis=1)
# Define the column indices to drop
columns_to_drop = [5, 6]

# Drop the specified columns
new_data = df.drop(columns=df.columns[columns_to_drop])

# Optionally, you can also drop additional columns
additional_columns_to_drop = ['Quarter', 'Detail', 'Location', 'Time', 'win_team', 'play_start_time']
new_data = new_data.drop(columns=additional_columns_to_drop)
new_data['field_side'] = pd.Categorical(new_data['field_side'])
new_data['Play_Type'] = pd.Categorical(new_data['Play_Type'])
new_data['possession'] = pd.Categorical(new_data['possession'])
new_data['Down'] = pd.Categorical(new_data['Down'])
new_data['ToGo'] = pd.to_numeric(new_data['ToGo'], errors='coerce')
new_data['ToGo'] = new_data['ToGo'].astype(float)
new_data['EPB'] = new_data['EPB'].astype(float)
new_data['EPA'] = new_data['EPA'].astype(float)
new_data = pd.get_dummies(new_data, columns=['Play_Type', 'Down', 'field_side', 'possession'])
y = df['win_team']
y = y.apply(lambda x:float( x[0]))

In [150]:
y

1      1.0
2      1.0
3      1.0
4      1.0
5      1.0
      ... 
135    1.0
137    1.0
139    1.0
140    1.0
141    1.0
Name: win_team, Length: 132, dtype: float64

In [133]:
new_data.dtypes

ToGo                       float64
EPB                        float64
EPA                        float64
yardline                     int64
Yardage                    float64
seconds_left               float64
score_diff                 float64
adjusted_score             float64
Play_Type_Pass               uint8
Play_Type_Run                uint8
Play_Type_Special Teams      uint8
Down_                        uint8
Down_1                       uint8
Down_2                       uint8
Down_3                       uint8
Down_4                       uint8
field_side_DET               uint8
field_side_KAN               uint8
possession_DET               uint8
possession_KAN               uint8
dtype: object

In [153]:
#your_data_encoded = pd.get_dummies(new_data, columns=['Play_Type', 'Down', 'field_side', 'possession'])
new_data = new_data.fillna(0)
new_data

Unnamed: 0,ToGo,EPB,EPA,yardline,Yardage,seconds_left,score_diff,adjusted_score,Play_Type_Pass,Play_Type_Run,Play_Type_Special Teams,Down_,Down_1,Down_2,Down_3,Down_4,field_side_DET,field_side_KAN,possession_DET,possession_KAN
1,0.0,0.00,0.61,35,0.0,3600.0,0.0,0.000000,0,0,1,1,0,0,0,0,0,1,1,0
2,10.0,0.61,1.01,25,7.0,3600.0,0.0,0.000000,0,1,0,0,1,0,0,0,1,0,1,0
3,3.0,1.01,0.30,32,0.0,3569.0,0.0,0.000000,1,0,0,0,0,1,0,0,1,0,1,0
4,3.0,0.30,-1.24,32,0.0,3565.0,0.0,0.000000,1,0,0,0,0,0,1,0,1,0,1,0
5,3.0,-1.24,0.32,32,0.0,3559.0,0.0,0.000000,0,0,1,0,0,0,0,1,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135,5.0,3.71,3.40,25,3.0,113.0,1.0,0.093659,0,1,0,0,0,1,0,0,0,1,1,0
137,2.0,3.40,4.24,22,2.0,107.0,1.0,0.096225,0,1,0,0,0,0,1,0,0,1,1,0
139,10.0,4.24,3.56,20,-1.0,102.0,1.0,0.098533,0,1,0,0,1,0,0,0,0,1,1,0
140,11.0,3.56,2.74,21,-1.0,65.0,1.0,0.123091,0,1,0,0,0,1,0,0,0,1,1,0


In [154]:
X_train, X_test, y_train, y_test = train_test_split(new_data, y, test_size=.2, random_state=42)

In [155]:
rf_classifier = RandomForestClassifier(n_estimators=400, random_state=42)
rf_classifier.fit(X_train, y_train)

In [156]:
y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy



0.9629629629629629