In [991]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
import seaborn as sns



In [992]:
scaler = MinMaxScaler()

In [993]:
df = pd.read_excel(r'C:\Users\bryce\OneDrive\Documents\NFLpyproj.xlsx')


In [994]:
print(df)

      Season  Week                  Team  Win?  Home?            Team Played  \
0       2020     1    Kansas City Chiefs     1      1         Houston Texans   
1       2020     1      Seattle Seahawks     1      0        Atlanta Falcons   
2       2020     1         Buffalo Bills     1      1          New York Jets   
3       2020     1     Las Vegas Raiders     1      0      Carolina Panthers   
4       2020     1         Chicago Bears     1      0          Detroit Lions   
...      ...   ...                   ...   ...    ...                    ...   
1593    2022    18        Dallas Cowboys     0      0  Washington Commanders   
1594    2022    18  Los Angeles Chargers     0      0         Denver Broncos   
1595    2022    18       New York Giants     0      0    Philadelphia Eagles   
1596    2022    18      Los Angeles Rams     0      0       Seattle Seahawks   
1597    2022    18     Green Bay Packers     0      1          Detroit Lions   

      Point differential  Yard differen

In [995]:
# Now we are going to do some logistic regression modeling
from sklearn.linear_model import LogisticRegression

In [996]:
df.shape

(1598, 9)

In [997]:
df.head()

Unnamed: 0,Season,Week,Team,Win?,Home?,Team Played,Point differential,Yard differential,Turnover differential
0,2020,1,Kansas City Chiefs,1,1,Houston Texans,14,9,1
1,2020,1,Seattle Seahawks,1,0,Atlanta Falcons,3,-123,1
2,2020,1,Buffalo Bills,1,1,New York Jets,10,150,0
3,2020,1,Las Vegas Raiders,1,0,Carolina Panthers,4,-16,0
4,2020,1,Chicago Bears,1,0,Detroit Lions,4,-63,1


In [998]:


season_totals = df.groupby(['Team', 'Season'])[['Point differential', 'Yard differential', 'Turnover differential']].cumsum()

# Add these cumulative sums to the original DataFrame
df[['Season Point Diff Total', 'Season Yard Diff Total', 'Season Turnover Total']] = season_totals

# Calculate total wins per season
df['Season Wins'] = df.groupby(['Team', 'Season'])['Win?'].cumsum()

# Calculate total games played per season per team
df['Games Played'] = df.groupby(['Team', 'Season'])['Week'].cumcount() + 1

# Calculate winning percentage per season
df['Winning Percentage'] = df['Season Wins'] / df['Games Played']

# Print or use the resulting DataFrame 'df' with the updated season totals, wins, and winning percentage
df[df['Team'] == 'Pittsburgh Steelers']


Unnamed: 0,Season,Week,Team,Win?,Home?,Team Played,Point differential,Yard differential,Turnover differential,Season Point Diff Total,Season Yard Diff Total,Season Turnover Total,Season Wins,Games Played,Winning Percentage
14,2020,1,Pittsburgh Steelers,1,0,New York Giants,10,58,1,10,58,1,1,1,1.0
38,2020,2,Pittsburgh Steelers,1,1,Denver Broncos,5,91,0,15,149,1,2,2,1.0
69,2020,3,Pittsburgh Steelers,1,1,Houston Texans,7,127,1,22,276,2,3,3,1.0
132,2020,5,Pittsburgh Steelers,1,1,Philadelphia Eagles,9,31,1,31,307,3,4,4,1.0
159,2020,6,Pittsburgh Steelers,1,1,Cleveland Browns,31,57,2,62,364,5,5,5,1.0
184,2020,7,Pittsburgh Steelers,1,0,Tennessee Titans,3,70,-3,65,434,2,6,6,1.0
214,2020,8,Pittsburgh Steelers,1,0,Baltimore Ravens,4,-236,3,69,198,5,7,7,1.0
249,2020,9,Pittsburgh Steelers,1,0,Dallas Cowboys,5,-9,2,74,189,7,8,8,1.0
275,2020,10,Pittsburgh Steelers,1,1,Cincinnati Bengals,26,53,2,100,242,9,9,9,1.0
299,2020,11,Pittsburgh Steelers,1,0,Jacksonville Jaguars,24,167,3,124,409,12,10,10,1.0


In [999]:
print(df.columns)

Index(['Season', 'Week', 'Team', 'Win?', 'Home?', 'Team Played',
       'Point differential', 'Yard differential', 'Turnover differential',
       'Season Point Diff Total', 'Season Yard Diff Total',
       'Season Turnover Total', 'Season Wins', 'Games Played',
       'Winning Percentage'],
      dtype='object')


In [1000]:
prev_season_stats_selected = df[['Team','Season Point Diff Total', 'Season Yard Diff Total','Season Turnover Total', 'Winning Percentage']].groupby('Team').shift(1)

# Rename the columns to represent previous season statistics
prev_season_stats_selected.columns = [f'Prev_{col}' for col in prev_season_stats_selected.columns]

# Merge the selected previous season statistics to the original DataFrame based on 'Team' and 'Season'
df = pd.concat([df, prev_season_stats_selected], axis=1)

# Display or use the merged DataFrame as needed
df[df['Team']== 'Washington Commanders']



Unnamed: 0,Season,Week,Team,Win?,Home?,Team Played,Point differential,Yard differential,Turnover differential,Season Point Diff Total,Season Yard Diff Total,Season Turnover Total,Season Wins,Games Played,Winning Percentage,Prev_Season Point Diff Total,Prev_Season Yard Diff Total,Prev_Season Turnover Total,Prev_Winning Percentage
9,2020,1,Washington Commanders,1,1,Philadelphia Eagles,10,-26,3,10,-26,3,1,1,1.0,,,,
59,2020,2,Washington Commanders,0,0,Arizona Cardinals,-15,-122,-1,-5,-148,2,1,2,0.5,10.0,-26.0,3.0,1.0
84,2020,3,Washington Commanders,0,0,Cleveland Browns,-14,9,-5,-19,-139,-3,1,3,0.333333,-5.0,-148.0,2.0,0.5
119,2020,4,Washington Commanders,0,1,Baltimore Ravens,-14,-7,1,-33,-146,-2,1,4,0.25,-19.0,-139.0,-3.0,0.333333
147,2020,5,Washington Commanders,0,1,Los Angeles Rams,-20,-321,1,-53,-467,-1,1,5,0.2,-33.0,-146.0,-2.0,0.25
175,2020,6,Washington Commanders,0,0,New York Giants,-1,97,-1,-54,-370,-2,1,6,0.166667,-53.0,-467.0,-1.0,0.2
189,2020,7,Washington Commanders,1,1,Dallas Cowboys,22,255,1,-32,-115,-1,2,7,0.285714,-54.0,-370.0,-2.0,0.166667
260,2020,9,Washington Commanders,0,1,New York Giants,-3,52,-5,-35,-63,-6,2,8,0.25,-32.0,-115.0,-1.0,0.285714
282,2020,10,Washington Commanders,0,0,Detroit Lions,-3,92,-1,-38,29,-7,2,9,0.222222,-35.0,-63.0,-6.0,0.25
296,2020,11,Washington Commanders,1,1,Cincinnati Bengals,11,53,1,-27,82,-6,3,10,0.3,-38.0,29.0,-7.0,0.222222


In [1001]:
df.columns

Index(['Season', 'Week', 'Team', 'Win?', 'Home?', 'Team Played',
       'Point differential', 'Yard differential', 'Turnover differential',
       'Season Point Diff Total', 'Season Yard Diff Total',
       'Season Turnover Total', 'Season Wins', 'Games Played',
       'Winning Percentage', 'Prev_Season Point Diff Total',
       'Prev_Season Yard Diff Total', 'Prev_Season Turnover Total',
       'Prev_Winning Percentage'],
      dtype='object')

In [1002]:
#current_year_stats = df[['Team', 'Winning Percentage', 'Season Point Diff Total', 'Season Yard Diff Total','Season Turnover Total',]].copy()
#previous_year_stats = df[['Team', 'Prev_Season Point Diff Total','Prev_Season Yard Diff Total', 'Prev_Season Turnover Total','Prev_Winning Percentage']].copy()

In [1003]:
#current_year_stats = current_year_stats[['Team', 'Winning Percentage', 'Season Point Diff Total', 'Season Yard Diff Total', 'Season Turnover Total']]
#previous_year_stats = previous_year_stats[['Team', 'Prev_Season Point Diff Total', 'Prev_Season Yard Diff Total', 'Prev_Season Turnover Total', 'Prev_Winning Percentage']]

# Merging data into the main DataFrame ('df') based on the 'Team Played' column
#df = df.merge(current_year_stats, left_on='Team Played', right_on='Team', how='left')
#df = df.merge(previous_year_stats, left_on='Team Played', right_on='Team', how='left', suffixes=('_current', '_previous'))

# Drop redundant 'Team' columns
#df = df.drop(['Team_current', 'Team_previous'], axis=1)

In [1004]:
df.head()

Unnamed: 0,Season,Week,Team,Win?,Home?,Team Played,Point differential,Yard differential,Turnover differential,Season Point Diff Total,Season Yard Diff Total,Season Turnover Total,Season Wins,Games Played,Winning Percentage,Prev_Season Point Diff Total,Prev_Season Yard Diff Total,Prev_Season Turnover Total,Prev_Winning Percentage
0,2020,1,Kansas City Chiefs,1,1,Houston Texans,14,9,1,14,9,1,1,1,1.0,,,,
1,2020,1,Seattle Seahawks,1,0,Atlanta Falcons,3,-123,1,3,-123,1,1,1,1.0,,,,
2,2020,1,Buffalo Bills,1,1,New York Jets,10,150,0,10,150,0,1,1,1.0,,,,
3,2020,1,Las Vegas Raiders,1,0,Carolina Panthers,4,-16,0,4,-16,0,1,1,1.0,,,,
4,2020,1,Chicago Bears,1,0,Detroit Lions,4,-63,1,4,-63,1,1,1,1.0,,,,


In [1005]:
from sklearn.model_selection import train_test_split

In [1006]:
df.columns

Index(['Season', 'Week', 'Team', 'Win?', 'Home?', 'Team Played',
       'Point differential', 'Yard differential', 'Turnover differential',
       'Season Point Diff Total', 'Season Yard Diff Total',
       'Season Turnover Total', 'Season Wins', 'Games Played',
       'Winning Percentage', 'Prev_Season Point Diff Total',
       'Prev_Season Yard Diff Total', 'Prev_Season Turnover Total',
       'Prev_Winning Percentage'],
      dtype='object')

In [1007]:
df.dropna(inplace=True)

In [1008]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

features1 = ['Prev_Winning Percentage', 'Winning Percentage', 'Prev_Season Point Diff Total',
             'Prev_Season Yard Diff Total', 'Prev_Season Turnover Total', 'Home?']
features2 = ['Winning Percentage', 'Season Point Diff Total', 'Season Yard Diff Total',
             'Season Turnover Total', 'Home?']
target = 'Win?'

split_week = 4
historical_data = df[df['Week'] < split_week ]
current_season_data = df[df['Week'] >= split_week]

if df['Week'].max() < split_week:
   x_train, x_test, y_train, y_test = train_test_split(historical_data[features1], historical_data[target], test_size=0.25, random_state=16)
else:
    x_train, x_test, y_train, y_test = train_test_split(current_season_data[features2], current_season_data[target], test_size=0.25, random_state=16)



logreg = LogisticRegression(random_state=16, max_iter = 1000)

logreg.fit(x_train, y_train)

y_pred = logreg.predict(x_test)
print(y_pred)
print(y_test)
len(y_pred)

[0 1 0 1 0 1 1 0 0 0 0 1 0 0 1 0 1 0 0 0 0 1 0 0 1 1 1 1 0 0 1 0 0 1 0 0 0
 1 1 0 1 0 0 1 0 0 0 1 1 0 0 1 0 0 1 0 0 0 0 1 0 1 0 0 1 1 1 1 0 1 0 1 1 0
 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 0 1 0 0 0 1 1 1 1 0 0
 1 1 1 0 1 0 1 1 0 0 1 1 1 1 0 1 1 1 0 1 0 1 1 0 0 0 0 1 1 0 0 1 0 1 0 0 1
 0 1 1 0 1 1 1 1 1 1 1 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0
 1 0 1 0 1 0 1 0 0 1 1 0 1 1 0 1 1 0 0 0 0 1 1 1 1 0 0 1 1 0 1 0 1 1 0 1 0
 1 0 1 1 0 0 1 0 0 0 1 1 0 1 0 1 0 1 0 1 1 0 1 1 1 1 0 0 0 1 1 1 0 0 0 1 1
 1 1 0 0 1 0 1 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 1 0 0 1 0 1 0 0 0 1 0 1 1 1 1
 0 0 0 0 1 1 1 0 0 1 0 0 0 1 0 0 0 0 1 0 1 1 0 1 0 0 1 1 0 0 0 1]
921     0
1396    1
340     0
384     1
894     0
       ..
1299    0
385     1
1220    1
1260    0
1481    1
Name: Win?, Length: 328, dtype: int64


328

In [1009]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.7073170731707317


In [1010]:
from sklearn.metrics import classification_report
print("Classification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.78      0.72       158
           1       0.76      0.64      0.69       170

    accuracy                           0.71       328
   macro avg       0.71      0.71      0.71       328
weighted avg       0.71      0.71      0.71       328



In [1011]:
from sklearn.metrics import confusion_matrix
# Confusion Matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Confusion Matrix:
[[123  35]
 [ 61 109]]


In [1012]:
new_data = pd.read_excel(r'C:\Users\bryce\OneDrive\Documents\NewDataPyProj.xlsx')
new_data.head()

Unnamed: 0,Season,Week,Team,Win?,Home?,Team Played,Point differential,Yard differential,Turnover differential
0,2023,1,Kansas City Chiefs,,1,Detroit Lions,,,
1,2023,1,Carolina Panthers,,0,Atlanta Falcons,,,
2,2023,1,Cincinnati Bengals,,0,Cleveland Browns,,,
3,2023,1,Indianapolis Colts,,1,Jacksonville Jaguars,,,
4,2023,1,Arizona Cardinals,,0,Washington Commanders,,,


In [1013]:
df[df['Team'] == 'Pittsburgh Steelers']

Unnamed: 0,Season,Week,Team,Win?,Home?,Team Played,Point differential,Yard differential,Turnover differential,Season Point Diff Total,Season Yard Diff Total,Season Turnover Total,Season Wins,Games Played,Winning Percentage,Prev_Season Point Diff Total,Prev_Season Yard Diff Total,Prev_Season Turnover Total,Prev_Winning Percentage
38,2020,2,Pittsburgh Steelers,1,1,Denver Broncos,5,91,0,15,149,1,2,2,1.0,10.0,58.0,1.0,1.0
69,2020,3,Pittsburgh Steelers,1,1,Houston Texans,7,127,1,22,276,2,3,3,1.0,15.0,149.0,1.0,1.0
132,2020,5,Pittsburgh Steelers,1,1,Philadelphia Eagles,9,31,1,31,307,3,4,4,1.0,22.0,276.0,2.0,1.0
159,2020,6,Pittsburgh Steelers,1,1,Cleveland Browns,31,57,2,62,364,5,5,5,1.0,31.0,307.0,3.0,1.0
184,2020,7,Pittsburgh Steelers,1,0,Tennessee Titans,3,70,-3,65,434,2,6,6,1.0,62.0,364.0,5.0,1.0
214,2020,8,Pittsburgh Steelers,1,0,Baltimore Ravens,4,-236,3,69,198,5,7,7,1.0,65.0,434.0,2.0,1.0
249,2020,9,Pittsburgh Steelers,1,0,Dallas Cowboys,5,-9,2,74,189,7,8,8,1.0,69.0,198.0,5.0,1.0
275,2020,10,Pittsburgh Steelers,1,1,Cincinnati Bengals,26,53,2,100,242,9,9,9,1.0,74.0,189.0,7.0,1.0
299,2020,11,Pittsburgh Steelers,1,0,Jacksonville Jaguars,24,167,3,124,409,12,10,10,1.0,100.0,242.0,9.0,1.0
337,2020,12,Pittsburgh Steelers,1,1,Baltimore Ravens,5,115,0,129,524,12,11,11,1.0,124.0,409.0,12.0,1.0


In [1014]:
df = pd.merge(df, new_data, on= ['Season', 'Week', 'Team', 'Home?', 'Win?', 'Team Played', 'Point differential', 'Yard differential', 'Turnover differential'], how= 'outer')


In [1015]:
df.tail()

Unnamed: 0,Season,Week,Team,Win?,Home?,Team Played,Point differential,Yard differential,Turnover differential,Season Point Diff Total,Season Yard Diff Total,Season Turnover Total,Season Wins,Games Played,Winning Percentage,Prev_Season Point Diff Total,Prev_Season Yard Diff Total,Prev_Season Turnover Total,Prev_Winning Percentage
1593,2023,1,Miami Dolphins,,0,Los Angeles Chargers,,,,,,,,,,,,,
1594,2023,1,Philadelphia Eagles,,0,New England Patriots,,,,,,,,,,,,,
1595,2023,1,Los Angeles Rams,,0,Seattle Seahawks,,,,,,,,,,,,,
1596,2023,1,Dallas Cowboys,,0,New York Giants,,,,,,,,,,,,,
1597,2023,1,New York Jets,,1,Buffalo Bills,,,,,,,,,,,,,
