# MIAA Football Power Rankings
## by bg

In [67]:
# import packages for scraping
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Ridge Regression we'll run
from sklearn.linear_model import RidgeCV

# set seed for reproducibility
from numpy.random import seed
seed(42)

# disable set with copy warning cause I'm bad
pd.options.mode.chained_assignment = None

In [68]:
# get date range between start of season august 27th and today use pd.date_range
season_start = '2021-08-27'
season_end = '2021-12-15'
date_range = pd.date_range(start=season_start, end=season_end)

# create a list, and for each day use string formatting to format the date as DD/MM/YYYY
date_list = []
for date in date_range:
    date_list.append(date.strftime('%m/%d/%Y'))

In [69]:
# scrapes maxpreps scores for every day in the range. Puts a dictionary of each game into a list
# takes anywhere from 30 seconds to 2.5 minutes to run
game_list = []
for date in date_list:
    url = 'https://www.maxpreps.com/list/schedules_scores.aspx?date=' + date + '&gendersport=boys,football&state=ma'
    try:
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')
        data = soup.find_all("ul", {"class": "teams"})

        for i in range(len(data)):
            try:
                away_team = data[i].find_all("li")[0].find("div", {"class" : "name"}).text
                home_team = data[i].find_all("li")[1].find("div", {"class" : "name"}).text

                away_score = data[i].find_all("li")[0].find("div", {"class" : "score"}).text
                home_score = data[i].find_all("li")[1].find("div", {"class" : "score"}).text

                game_list.append({'away_name': away_team, 'home_name': home_team, 'away_score': away_score, 'home_score': home_score, "date" : date})
            except:
                    continue
    except:
        continue    

In [70]:
# putting games into a DF
games = pd.DataFrame(game_list)

# drop any game where home or away name contains '(#'. These teams do not play in Massachusetts
games = games[~games['away_name'].str.contains('\(#')]
games = games[~games['home_name'].str.contains('\(#')]
games.drop_duplicates(subset=['away_name', 'home_name', 'date'], keep='last', inplace=True)

# show gmes where home or away name contains a letter
games = games[(~games['home_score'].str.contains('\D')) & (~games['away_score'].str.contains('\D'))]

# convert scores to ints
games['away_score'] = games['away_score'].astype(int)
games['home_score'] = games['home_score'].astype(int)

# drop games before thanksgiving, as we want to predict the Super Bowls which are played the week after 
games = games[games['date'] < '11/24/2021']

# if date is after november 18th, make the field 'playoff' = 1
# in the regression model, this should capture the game was played on a neutral site
games['playoff'] = games['date'].apply(lambda x: 1 if x > '11/18/2021' else 0)

games.reset_index(drop=True, inplace=True)
games

Unnamed: 0,away_name,home_name,away_score,home_score,date,playoff
0,Taunton,Stoughton,20,13,08/28/2021,0
1,Bourne,Bristol-Plymouth RVT,26,18,08/28/2021,0
2,Bridgewater-Raynham,Xaverian Brothers,15,27,08/28/2021,0
3,Lynn English,Marblehead,14,28,08/28/2021,0
4,Quincy,Medford,33,32,08/28/2021,0
...,...,...,...,...,...,...
1387,North Attleborough,Billerica Memorial,28,10,11/20/2021,1
1388,Foxborough,Duxbury,21,38,11/20/2021,1
1389,Rockland,Blackstone Valley RVT,20,17,11/20/2021,1
1390,Duxbury,Foxborough,38,21,11/20/2021,1


In [71]:
# get list of all teams
team_count = games['home_name'].unique()
home_team = team_count.tolist()

team_count = games['away_name'].unique()
away_team = team_count.tolist()

# merge home and away teams, and drop duplicates
team_list = home_team + away_team
team_list = list(set(team_list))

# count all times a team in team_list appears in games['home_team']
team_dict = {team : 0 for team in team_list}

for team in team_list:
    team_dict[team] += games[games['home_name'] == team].count()[0]
    team_dict[team] += games[games['away_name'] == team].count()[0]


# put teams who appear in <= 3 games into a list
drop_list = []
for team in team_dict:
    if team_dict[team] <= 3:
        drop_list.append(team)

drop_list

games = games[(~games['home_name'].isin(drop_list)) & (~games['away_name'].isin(drop_list))]

# remove games including these teams
games.reset_index(drop=True, inplace=True)

In [72]:
team_srs = []

# Given the size of the DF, I will use iterrows. However, on larger scales it is smarter to use a vectorized approach
for idx, row in games.iterrows():
    margin = row['home_score'] - row['away_score']
    team_srs.append({row['home_name'] : 1, row['away_name'] : -1, 'margin' : margin, 'playoffs' : row['playoff']})

srs_df = pd.DataFrame(team_srs).fillna(0)
srs_df.head()

Unnamed: 0,Stoughton,Taunton,margin,playoffs,Bristol-Plymouth RVT,Bourne,Xaverian Brothers,Bridgewater-Raynham,Marblehead,Lynn English,...,Roxbury Prep Charter,Athol,Cathedral/Matignon,Holbrook,Tech Boston Academy,Boston English,Blackstone Valley RVT,Drury/Mount Greylock Regional,Excel,Catholic Memorial
0,1.0,-1.0,-7,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,-8,0,1.0,-1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,12,0,0.0,0.0,1.0,-1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,14,0,0.0,0.0,0.0,0.0,1.0,-1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,-1,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Using teams as our input, we will predict the margin of each game
#### Resulting coefficients will be used to determine the strength of each team 

In [73]:
X = srs_df.drop(['margin'], axis=1)
y = srs_df['margin']

# going to be trying various alphas from 1E-5 to 1E5
lm = RidgeCV(alphas=[10 ** i for i in range(-5,5)]).fit(X, y)
lm.alpha_

0.01

In [74]:
# get coefficients and their name
coefs = pd.DataFrame(lm.coef_, X.columns, columns=['Strength'])
coefs = coefs.sort_values('Strength', ascending=False).reset_index()
coefs.head(10)

Unnamed: 0,index,Strength
0,Catholic Memorial,70.063126
1,Central Catholic,51.473724
2,Central,51.094908
3,Franklin,50.118266
4,Lincoln-Sudbury,48.362833
5,Duxbury,48.021034
6,Milford,45.882703
7,St. John's Prep,45.111687
8,Xaverian Brothers,44.406222
9,King Philip Regional,44.056377


In [75]:
def power_rank(teams, df):
    '''
    teams is a list of teams
    df is a dataframe of the regression coefficients
    returns the df filtered to only include desired teams teams
    '''
    return df[df['index'].isin(teams)].sort_values('Strength', ascending=False).reset_index(drop=True)

In [76]:
power_rank(['Catholic Memorial', 'King Philip Regional'] , coefs)

Unnamed: 0,index,Strength
0,Catholic Memorial,70.063126
1,King Philip Regional,44.056377


### Catholic Memorial is far above the next best team.
### Using pre-Super Bowl data, we see they have a coeficient advantage of 26 over King Philips Regional.
### In the Super Bowl, they beat King Philip by 24, inline with expectations from this analysis

In [77]:
power_rank(['Rockland', 'Abington'] , coefs)

Unnamed: 0,index,Strength
0,Abington,23.806526
1,Rockland,17.55732


### Rockland went on to win this matchup. I can't win em all, but the Bulldogs sure can!