In [1]:
from bs4 import BeautifulSoup
import requests # sends requests to a website
import pandas as pd
import matplotlib.pyplot as plt
import random
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
website = 'https://www.teamrankings.com/ncaa-basketball/team/purdue-boilermakers/' # this is the website that I will parse
result = requests.get(website)
context = result.text
soup = BeautifulSoup(context, 'lxml')

In [3]:
 # list of teams to extract data from
teams = ['Purdue', 'Duke', 'Merrimack', 'Norfolk St', 'Oklahoma', 'New Mexico', 'Dayton', 'McNeese St', 'Baylor', 
         'UC Irvine', 'St Marys', 'Indiana St', 'Alabama', 'High Point', 'Wash State', 'Northwestern', 'Marquette', 
        'Morehead St', 'Arizona', 'S Dakota St', 'Texas', 'Boise St', 'Wisconsin', 'Richmond', 'Auburn', 'App State', 
        'BYU', 'Gonzaga', 'Creighton', 'Charl South', 'Florida', 'Nebraska', 'Kansas', 'E Washingtn', 'Connecticut', 'E Kentucky', 
        'Fla Atlantic', 'TX Christian', 'Texas Tech', 'Samford', 'San Diego St', 'Yale', 'Kentucky', 'Grd Canyon', 'Iowa St', 
         'Oakland', 'S Carolina', 'Nevada', 'N Carolina', 'Colgate', 'Houston', 'Sam Hous St', 'Utah St', 
         'Miss State', 'Clemson', 'Providence', 'Illinois', 'Akron', 'Colorado St', 'S Florida', 'Vermont', 
         'Michigan St', 'Virginia', 'Tennessee', 'Quinnipiac']
print(len(teams))
print(sorted(teams))
teams_urls = [] # list will eventually contain all urls to extract data from

65
['Akron', 'Alabama', 'App State', 'Arizona', 'Auburn', 'BYU', 'Baylor', 'Boise St', 'Charl South', 'Clemson', 'Colgate', 'Colorado St', 'Connecticut', 'Creighton', 'Dayton', 'Duke', 'E Kentucky', 'E Washingtn', 'Fla Atlantic', 'Florida', 'Gonzaga', 'Grd Canyon', 'High Point', 'Houston', 'Illinois', 'Indiana St', 'Iowa St', 'Kansas', 'Kentucky', 'Marquette', 'McNeese St', 'Merrimack', 'Michigan St', 'Miss State', 'Morehead St', 'N Carolina', 'Nebraska', 'Nevada', 'New Mexico', 'Norfolk St', 'Northwestern', 'Oakland', 'Oklahoma', 'Providence', 'Purdue', 'Quinnipiac', 'Richmond', 'S Carolina', 'S Dakota St', 'S Florida', 'Sam Hous St', 'Samford', 'San Diego St', 'St Marys', 'TX Christian', 'Tennessee', 'Texas', 'Texas Tech', 'UC Irvine', 'Utah St', 'Vermont', 'Virginia', 'Wash State', 'Wisconsin', 'Yale']


In [4]:
all_teams_dropdown = soup.find_all('select', class_ = 'redirectOnChange')[1] # finds dropdown menu containing all teams
options = all_teams_dropdown.find_all('option') #gets each team from the dropdown menu

teams_in_order = [] # gets teams in the same order that the urls are acquired so that team1 is populated correctly in later code

for option in options:
    if option.text in teams: # if each team exists in the teams array
        # get the end part of the url and store in teams_urls
        teams_in_order.append(option.text)
        url = option['value']
        teams_urls.append(url)
        
all_urls = ['https://www.teamrankings.com/' + url for url in teams_urls]    
print(len(all_urls))
print(len(teams_in_order))
print(teams_in_order)

52
52
['Akron', 'Alabama', 'App State', 'Arizona', 'Auburn', 'Baylor', 'Boise St', 'BYU', 'Clemson', 'Colgate', 'Colorado St', 'Creighton', 'Dayton', 'Duke', 'E Kentucky', 'Florida', 'Gonzaga', 'High Point', 'Houston', 'Illinois', 'Indiana St', 'Iowa St', 'Kansas', 'Kentucky', 'Marquette', 'Merrimack', 'Michigan St', 'Morehead St', 'Nebraska', 'Nevada', 'New Mexico', 'Norfolk St', 'Northwestern', 'Oakland', 'Oklahoma', 'Providence', 'Purdue', 'Quinnipiac', 'Richmond', 'S Dakota St', 'S Florida', 'Samford', 'San Diego St', 'Tennessee', 'Texas', 'Texas Tech', 'UC Irvine', 'Utah St', 'Vermont', 'Virginia', 'Wisconsin', 'Yale']


In [5]:
# initialize empty lists to be populated with data
team1 = []
team2 = []
team1_score = []
team2_score = []
location = []

In [6]:
index = 0 # index for keeping track of which team is being parsed, will be used for team1
for site in all_urls:
    # parse the given site
    result = requests.get(site)
    context = result.text
    soup = BeautifulSoup(context, 'lxml')
    
    # extract data
    
    # get the second table (the one containing game data)
    table = soup.find_all('table')[1]
    rows = table.find_all('tr') # get each row in the table
    del rows[0] #deletes the header

    for row in rows:
        columns = row.find_all('td') # get each column in each row
        
        # randomly assign team1 and team2
        randInt = random.randint(0, 1)
        if randInt == 0:
            team1.append(teams_in_order[index])
            team2.append(columns[1].text)
        else:
            team2.append(teams_in_order[index])
            team1.append(columns[1].text)

        score = columns[2].text #score is in format "W/L score1-score2"
        split_dash = score.split('-')
        split_space = split_dash[0].split(' ')

        # if a game has not yet been played, add scores as None
        if len(split_dash) == 2:
            team1_score.append(int(split_space[1]))
            team2_score.append(int(split_dash[1]))
        else:
            team1_score.append(None)
            team2_score.append(None)

        location.append(columns[3].text)
        
    index += 1
    

In [7]:
df = pd.DataFrame({'team1':team1, 'team2':team2, 'team1_score':team1_score, 'team2_score':team2_score, 'team1_location':location})
print(df)

              team1         team2  team1_score  team2_score team1_location
0       Arkansas St         Akron         75.0         80.0           Away
1     Ohio Wesleyan         Akron         88.0         57.0           Home
2             Akron  Saint Mary's         68.0         87.0           Away
3             Lamar         Akron         79.0         72.0           Home
4             Akron         Omaha         92.0         84.0           Home
...             ...           ...          ...          ...            ...
1601        Cornell          Yale         92.0         88.0           Home
1602       Columbia          Yale         90.0         64.0           Home
1603      Dartmouth          Yale          NaN          NaN           Away
1604           Yale       Harvard          NaN          NaN           Away
1605           Yale         Brown          NaN          NaN           Away

[1606 rows x 5 columns]


In [8]:
# !pip install scikit-learn
import numpy as np
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt
import sklearn.datasets
from pandas import DataFrame

In [9]:
# import team data
data_dump = pd.read_csv('march_madness_data_dump.csv')
data_dump.head()

Unnamed: 0,Rk,School,Win-Loss Percentage,SRS,SOS,Team Points,Opponent Ponts,Minutes Played,FG,FGA,...,FT,FTA,FT%,ORB,TRB,AST,STL,BLK,TOV,PF
0,1,Abl Christian,0.519,-5.16,-1.91,1933,1901,1085,697,1538,...,412,582,0.708,289,898,347,292,73,391,553
1,2,Air Force,0.148,-7.22,2.41,1693,1953,1090,579,1366,...,314,495,0.634,204,811,360,154,81,355,478
2,3,Akron,0.778,3.39,-3.81,2256,1992,1085,819,1754,...,323,433,0.746,323,1075,498,207,100,339,495
3,4,Alabama,0.815,25.06,14.65,2452,2171,1085,831,1718,...,515,716,0.719,362,1182,447,160,129,358,512
4,5,Alab A&M,0.333,-19.21,-9.66,2044,2150,1100,698,1708,...,426,640,0.666,400,1029,374,236,113,439,603


In [10]:
# merge the team data with the game data, with the school name being the common column
merge1 = pd.merge(df, data_dump, left_on = 'team1', right_on = 'School', how = 'left')
final = pd.merge(merge1, data_dump, left_on = 'team2', right_on = 'School', how = 'left', suffixes = ('_team1', '_team2'))

# find all rows with null values, either from unplayed games or missing info in the data_dump
bad_rows = final[final.isnull().any(axis=1)]
print(bad_rows[['team1', 'team2', 'team1_score', 'team2_score']])

# remove all rows with null values from final
final = final[~final.isin(bad_rows)].dropna()

# drop unnecessary columns
final = final.drop(columns = ['team1', 'team2', 'Rk_team1', 'Rk_team2', 'School_team1', 'School_team2'])

# final output visualization
print(final.head())
print(final.shape)

              team1           team2  team1_score  team2_score
1     Ohio Wesleyan           Akron         88.0         57.0
2             Akron    Saint Mary's         68.0         87.0
4             Akron           Omaha         92.0         84.0
7             Akron  SUNY-Brockport        101.0         48.0
8             Akron       Milwaukee         81.0        100.0
...             ...             ...          ...          ...
1597           Yale            Penn         90.0         61.0
1599           Penn            Yale         72.0         71.0
1603      Dartmouth            Yale          NaN          NaN
1604           Yale         Harvard          NaN          NaN
1605           Yale           Brown          NaN          NaN

[493 rows x 4 columns]
   team1_score  team2_score team1_location  Win-Loss Percentage_team1  \
0         75.0         80.0           Away                      0.690   
3         79.0         72.0           Home                      0.643   
5         97.

In [11]:
# set the target to the score margin (score1 - score2)
target = final['team1_score'] - final['team2_score']

# drop score columns, as they make up the margin
X = final.drop(columns = ['team1_score', 'team2_score'])
X['margin'] = target
# perform get_dummies on the location column
X = pd.get_dummies(X, columns = ['team1_location'])
print(X.shape)


(1113, 48)


In [12]:
X.to_csv('MarchMadnessData2024.csv', index=False)
