In [1]:
import os
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup
from bs4.element import NavigableString
from pathlib import Path

In [2]:
# Lookup from a round label to its numeric round index. The labels are listed
# in tournament order, so each label's position in the tuple is its index
# (0 for the First Four up to 6 for the national championship).
round_name_to_index = {
    label: position
    for position, label in enumerate((
        'First Four',
        'First Round (Round of 64)',
        'Second Round (Round of 32)',
        'Sweet 16',
        'Elite Eight',
        'National semifinals',
        'National championship',
    ))
}

# 2022 Tournament Results

In [3]:
# Article that lists every game score of the 2022 men's tournament.
url = "https://www.ncaa.com/news/basketball-men/article/2022-07-12/2022-ncaa-bracket-mens-march-madness-scores-stats-records"
# requests has no default timeout, so without one a stalled connection would
# block this cell forever. 30 seconds is generous for a single article page.
response = requests.get(url, timeout=30)
# Fail here on a 4xx/5xx answer instead of handing an error page to the parser,
# where the failure would only show up later as a confusing lookup error.
response.raise_for_status()

In [4]:
# Build the parse tree for the downloaded 2022 page from the raw response
# bytes, using the 'html.parser' backend.
soup = BeautifulSoup(response.content, 'html.parser')

In [5]:
# Find the heading that introduces the list of scores. `find` returns None when
# nothing matches, so check explicitly and report a clear error rather than
# failing with an AttributeError on None.
scores_heading_2022 = soup.find("h2", string="2022 NCAA tournament scores")
if scores_heading_2022 is None:
    raise ValueError(
        'Could not find the <h2> heading "2022 NCAA tournament scores"; '
        'the page layout may have changed.'
    )
# The scores container is the second sibling after the heading; the first
# sibling is skipped (presumably a whitespace text node -- confirm against the page).
scores_2022 = scores_heading_2022.next_sibling.next_sibling

In [6]:
def parse_matchup_2022(matchup_text):
    """Split one 2022 score line into winner and loser fields.

    The line is consumed left to right. Each step searches the remaining text
    with one regex, stores the matched value, and then drops the match plus a
    fixed number of separator characters before the next step runs. The line
    is expected to look like "No. 12 Indiana 66, No. 12 Wyoming 58".

    Args:
        matchup_text: Text of a single game entry.

    Returns:
        dict with keys 'WSEED', 'WTEAM', 'WSCORE', 'LSEED', 'LTEAM', 'LSCORE';
        seeds and scores are ints, team names are strings.

    Raises:
        ValueError: If any field cannot be located in the text.
    """
    # (output column, regex applied to the remaining text,
    #  number of separator characters to skip after the match)
    steps = (
        ('WSEED', r'(?<=No. )\d{1,2}', 1),
        ('WTEAM', r'^\D+(?=\s\d{2,3})', 1),
        ('WSCORE', r'^\d{2,3}(?=,\s)', 2),
        ('LSEED', r'((?<=No. )\d{1,2})|(^\d{1,2})', 1),
        ('LTEAM', r'^\D+(?=\s\d{2,3})', 1),
        ('LSCORE', r'^\d{2,3}', 0),
    )
    fields = {}
    remaining = matchup_text
    for column, pattern, separator_length in steps:
        match = re.search(pattern, remaining)
        if match is None:
            # Name the field and the full line so a page-format change is easy to diagnose.
            raise ValueError(f'Could not parse {column} from matchup text {matchup_text!r}')
        fields[column] = match[0]
        remaining = remaining[match.end() + separator_length:]

    for numeric_column in ('WSEED', 'WSCORE', 'LSEED', 'LSCORE'):
        fields[numeric_column] = int(fields[numeric_column])
    return fields


# Column-oriented accumulator; one entry is appended to every list per game.
results_2022 = {'YEAR': [], 'ROUND': [], 'WSEED': [], 'WTEAM': [], 'WSCORE': [], 'LSEED': [], 'LTEAM': [], 'LSCORE': []}
for round_details in scores_2022.children:
    # Bare text nodes between the round elements carry no data.
    if isinstance(round_details, NavigableString):
        continue

    # First child of a round element is its label, second is the per-location list.
    round_detail_elements = round_details.children
    round_name = next(round_detail_elements).strip()
    round_scores_by_location = next(round_detail_elements)

    for location_list_item in round_scores_by_location.children:
        if isinstance(location_list_item, NavigableString):
            continue

        location_list_sub_items = location_list_item.children
        round_location = next(location_list_sub_items).strip()
        # For the final rounds the "location" label is itself the round name.
        if round_location in ['National semifinals', 'National championship']:
            round_name = round_location

        location_scores = next(location_list_sub_items)
        for matchup in location_scores.children:
            if isinstance(matchup, NavigableString):
                continue

            # Resolve everything that can fail before appending, so the
            # column lists can never end up with different lengths.
            parsed = parse_matchup_2022(matchup.text)
            round_index = round_name_to_index[round_name]

            results_2022['YEAR'].append(2022)
            results_2022['ROUND'].append(round_index)
            for column, value in parsed.items():
                results_2022[column].append(value)

In [7]:
# Turn the dict of equal-length column lists into a DataFrame. Note that the
# name `results_2022` is rebound from the dict to the resulting frame.
results_2022 = pd.DataFrame(results_2022)
# Bare last expression so the notebook renders the frame.
results_2022

Unnamed: 0,YEAR,ROUND,WSEED,WTEAM,WSCORE,LSEED,LTEAM,LSCORE
0,2022,0,12,Indiana,66,12,Wyoming,58
1,2022,0,16,Wright State,93,16,Bryant,82
2,2022,0,11,Notre Dame,89,11,Rutgers,87
3,2022,0,16,Texas Southern,76,16,Tex. A&M CC,67
4,2022,1,1,Kansas,83,16,Texas Southern,56
...,...,...,...,...,...,...,...,...
62,2022,4,8,North Carolina,69,15,St. Peter’s,49
63,2022,4,2,Villanova,50,5,Houston,44
64,2022,5,8,N. Carolina,81,2,Duke,77
65,2022,5,1,Kansas,81,2,Villanova,65


# 2023 Tournament Results

In [8]:
# Article that lists every game score of the 2023 men's tournament.
# `url` and `response` are rebound here, replacing the 2022 values.
url = 'https://www.ncaa.com/news/basketball-men/article/2023-04-18/2023-ncaa-bracket-scores-stats-march-madness-mens-tournament'
# requests has no default timeout, so without one a stalled connection would
# block this cell forever. 30 seconds is generous for a single article page.
response = requests.get(url, timeout=30)
# Fail here on a 4xx/5xx answer instead of handing an error page to the parser,
# where the failure would only show up later as a confusing lookup error.
response.raise_for_status()

In [9]:
# Build the parse tree for the downloaded 2023 page from the raw response
# bytes, using the 'html.parser' backend. This rebinds `soup`, so the 2022
# tree is no longer reachable through that name.
soup = BeautifulSoup(response.content, 'html.parser')

In [10]:
# Find the heading that introduces the list of scores (an <h3> on the 2023
# page). `find` returns None when nothing matches, so check explicitly and
# report a clear error rather than failing with an AttributeError on None.
scores_heading_2023 = soup.find("h3", string="2023 NCAA tournament scores")
if scores_heading_2023 is None:
    raise ValueError(
        'Could not find the <h3> heading "2023 NCAA tournament scores"; '
        'the page layout may have changed.'
    )
# The scores container is the second sibling after the heading; the first
# sibling is skipped (presumably a whitespace text node -- confirm against the page).
scores_2023 = scores_heading_2023.next_sibling.next_sibling

In [11]:
def parse_matchup_2023(matchup_text):
    """Split one 2023 score line into winner and loser fields.

    The line is consumed left to right. Each step searches the remaining text
    with one regex, stores the matched value, and then drops the match plus a
    fixed number of separator characters before the next step runs. The line
    is expected to look like "No. 16 FDU 84, No. 16 Texas Southern 61". The
    losing score is also accepted when wrapped in parentheses, e.g. "(61)".

    Args:
        matchup_text: Text of a single game entry.

    Returns:
        dict with keys 'WSEED', 'WTEAM', 'WSCORE', 'LSEED', 'LTEAM', 'LSCORE';
        seeds and scores are ints, team names are strings.

    Raises:
        ValueError: If any field cannot be located in the text.
    """
    # (output column, regex applied to the remaining text,
    #  number of separator characters to skip after the match)
    steps = (
        ('WSEED', r'(?<=No. )\d{1,2}', 1),
        ('WTEAM', r'^\D+(?=\s\d{2,3})', 1),
        ('WSCORE', r'^\d{2,3}(?=,\s)', 2),
        ('LSEED', r'((?<=No. )\d{1,2})|(^\d{1,2})', 1),
        ('LTEAM', r'(^\D+(?=\s\d{2,3}))|(^\D+(?=\s\(\d{2,3}\)))', 1),
        ('LSCORE', r'(^\d{2,3})|(?<=\()\d{2,3}(?=\))', 0),
    )
    fields = {}
    remaining = matchup_text
    for column, pattern, separator_length in steps:
        match = re.search(pattern, remaining)
        if match is None:
            # Name the field and the full line so a page-format change is easy to diagnose.
            raise ValueError(f'Could not parse {column} from matchup text {matchup_text!r}')
        fields[column] = match[0]
        remaining = remaining[match.end() + separator_length:]

    for numeric_column in ('WSEED', 'WSCORE', 'LSEED', 'LSCORE'):
        fields[numeric_column] = int(fields[numeric_column])
    return fields


# Column-oriented accumulator; one entry is appended to every list per game.
results_2023 = {'YEAR': [], 'ROUND': [], 'WSEED': [], 'WTEAM': [], 'WSCORE': [], 'LSEED': [], 'LTEAM': [], 'LSCORE': []}
for round_details in scores_2023.children:
    # Bare text nodes between the round elements carry no data.
    if isinstance(round_details, NavigableString):
        continue

    # First child of a round element is its label, second is the per-location list.
    round_detail_elements = round_details.children
    round_name = next(round_detail_elements).strip()
    round_scores_by_location = next(round_detail_elements)

    for location_list_item in round_scores_by_location.children:
        if isinstance(location_list_item, NavigableString):
            continue

        location_list_sub_items = location_list_item.children
        round_location = next(location_list_sub_items).strip()
        # The semifinals appear as a "location" label inside their parent round.
        if round_location == 'National semifinals':
            round_name = round_location

        if round_name == 'National championship':
            # The championship item has no nested list: its own leading text
            # is the single matchup line.
            matchup_texts = [round_location]
        else:
            location_scores = next(location_list_sub_items)
            matchup_texts = [
                matchup.text
                for matchup in location_scores.children
                if not isinstance(matchup, NavigableString)
            ]

        for matchup_text in matchup_texts:
            # Resolve everything that can fail before appending, so the
            # column lists can never end up with different lengths.
            parsed = parse_matchup_2023(matchup_text)
            round_index = round_name_to_index[round_name]

            results_2023['YEAR'].append(2023)
            results_2023['ROUND'].append(round_index)
            for column, value in parsed.items():
                results_2023[column].append(value)

In [12]:
# Turn the dict of equal-length column lists into a DataFrame. Note that the
# name `results_2023` is rebound from the dict to the resulting frame.
results_2023 = pd.DataFrame(results_2023)
# Bare last expression so the notebook renders the frame.
results_2023

Unnamed: 0,YEAR,ROUND,WSEED,WTEAM,WSCORE,LSEED,LTEAM,LSCORE
0,2023,0,16,FDU,84,16,Texas Southern,61
1,2023,0,11,Arizona St.,98,11,Nevada,73
2,2023,0,16,Texas A&M - CC,75,16,SE Missouri St.,71
3,2023,0,11,Pittsburgh,60,11,Mississippi St.,59
4,2023,1,1,Houston,63,16,North Kentucky,52
...,...,...,...,...,...,...,...,...
62,2023,4,9,FAU,79,3,Kansas St.,76
63,2023,4,5,San Diego St.,57,6,Creighton,56
64,2023,5,5,San Diego St.,72,9,FAU,71
65,2023,5,4,UConn,72,5,Miami (FL),59


# Write Combined Results to CSV

In [13]:
# Stack the 2022 rows on top of the 2023 rows; ignore_index=True renumbers the
# combined frame from 0 instead of keeping each year's own row labels.
results_2022_2023 = pd.concat([results_2022, results_2023], ignore_index=True)
# Show the rows in reverse order for inspection. The reversed result is not
# assigned, so `results_2022_2023` keeps its original order.
results_2022_2023.iloc[::-1]

Unnamed: 0,YEAR,ROUND,WSEED,WTEAM,WSCORE,LSEED,LTEAM,LSCORE
133,2023,6,4,UConn,76,5,San Diego St.,59
132,2023,5,4,UConn,72,5,Miami (FL),59
131,2023,5,5,San Diego St.,72,9,FAU,71
130,2023,4,5,San Diego St.,57,6,Creighton,56
129,2023,4,9,FAU,79,3,Kansas St.,76
...,...,...,...,...,...,...,...,...
4,2022,1,1,Kansas,83,16,Texas Southern,56
3,2022,0,16,Texas Southern,76,16,Tex. A&M CC,67
2,2022,0,11,Notre Dame,89,11,Rutgers,87
1,2022,0,16,Wright State,93,16,Bryant,82


In [36]:
# The project root is taken to be the parent of the current working directory.
PROJ_PATH = Path.cwd().parent
# Folder under the project root where the output file is written.
DATA_PATH = PROJ_PATH.joinpath('Data')

In [None]:
# to_csv does not create missing folders, so make sure the output folder exists
# first (it may be absent on a fresh checkout). exist_ok=True makes re-running
# this cell harmless.
DATA_PATH.mkdir(exist_ok=True)
# index=False leaves the positional row index out of the written file.
results_2022_2023.to_csv(DATA_PATH / 'Tournament_Game_Results_22_23.csv', index=False)