In [1]:
"""
Python notebook to extract data from Torvik NCAA mens basketball. 

URL:barttorvik.com/

"""

'\nPython notebook to extract data from Torvik NCAA mens basketball. \n\nURL:barttorvik.com/\n\n'

In [6]:
import numpy as np
import re
import pandas as pd
import requests
from bs4 import BeautifulSoup

*Function to get table data with an input venue*

To use, these need to be in the form "barttorvik.com/?venue={value}#"
https://www.barttorvik.com/trank.php?year=2023&sort=&hteam=&t2value=&conlimit=All&state=All&begin=20221101&end=20230516&top=0&revquad=0&quad=5&venue=All&type=All&mingames=0#

In [3]:
def get_torvik_data(venue,year,enddate):
    # Replace this URL with the actual one
    startyear = year-1
    url = f"https://barttorvik.com/?venue={venue}&year={year}&begin={startyear}1101&end={enddate}#"  
    print(url)
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to fetch data for venue: {venue} and year: {year}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table')  # Adjust based on the table's class or id
    if table is None:
        print(f"No table found for venue: {venue}")
        return None

    # Extract all header rows (<thead>)
    header_rows = table.find('thead').find_all('tr', class_=lambda x: x != 'toprow')
        
    # Merge headers row-by-row
    headers = []
    for header_row in header_rows:
        current_row = [th.text.strip() for th in header_row.find_all('th', class_=lambda x: x != 'toprow')]
            # Extend headers to align with multi-row structure
        if len(headers) > 0:
            headers = [f"{h} | {c}" if h else c for h, c in zip(headers, current_row + [""] * (len(headers) - len(current_row)))]
        else:
            headers = current_row

    # Extract table body rows (<tbody>)
    body_rows = table.find('tbody').find_all('tr')
    data = []
    for body_row in body_rows:
        row_data = [td.text.strip() for td in body_row.find_all('td')]
    # Skip empty rows (rows that don't have any data)
        if not any(row_data):  # If the row is empty, skip it
            continue
        # Find the index of the "Team" column
        team_index = headers.index("Team")
        team_name = row_data[team_index]
        
        # Use regex to clean the team name: remove text after "vs."
        team_name = re.sub(r'(\s+vs\..*)', '', team_name).strip()  # Remove " vs." and anything after it

        team_name = re.sub(r'(\s*\((H|A)\)\s*.*)', '', team_name).strip()  # Remove text after "(H)" or "(A)"


        row_data[team_index] = team_name  # Update the team name in the row
        
        data.append(row_data)

    # Create DataFrame
    df = pd.DataFrame(data, columns=headers)

    # Ensure 'Team' column exists
    if "Team" not in df.columns:
        print("No 'Team' column found in the data.")
        return None

    # Convert 'Team' column to strings and handle missing data
    df['Team'] = df['Team'].astype(str).fillna("")

    # Extract and clean team names
    df['Team'] = (
        df['Team']
        .str.extract(r'([A-Za-z\s.&]+)'))  # Extract valid team names
    #     .fillna("")                      # Handle cases where regex extraction fails
    #     .str.strip()                     # Remove extra spaces
    #     .str.title()                     # Standardize capitalization
    # )
    return df

In [4]:
#dictionary to hold dates for selection sunday

from datetime import date
today = date.today()
today = today.strftime("%Y%m%d")  # Format the date without hyphen
#these are actually the monday dates in format YYYYMMDD
selection_sunday_dates = {2015: 20150316, 
                          2016: 20160314, 
                          2017: 20170313,
                          2018: 20180312,
                          2019: 20190318, 
                          2020: 20200316, 
                          2021: 20210315, 
                          2022: 20220314, 
                          2023: 20230313, 
                          2024: 20240314,
                          2025: today}


In [7]:
#What years are we looking for:
myseasons = list(range(2015,2025+1)) #must use +1 to make sure 25 is included

#Getting dataframes for torvik data
tvk_H_dict = {}
tvk_A_N_dict = {}
for season in myseasons:
    #PULL DATA
    tvk_data_H = get_torvik_data('H', year = season, enddate = selection_sunday_dates[season] ) 
    tvk_data_A_N = get_torvik_data('A-N', year = season, enddate = selection_sunday_dates[season] ) 

    #Append to dictionaries
    tvk_H_dict[str(season)] = tvk_data_H
    tvk_A_N_dict[str(season)] = tvk_data_A_N

https://barttorvik.com/?venue=H&year=2015&begin=20141101&end=20150316#
https://barttorvik.com/?venue=A-N&year=2015&begin=20141101&end=20150316#
https://barttorvik.com/?venue=H&year=2016&begin=20151101&end=20160314#
https://barttorvik.com/?venue=A-N&year=2016&begin=20151101&end=20160314#
https://barttorvik.com/?venue=H&year=2017&begin=20161101&end=20170313#
https://barttorvik.com/?venue=A-N&year=2017&begin=20161101&end=20170313#
https://barttorvik.com/?venue=H&year=2018&begin=20171101&end=20180312#
https://barttorvik.com/?venue=A-N&year=2018&begin=20171101&end=20180312#
https://barttorvik.com/?venue=H&year=2019&begin=20181101&end=20190318#
https://barttorvik.com/?venue=A-N&year=2019&begin=20181101&end=20190318#
https://barttorvik.com/?venue=H&year=2020&begin=20191101&end=20200316#
https://barttorvik.com/?venue=A-N&year=2020&begin=20191101&end=20200316#
https://barttorvik.com/?venue=H&year=2021&begin=20201101&end=20210315#
https://barttorvik.com/?venue=A-N&year=2021&begin=20201101&end=20

In [None]:
#tvk_venue_H = get_torvik_data('H',year=2018, enddate = selection_sunday_dates[2018])
#tvk_venue_A_N = get_torvik_data('A-N',myyear)

https://barttorvik.com/?venue=H&year=2018&begin=20171101&end=20180312#


In [15]:
tvk_H_dict['2021']

Unnamed: 0,Rk,Team,Conf,G,Rec,AdjOE,AdjDE,Barthag,EFG%,EFGD%,...,FTR,FTRD,2P%,2P%D,3P%,3P%D,3PR,3PRD,Adj T.,WAB
0,1,Gonzaga,WCC,12,12–08–0,124.03,88.211,.98041,60.46,46.582,...,39.146,25.362,63.63,46.194,35.9113,31.6113,33.0280,31.845,74.713,+2.019
1,2,Houston,Amer,14,14–09–0,118.49,85.22,.97782,51.4152,41.46,...,31.5190,43.9334,49.5219,42.014,36.0110,27.024,42.553,36.6157,66.6281,+2.810
2,3,Baylor,B12,11,11–06–0,125.22,92.449,.97053,59.410,47.5117,...,28.8252,28.7125,53.788,45.065,45.42,34.8228,39.7124,34.295,70.6107,+2.616
3,4,Colorado,P12,12,11–19–1,119.97,89.122,.96834,56.630,45.549,...,28.7255,27.7103,51.9126,42.219,43.35,34.9232,35.8229,32.151,68.4212,+3.16
4,5,Michigan,B10,14,13–18–1,116.612,87.97,.96295,56.434,42.18,...,28.9250,24.348,54.662,40.36,40.225,30.685,32.1296,33.170,68.4212,+3.91
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
340,341,Idaho,BSky,10,1–91–9,92.2304,115.1338,.0719341,49.2250,56.6340,...,30.8211,29.1131,48.6253,53.2290,33.7205,41.6340,31.5309,36.7159,66.1300,-8.2341
341,342,South Carolina St.,MEAC,7,1–61–4,88.8332,111.0313,.0712342,47.4291,50.1206,...,30.4216,38.0305,43.2336,55.6325,36.598,24.18,36.5208,28.010,72.058,-5.8321
342,343,Alabama St.,SWAC,9,3–63–6,82.1344,107.8279,.0421343,42.2344,46.997,...,40.334,35.8278,40.5343,47.0119,30.4300,31.2100,32.9285,30.632,69.7149,-5.7319
343,344,William & Mary,CAA,4,1–31–2,86.4340,114.5336,.0380344,39.9345,56.3337,...,27.8274,17.02,43.4334,57.4339,22.8345,36.0270,38.8144,32.662,68.1226,-2.7198


In [27]:
tvk_A_N_dict['2025']

Unnamed: 0,Rk,Team,Conf,G,Rec,AdjOE,AdjDE,Barthag,EFG%,EFGD%,...,FTR,FTRD,2P%,2P%D,3P%,3P%D,3PR,3PRD,Adj T.,WAB
0,1,Auburn,SEC,10,9–13–0,137.41,96.845,.98251,55.319,49.580,...,26.9293,39.2255,54.640,46.525,37.636,37.7311,40.3137,29.24,67.2244,+5.81
1,2,Duke,ACC,7,5–24–0,122.014,89.96,.97122,55.416,46.418,...,28.4254,32.4114,59.16,43.47,33.7130,34.2177,43.577,37.4133,65.9295,+2.210
2,3,Illinois,B10,8,5–33–2,124.63,94.019,.96263,53.143,47.735,...,30.6180,32.3109,54.738,49.368,34.1115,29.225,43.675,29.24,72.921,+2.016
3,4,Tennessee,SEC,8,6–21–2,115.642,88.43,.95634,50.0138,43.04,...,35.187,27.537,46.0277,44.911,36.848,27.19,43.873,43.6293,64.3342,+2.85
4,5,Saint Mary,WCC,8,6–23–0,119.322,91.411,.95525,51.877,42.83,...,26.2304,29.259,47.3228,43.88,40.75,27.08,32.4311,28.73,63.0360,+0.832
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
359,360,Northern Illinois,MAC,9,0–90–2,90.8355,116.3341,.0550360,42.9346,59.5359,...,31.2166,39.4257,43.0345,57.1310,28.5323,42.2362,48.224,39.2184,71.938,-6.8330
360,361,Arkansas Pine Bluff,SWAC,13,0–130–3,99.5277,128.2364,.0516361,51.0100,62.6363,...,30.1197,41.6292,55.430,66.4364,29.5291,38.8335,39.5163,46.9340,74.57,-8.9362
361,362,Florida A&M,SWAC,10,0–100–2,95.2333,125.0362,.0420362,45.1317,57.9344,...,33.6119,38.7244,43.6337,58.2328,31.2242,38.3330,46.148,39.6192,70.0103,-4.7269
362,363,Coppin St.,MEAC,13,0–130–2,88.2359,117.4349,.0361363,41.1357,60.8360,...,32.7142,36.0188,41.4355,62.6359,26.9346,39.1346,29.3350,45.3320,68.9154,-8.7360
