In [1]:
import requests
from bs4 import BeautifulSoup as bs
import re
import pandas as pd
import numpy as np
from unidecode import unidecode

In [2]:
# Verbal Commits serves as the database of the NCAA 2022 transfer portal.
url = "https://www.verbalcommits.com/transfers/2022"
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here with a clear HTTP error instead of parsing an
# error page and failing later when the table lookup returns None.
r = requests.get(url, timeout=30)
r.raise_for_status()
webpage = bs(r.content, features="html.parser")

In [3]:
# Locate the transfers table by its class attribute. `attrs` has to be a dict
# mapping attribute name -> value; the previous set literal {"class", "..."}
# was interpreted as "class is either of these two strings", which only
# matched the right table by accident.
table = webpage.find("table", attrs={"class": "table full table-hover tablesorter"})
# Column titles come from the <th> cells of that table.
headers = [header.getText() for header in table.find_all("th")]

In [4]:
# The first <tr> of the table is the header row, so skip it and keep the rest.
raw_data = table.find_all("tr")[1:]

In [5]:
# Build one list per table row holding the text of each <td> cell in that row.
data = [
    [cell.get_text() for cell in row.find_all("td")]
    for row in raw_data
]

In [6]:
# Tabulate the scraped rows, labelling the columns with the scraped <th> titles.
df = pd.DataFrame(data, columns=headers)

In [7]:
# Converts a scraped name cell of the form "LAST FIRSTFIRST LAST" into the
# lowercase, hyphenated "first-last" slug used in SRCBB player URLs.
def to_srcbb(name):
    """Return the SRCBB URL slug for a scraped name cell.

    Steps, in order:
      1. keep the second half of the cell (the cell holds the name twice,
         so the second half is the "First Last" form);
      2. transliterate to ASCII and lowercase;
      3. drop periods, commas and apostrophes;
      4. if the name ends in jr/sr/ii/iii/iv, remove the space before that
         suffix so it is joined onto the surname;
      5. replace the remaining spaces with hyphens.

    NOTE(review): the original comment said the suffix is *eliminated*, but the
    code has always kept it glued to the surname ("john smith jr" ->
    "john-smithjr"). That behaviour is preserved here -- confirm against the
    actual SRCBB URL scheme.
    """
    # Transliterate before stripping punctuation so that typographic
    # apostrophes (which unidecode turns into "'") are removed as well.
    slug = unidecode(name[len(name) // 2:]).lower()
    slug = re.sub(r"[.,']", '', slug)

    # Raw string: "\s" and "\Z" are invalid escapes in a normal string literal.
    suffix = re.search(r"\s(jr|sr|ii|iii|iv)\Z", slug)
    if suffix:
        # Cut out only the whitespace character that precedes the suffix.
        slug = slug[:suffix.start()] + slug[suffix.start() + 1:]

    return slug.replace(' ', "-")
# Order matters: to_srcbb expects the raw doubled cell text, which the second
# statement overwrites with just its second half. For the same reason this cell
# is not re-runnable on its own -- running it twice halves the names again.
srcbb_name = df['Name'].map(to_srcbb)
df['Name'] = df['Name'].map(lambda doubled: doubled[len(doubled) // 2:])
#df['Ht'] = df['Ht'].apply(lambda ht: ht[ht.index("-") - 1:])

In [8]:
# Keep only the players with no new school listed (i.e. still available), and
# drop the two columns that are no longer informative after that filter.
transfers = df.loc[df['New School'] == ""].copy().drop(columns=['New School', 'Source'])


def _leading_height(ht):
    """Return the part of a height cell that ends one character before its first "-".

    Presumably the cell is total inches followed by feet-inches (e.g. "726-0"
    -> "72") -- TODO confirm against the site. A cell without any "-" (for
    example a blank height) is returned unchanged instead of raising
    ValueError and aborting the whole cell.
    """
    dash = ht.find("-")
    if dash == -1:
        return ht
    return ht[:dash - 1]


# Transfer frame formatting.
transfers['Ht'] = transfers['Ht'].apply(_leading_height)
# srcbb_name is indexed like df, so the insert aligns on the surviving rows.
transfers.insert(3, "SRCBB Name", srcbb_name)

In [9]:
# Unused alternative projection (would keep only Position, SRCBB Name and Ht), left disabled:
#data = transfers.copy().drop(['Stars', 'Name', 'Class', 'Wt', 'Immediately Eligible', 'January Eligible', 'Previous School'], axis=1)
# Display the filtered transfer table as this cell's output.
transfers

Unnamed: 0,Stars,Position,Name,SRCBB Name,Class,Ht,Wt,Immediately Eligible,January Eligible,Previous School
1,2,PG,Henry Abraham,henry-abraham,FR,72,175,,,Eastern Illinois
3,2,SG,Kani Acree,kani-acree,RS SO,78,185,,,Ball State
6,2,SG,Daniel Begovich,daniel-begovich,SR,77,205,,,Stanford
8,2,SG,Zion Bethea,zion-bethea,FR,75,205,,,Hofstra
9,2,PG,Troy Boynton,troy-boynton,FR,76,175,,,Evansville
...,...,...,...,...,...,...,...,...,...,...
145,2,PG,Shaun Williams,shaun-williams,RS SO,75,175,Yes,,Cal State Bakersfield
146,2,SF,Sai Witt,sai-witt,JR,80,230,,,Texas–Rio Grande Valley
147,4,PG,Elijah Wood,elijah-wood,FR,77,175,,,Eastern Illinois
148,2,PG,Ryan Zambie,ryan-zambie,FR,75,195,,,Lafayette


In [10]:
# Plain Python list of the SRCBB slugs, one per remaining transfer.
players = transfers["SRCBB Name"].to_list()

In [15]:
# data-stat identifiers of the columns to pull from each player page,
# grouped by the stats table they are read from.
m_per_game = [
    "g", "gs", "mp_per_g", "fg_per_g", "fta_per_g", "trb_per_g",
    "ast_per_g", "stl_per_g", "blk_per_g", "tov_per_g", "pts_per_g", "sos",
]
m_per_hundred = ["off_rtg", "def_rtg"]
m_advanced = [
    "per", "ts_pct", "efg_pct", "pprod", "trb_pct", "ast_pct",
    "stl_pct", "blk_pct", "tov_pct", "usg_pct", "ows", "dws",
]
# Full column order of the final frame: per-game, then per-100, then advanced.
metrics = [*m_per_game, *m_per_hundred, *m_advanced]

def metrics_len():
    """Return how many metrics are collected per player.

    The module-level ``metrics`` list is looked up when this is called, so the
    count is available even inside a function that uses ``metrics`` as one of
    its own local names.
    """
    total = len(metrics)
    return total

# Maps each SRCBB stats-table id to the data-stat names wanted from that table.
tables = {"players_per_game": m_per_game, "players_per_poss": m_per_hundred, "players_advanced": m_advanced}
def agg(name, uniqueness=1, timeout=30):
    """Scrape one player's most recent season stats from SRCBB.

    Parameters
    ----------
    name : str
        Player slug as used in the SRCBB URL (e.g. "henry-abraham").
    uniqueness : int, default 1
        Numeric suffix SRCBB appends to the slug. When a page exists but lacks
        one of the wanted tables (or that table has no rows), the next suffix
        is tried.
    timeout : float, default 30
        Seconds to wait for each HTTP request before requests raises.

    Returns
    -------
    list
        One entry per metric, ordered table by table as in ``tables``. Entries
        are the raw cell strings from the last row of each table; a metric
        whose cell is absent from that row is ``np.nan``. If a fetched page
        contains no table at all, a list of ``np.nan`` of length
        ``metrics_len()`` is returned so the caller can drop the player.
    """
    while True:
        srcbb = "https://www.sports-reference.com/cbb/players/" + name + "-" + str(uniqueness) + ".html"
        r_player = requests.get(srcbb, timeout=timeout)
        player = bs(r_player.content, features="html.parser")

        # No table anywhere on the page: treat the player as having no data.
        if player.find("table") is None:
            # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
            return [np.nan] * metrics_len()

        data = []
        for table_id, wanted in tables.items():
            desired = player.find("table", attrs={"class": "stats_table sortable row_summable", "id": table_id})
            body = desired.find("tbody") if desired is not None else None
            rows = body.find_all("tr") if body is not None else []
            if not rows:
                # Table missing or empty on this page: move on to the next suffix.
                break
            # Index the last row's cells by data-stat and read them in the order
            # of `wanted`, so the result always lines up with the metric list no
            # matter how the page orders its columns or whether a cell is missing.
            latest = {cell.get("data-stat"): cell.text for cell in rows[-1].find_all("td")}
            data += [latest.get(stat, np.nan) for stat in wanted]
        else:
            # Every table was found and read.
            return data

        uniqueness += 1

# Scrape every player (one HTTP round trip or more each), then assemble the
# stats frame: drop players with no data, treat blank cells as 0, and convert
# everything to float.
data = list(map(agg, players))
prospects = (
    pd.DataFrame(data, columns=metrics, index=players)
    .dropna()
    .replace(r'^\s*$', 0, regex=True)
    .astype(float)
)

In [16]:
# Display the per-player stats frame (indexed by SRCBB slug) as this cell's output.
prospects

Unnamed: 0,g,gs,mp_per_g,fg_per_g,fta_per_g,trb_per_g,ast_per_g,stl_per_g,blk_per_g,tov_per_g,...,efg_pct,pprod,trb_pct,ast_pct,stl_pct,blk_pct,tov_pct,usg_pct,ows,dws
henry-abraham,16.0,16.0,33.8,2.2,0.7,2.3,2.7,0.9,0.1,2.0,...,0.505,105.0,4.1,18.0,1.7,0.2,24.4,13.0,-0.1,0.2
kani-acree,8.0,0.0,12.5,0.6,0.9,2.4,1.4,0.4,0.0,0.8,...,0.326,29.0,10.7,18.5,1.7,0.0,18.6,15.8,0.0,0.1
daniel-begovich,5.0,1.0,2.0,0.0,0.0,0.2,0.0,0.2,0.0,0.2,...,0.000,0.0,6.0,0.0,6.0,0.0,25.0,20.3,-0.1,0.0
zion-bethea,3.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.3,...,0.000,0.0,0.0,0.0,0.0,0.0,25.0,22.4,-0.1,0.0
amir-britt,3.0,0.0,3.7,0.3,0.3,0.3,0.0,0.3,0.0,0.0,...,0.500,3.0,5.1,0.0,5.4,0.0,0.0,11.3,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
jelani-williams,23.0,23.0,26.6,2.3,1.7,3.9,2.1,1.1,0.2,1.6,...,0.455,162.0,8.3,14.0,2.4,0.9,19.3,15.5,0.3,0.5
shaun-williams,11.0,6.0,17.5,2.5,0.8,2.5,1.9,1.1,0.1,1.6,...,0.419,74.0,8.7,23.5,3.7,0.7,17.6,27.2,-0.1,0.3
elijah-wood,5.0,2.0,12.4,0.8,1.0,1.0,0.8,1.0,0.0,0.0,...,0.250,14.0,4.8,14.6,4.8,0.0,0.0,17.7,-0.1,0.1
ryan-zambie,8.0,0.0,1.5,0.0,0.3,0.3,0.1,0.3,0.0,0.1,...,0.000,2.0,9.8,13.4,10.1,0.0,25.3,17.9,-0.1,0.0


In [20]:
# Persist the stats frame (slug index included) next to the notebook.
prospects.to_csv(path_or_buf=r'transfers.csv')