In [35]:
import requests
from bs4 import BeautifulSoup as bs
import re
import pandas as pd
import numpy as np
from unidecode import unidecode

In [36]:
# Verbal Commits is used as the NCAA 2022 transfer portal database
url = "https://www.verbalcommits.com/transfers/2022"
# Bound the wait so a stalled connection cannot hang the notebook
r = requests.get(url, timeout=30)
# Stop here on an HTTP error instead of parsing an error page further down
r.raise_for_status()
webpage = bs(r.content, features="html.parser")

In [37]:
# attrs is a dict of attribute name -> required value
table = webpage.find("table", attrs={"class": "table full table-hover tablesorter"})
if table is None:
    # Without this guard the next line fails with an opaque AttributeError on None
    raise ValueError("transfer table not found on the page; the site markup may have changed")
# Column labels are the text of the table's <th> cells
headers = [header.getText() for header in table.find_all("th")]

In [38]:
# The first <tr> holds the headers, so keep only the rows after it
table_rows = table.find_all("tr")
raw_data = table_rows[1:]

In [39]:
# Build one list of cell strings per player row
data = [[cell.get_text() for cell in row.find_all("td")] for row in raw_data]
data

[['2',
  'SG',
  'Abee FletcherFletcher Abee',
  'SO',
  '756-3',
  '180',
  '',
  '',
  'The Citadel',
  'UNC Asheville',
  'Justin Byerly, HoopSeen'],
 ['2',
  'PG',
  'Abraham HenryHenry Abraham',
  'FR',
  '726-0',
  '175',
  '',
  '',
  'Eastern Illinois',
  '',
  ''],
 ['2',
  'PF',
  'Acliese III LintonLinton Acliese III',
  'RS SR',
  '786-6',
  '235',
  'Yes',
  '',
  'San Francisco State',
  'Eastern Washington',
  ''],
 ['2',
  'SG',
  'Acree KaniKani Acree',
  'RS SO',
  '786-6',
  '185',
  '',
  '',
  'Ball State',
  '',
  ''],
 ['2',
  'PF',
  'Acunzo MattiaMattia Acunzo',
  'RS FR',
  '806-8',
  '225',
  '',
  '',
  'Robert Morris',
  'Youngstown State',
  'Emiliano Carchia, Sportando'],
 ['2',
  'PG',
  "Ali Fah'mirFah'mir Ali",
  'FR',
  '705-10',
  '180',
  '',
  '',
  'College of Charleston',
  'Delaware State',
  'Tobias Bass, NXTPRO'],
 ['2',
  'SG',
  'Begovich DanielDaniel Begovich',
  'SR',
  '776-5',
  '205',
  '',
  '',
  'Stanford',
  '',
  ''],
 ['2',
  'C',

In [40]:
# One row per player; column labels come from the table's <th> cells
df = pd.DataFrame(data=data, columns=headers)

In [41]:
def to_srcbb(name):
    """Turn a scraped 'LAST FIRSTFIRST LAST' name into an SRCBB URL slug.

    The scraped cell holds the name twice ("Last First" followed directly by
    "First Last"), so the second half of the string is the display name.

    Steps: keep the second half, transliterate to ASCII, lowercase, drop
    periods/commas/apostrophes, glue a trailing generational suffix
    (jr, sr, ii, iii, iv) onto the preceding word, then hyphenate spaces.

    Args:
        name: Raw text of the Name cell.

    Returns:
        The slug, e.g. "Abee FletcherFletcher Abee" -> "fletcher-abee" and
        "Acliese III LintonLinton Acliese III" -> "linton-aclieseiii".
    """
    display_name = name[len(name) // 2:]
    # Transliterate first so non-ASCII punctuation that maps to . , or ' is
    # stripped by the substitution below as well
    slug = unidecode(display_name).lower()
    slug = re.sub(r"[.,']", "", slug)

    # Raw string: "\s" and "\Z" are regex escapes, not Python string escapes
    suffix = re.search(r"\s(jr|sr|ii|iii|iv)\Z", slug)
    if suffix:
        # Remove only the whitespace in front of the suffix
        slug = slug[:suffix.start()] + slug[suffix.start() + 1:]

    return slug.replace(" ", "-")
# Slugs are derived from the raw doubled names, so compute them before the
# Name column is overwritten below
srcbb_name = df['Name'].map(to_srcbb)
# Keep only the second half ("First Last") as the display name.
# NOTE: this overwrites the column in place, so re-running this cell on an
# already-trimmed frame would halve the names again
df['Name'] = df['Name'].map(lambda name: name[len(name) // 2:])

In [42]:
# Keep only players with no new school listed yet, then drop the
# New School and Source columns
transfers = (
    df.loc[df['New School'] == ""]
    .drop(columns=['New School', 'Source'])
    .copy()
)

# Ht is the height in inches immediately followed by feet-inches (e.g. '756-3');
# everything before the single feet digit that precedes the dash is the inch count
transfers['Ht'] = transfers['Ht'].map(lambda ht: ht[:ht.index("-") - 1])
# Place the SRCBB slug right after Name; the Series is matched on the row index
transfers.insert(3, "SRCBB Name", srcbb_name)

In [43]:
# Show the players that have no new school listed, including their SRCBB slug
transfers

Unnamed: 0,Stars,Position,Name,SRCBB Name,Class,Ht,Wt,Immediately Eligible,January Eligible,Previous School
1,2,PG,Henry Abraham,henry-abraham,FR,72,175,,,Eastern Illinois
3,2,SG,Kani Acree,kani-acree,RS SO,78,185,,,Ball State
6,2,SG,Daniel Begovich,daniel-begovich,SR,77,205,,,Stanford
8,2,SG,Zion Bethea,zion-bethea,FR,75,205,,,Hofstra
9,2,PG,Troy Boynton,troy-boynton,FR,76,175,,,Evansville
...,...,...,...,...,...,...,...,...,...,...
145,2,PG,Shaun Williams,shaun-williams,RS SO,75,175,Yes,,Cal State Bakersfield
146,2,SF,Sai Witt,sai-witt,JR,80,230,,,Texas–Rio Grande Valley
147,4,PG,Elijah Wood,elijah-wood,FR,77,175,,,Eastern Illinois
148,2,PG,Ryan Zambie,ryan-zambie,FR,75,195,,,Lafayette


In [63]:
# Plain list of SRCBB slugs, in frame order, to look up one by one
players = transfers["SRCBB Name"].to_list()

98

In [83]:
# Stat columns (data-stat ids) to pull from each stats table
m_per_game = ["g", "gs", "mp_per_g", "fg_pct", "pts_per_g", "sos"]
m_per_hundred = ["off_rtg", "def_rtg"]
m_advanced = ["per", "ts_pct", "efg_pct", "usg_pct", "ows", "dws"]

# Maps each stats table's HTML id to the stat columns wanted from it
tables = dict(
    players_per_game=m_per_game,
    players_per_poss=m_per_hundred,
    players_advanced=m_advanced,
)

# Every requested stat, in the same order the tables are visited
metrics = [*m_per_game, *m_per_hundred, *m_advanced]


def metrics_len():
    """Return how many stat columns are collected per player."""
    return len(metrics)
def agg(name, uniqueness=1, max_uniqueness=25):
    """Scrape the latest-season stats for one player from Sports Reference.

    Requests ``.../cbb/players/<name>-<uniqueness>.html``. When that page has
    tables but lacks one of the wanted stats tables, the next ``uniqueness``
    number is tried instead.

    Args:
        name: SRCBB slug, e.g. "henry-abraham".
        uniqueness: Numeric page suffix to start from.
        max_uniqueness: Highest suffix to try before giving up, so a site
            that keeps answering with unrelated tables cannot cause endless
            requests.

    Returns:
        A list with one entry per stat in ``tables`` (table order, then the
        order of each table's stat list). Entries are the cell text, or None
        for a stat missing from the row. A list of ``metrics_len()`` Nones is
        returned when no usable page is found.
    """
    blank = [None] * metrics_len()

    for suffix in range(uniqueness, max_uniqueness + 1):
        srcbb = "https://www.sports-reference.com/cbb/players/" + name + "-" + str(suffix) + ".html"
        # Bound the wait so one stalled request cannot hang the whole scrape
        r_player = requests.get(srcbb, timeout=30)
        player = bs(r_player.content, features="html.parser")

        # A page without any table is treated as "player not found"
        if player.find("table") is None:
            return blank

        stats = []
        for table_id, wanted in tables.items():
            desired = player.find("table", attrs={"class": "stats_table sortable row_summable", "id": table_id})
            if desired is None:
                # A wanted table is missing on this page: move on to the next suffix
                break
            body = desired.find("tbody")
            rows = body.find_all("tr") if body is not None else []
            # The last body row is taken as the most recent season
            latest = rows[-1].find_all("td") if rows else []
            # Look stats up by id so the output always follows the order of
            # `wanted` and stays the same length even if a column is absent
            by_stat = {cell.get("data-stat"): cell.text for cell in latest}
            stats += [by_stat.get(stat) for stat in wanted]
        else:
            # Every wanted table was present on this page
            return stats

    return blank

# Scrape every player in turn (at least one HTTP request each)
data = list(map(agg, players))

# One row per player slug, one column per collected stat
prospects = pd.DataFrame(data=data, index=players, columns=metrics)

In [84]:
# Latest-season stats per player slug; rows that are entirely empty come from
# agg() returning its all-None fallback
prospects

Unnamed: 0,g,gs,mp_per_g,fg_pct,pts_per_g,sos,off_rtg,def_rtg,per,ts_pct,efg_pct,usg_pct,ows,dws
henry-abraham,16,16,33.8,.372,6.3,-3.40,87.1,111.4,6.6,.509,.505,13.0,-0.1,0.2
kani-acree,8,0,12.5,.217,2.6,-3.59,95.5,109.5,8.8,.399,.326,15.8,0.0,0.1
daniel-begovich,5,1,2.0,.000,0.0,8.35,0.0,95.5,-12.3,.000,.000,20.3,-0.1,0.0
zion-bethea,3,0,3.0,.000,0.0,-2.94,0.0,117.1,-26.3,.000,.000,22.4,-0.1,0.0
troy-boynton,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
shaun-williams,11,6,17.5,.350,6.6,-1.20,87.4,97.7,12.4,.433,.419,27.2,-0.1,0.3
sai-witt,,,,,,,,,,,,,,
elijah-wood,5,2,12.4,.222,2.6,-3.40,79.5,103.2,8.1,.319,.250,17.7,-0.1,0.1
ryan-zambie,8,0,1.5,.000,0.0,-5.93,44.6,96.2,3.4,.000,.000,17.9,-0.1,0.0
