In [1]:
import requests
from bs4 import BeautifulSoup as bs
import re
import pandas as pd
import numpy as np
from unidecode import unidecode
import datetime

In [2]:
# Verbal Commits is used as the source for the NCAA 2022 transfer-portal listing.
url = "https://www.verbalcommits.com/transfers/2022"
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
r = requests.get(url, timeout=30)
r.raise_for_status()
webpage = bs(r.content, features="html.parser")

In [3]:
# `attrs` must be a dict of attribute name -> value. The previous set literal
# ({"class", "..."}) was a typo for a dict and matched only through
# BeautifulSoup's fallback handling of non-dict `attrs` values.
table = webpage.find("table", attrs={"class": "table full table-hover tablesorter"})
if table is None:
    # Fail with a clear message rather than an AttributeError on the next line.
    raise ValueError("transfer table not found on " + url)
# Column labels are the text of every <th> cell in the table.
headers = [header.getText() for header in table.find_all("th")]

In [4]:
# The first <tr> (index 0) holds the headers, so keep only the rows after it.
raw_data = table.find_all("tr")[1:]

In [5]:
# Build one list per table row containing the text of each of its <td> cells.
data = [
    [cell.get_text() for cell in row.find_all("td")]
    for row in raw_data
]

In [6]:
# One DataFrame row per scraped table row; columns are labelled with the <th> texts in `headers`.
df = pd.DataFrame(data, columns=headers)

In [7]:
# Converts a scraped "LAST FIRSTFIRST LAST" name into the "first-last" slug used in
# Sports Reference CBB (SRCBB) player URLs: transliterates accented letters, strips
# punctuation, glues a trailing suffix onto the surname and replaces spaces with "-".
def to_srcbb(name):
    """Return the SRCBB URL slug for a scraped name, or None if no page is found.

    Parameters
    ----------
    name : str
        Name text in the "LAST FIRSTFIRST LAST" form; only the second half of
        the string is used.

    Returns
    -------
    str or None
        The slug (e.g. "henry-abraham") when
        https://www.sports-reference.com/cbb/players/<slug>-1.html does not
        show the "Page Not Found" heading; otherwise the slug is printed and
        None is returned.

    Notes
    -----
    Performs one HTTP request per call.
    NOTE(review): taking the second half of the string assumes both halves
    have the same length; names where they differ come out truncated.
    """
    # Keep the second half of the doubled name text.
    display_name = name[len(name) // 2:]

    # Transliterate BEFORE stripping punctuation so that typographic
    # apostrophes and similar characters are reduced to ASCII first and then
    # removed, instead of reappearing in the slug afterwards.
    slug = re.sub(r"[.,']", '', unidecode(display_name).lower())

    # Drop only the whitespace in front of a trailing generational suffix,
    # i.e. "anderson ii" becomes "andersonii" (the suffix itself is kept).
    suffix = re.search(r"\s(jr|sr|ii|iii|iv)\Z", slug)
    if suffix:
        slug = slug[:suffix.start()] + slug[suffix.start() + 1:]

    slug = slug.replace(' ', "-")

    srcbb = "https://www.sports-reference.com/cbb/players/" + slug + "-1.html"
    r_player = requests.get(srcbb, timeout=30)
    player = bs(r_player.content, features="html.parser")

    # A page without any <h1> is treated like a missing page rather than
    # raising AttributeError on `.text`.
    heading = player.find("h1")
    if heading is None or heading.text == 'Page Not Found (404 error)':
        print(slug)
        return None
    return slug
# Resolve an SRCBB slug for every scraped name (to_srcbb issues one request per row).
srcbb_name = df['Name'].apply(to_srcbb)
# Replace the doubled name text with its second half only.
# NOTE(review): this overwrites df['Name'], so re-running the cell halves the names again.
df['Name'] = df['Name'].map(lambda full_name: full_name[len(full_name) // 2:])
#df['Ht'] = df['Ht'].apply(lambda ht: ht[ht.index("-") - 1:])

travis-andersonii
ty-brewer
cameron-burrell
kale-catchings
tyler-chisom
junior-clay
marvin-colemanii
ej-dambreville
cj-gettelfinger
matt-gray
ohn-dj-mitchell
pape-momar-cisse
javian-mosley
ed-oliver-hampton
dylan-o'hearn
quay-primas
joe-reece
joey-st-pierre
micheal-tatejr
faite-williams


In [9]:
# Keep only the rows whose 'New School' is empty (players without a new school yet),
# then drop the two columns that are no longer needed.
transfers = (
    df[df['New School'].eq("")]
    .drop(columns=['New School', 'Source'])
    .copy()
)

# Formatting of the transfers frame: keep the part of 'Ht' that ends one
# character before the "-", and place the SRCBB slugs as the fourth column.
transfers['Ht'] = transfers['Ht'].map(lambda height: height[:height.index("-") - 1])
transfers.insert(loc=3, column="SRCBB Name", value=srcbb_name)

In [10]:
# Display the formatted transfers table. The line below is disabled code that would
# drop the descriptive columns; it is kept commented out.
#data = transfers.copy().drop(['Stars', 'Name', 'Class', 'Wt', 'Immediately Eligible', 'January Eligible', 'Previous School'], axis=1)
transfers

Unnamed: 0,Stars,Position,Name,SRCBB Name,Class,Ht,Wt,Immediately Eligible,January Eligible,Previous School
1,2,PG,Henry Abraham,henry-abraham,FR,72,175,,,Eastern Illinois
3,2,SG,Kani Acree,kani-acree,RS SO,78,185,,,Ball State
6,2,SF,Ivan Alipiev,ivan-alipiev,JR,80,215,Yes,,Loyola Marymount
7,2,PF,Barlow Alleruzzo IV,barlow-alleruzzoiv,JR,81,235,,,Eastern Illinois
8,2,PG,Travis Anderson II,,RS SO,72,175,Yes,,Charleston Southern
...,...,...,...,...,...,...,...,...,...,...
287,3,SG,Zion Young,zion-young,JR,75,220,Yes,,Oakland
288,2,SG,Brandon Younger,brandon-younger,JR,79,190,,,Presbyterian
289,2,PG,Ryan Zambie,ryan-zambie,FR,75,195,,,Lafayette
290,2,PG,Levelle Zeigler,levelle-zeigler,JR,73,175,,,Chicago State


In [13]:
# SRCBB slugs of the listed transfers, skipping falsy entries (rows with no resolved slug).
players = [slug for slug in transfers["SRCBB Name"].tolist() if slug]
players

['henry-abraham',
 'kani-acree',
 'ivan-alipiev',
 'barlow-alleruzzoiv',
 'adam-anhold',
 'jaxson-baker',
 'tariq-balogun',
 'daniel-begovich',
 'zion-bethea',
 'dahmir-bishop',
 'justin-bofenkamp',
 'rashad-bolden',
 'troy-boynton',
 'logan-bracamonte',
 'connor-braun',
 'ledarrius-brewer',
 'amir-britt',
 'hayden-brown',
 'jomaru-brown',
 'robert-brown',
 'darius-brownii',
 'paul-bruns',
 'caleb-burgess',
 'kenny-burns',
 'braelon-bush',
 'laquan-butler',
 'gedeon-buzangu',
 'jonah-carrasco',
 'marsei-caston',
 'chris-childs',
 'tanner-christensen',
 'ej-clark',
 'kareem-clark',
 'tyzhaun-claude',
 'jeriah-coleman',
 'kaleb-coleman',
 'maurice-commander',
 'tasos-cook',
 'kvonn-cramer',
 'keishawn-davidson',
 'brent-davis',
 'mike-depersia',
 'sean-duke',
 'devon-dunn',
 'sean-durugordon',
 'grehlon-easter',
 'greg-eboigbodin',
 'nicolas-elame',
 'david-elliottiv',
 'quirin-emanga',
 'alsean-evans',
 'isaac-farah',
 'elijah-farr',
 'omar-figueroa',
 'jaquavian-florence',
 'jj-flores'

In [27]:
# Names of the stats collected for each player, grouped by the table they come from.
m_per_game = [
    "g", "gs", "mp_per_g", "fg_per_g", "fta_per_g", "trb_per_g",
    "ast_per_g", "stl_per_g", "blk_per_g", "tov_per_g", "pts_per_g", "sos",
]
m_per_hundred = ["off_rtg", "def_rtg"]
m_advanced = [
    "per", "ts_pct", "efg_pct", "pprod", "trb_pct", "ast_pct",
    "stl_pct", "blk_pct", "tov_pct", "usg_pct", "ows", "dws",
]
# Full column list, in the order: per-game, per-hundred, advanced.
metrics = [*m_per_game, *m_per_hundred, *m_advanced]


def metrics_len():
    """Return the number of entries in the module-level `metrics` list."""
    return len(metrics)

# Maps each SRCBB stats-table id to the stat columns wanted from that table.
tables = {"players_per_game": m_per_game, "players_per_poss": m_per_hundred, "players_advanced": m_advanced}


def _latest_season_start(per_game_tbl):
    """Return the starting year of the last <tbody> row's season label, or None if unreadable."""
    body = per_game_tbl.find("tbody")
    season_rows = body.find_all("tr") if body is not None else []
    label = season_rows[-1].find("th") if season_rows else None
    if label is None:
        return None
    # Season labels look like "2021-22"; the digits before the "-" are the start year.
    found = re.match(r"\s*(\d+)\s*-", label.text)
    return int(found.group(1)) if found else None


def agg(name, uniqueness=1):
    """Collect the stats listed in `metrics` for one player from SRCBB.

    Pages are tried at ".../players/<name>-<uniqueness>.html", increasing
    `uniqueness` until a page is accepted or a page without the per-game
    table is reached. A page is accepted when the last season listed in its
    per-game table started fewer than 6 years before the current year; this
    is how players sharing the same slug are told apart.

    Parameters
    ----------
    name : str
        SRCBB slug without the numeric suffix, e.g. "henry-abraham".
    uniqueness : int, optional
        Numeric suffix to start from (default 1).

    Returns
    -------
    list
        One value per entry of `metrics`, in `metrics` order: the cell text
        for each stat, or NaN for a stat missing from the row. A list of
        NaN of length metrics_len() is returned when the page has no
        per-game table, its season label cannot be read, or one of the
        tables in `tables` is missing or empty.

    Notes
    -----
    Performs one HTTP request and prints `name` once per page tried.
    NOTE(review): the stats come from the last <tr> of each whole table,
    not only of its <tbody>; if the table has a footer row, that row is
    what gets collected - confirm this is intended.
    """
    stats_class = "stats_table sortable row_summable"
    max_season_age = 6
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    no_data = [np.nan] * metrics_len()
    this_year = datetime.date.today().year

    while True:
        srcbb = "https://www.sports-reference.com/cbb/players/" + name + "-" + str(uniqueness) + ".html"
        r_player = requests.get(srcbb, timeout=30)
        player = bs(r_player.content, features="html.parser")

        print(name)
        per_game = player.find("table", attrs={"class": stats_class, "id": "players_per_game"})
        if per_game is None:
            return no_data

        start_year = _latest_season_start(per_game)
        if start_year is None:
            return no_data

        if this_year < start_year + max_season_age:
            data = []
            # `wanted` deliberately does not reuse the name `metrics`, so the
            # module-level list is not shadowed inside this function.
            for table_id, wanted in tables.items():
                desired_tbl = player.find("table", attrs={"class": stats_class, "id": table_id})
                if desired_tbl is None:
                    return no_data
                rows = desired_tbl.find_all("tr")
                if not rows:
                    return no_data
                # Look values up by their data-stat name so each one lands in
                # the column it belongs to even if a cell is missing or the
                # table's column order differs from `metrics`.
                by_stat = {cell.get("data-stat"): cell.text for cell in rows[-1].find_all("td")}
                data += [by_stat.get(stat, np.nan) for stat in wanted]
            return data

        # Last listed season is too old: presumably another player with the
        # same slug, so move on to the next numeric suffix.
        uniqueness += 1
# Fetch the stat list for every player (agg performs the HTTP requests).
data = [agg(slug) for slug in players]
# Rows containing NaN are dropped, blank strings become 0, and every
# remaining value is converted to float.
prospects = (
    pd.DataFrame(data, columns=metrics, index=players)
    .dropna()
    .replace(r'^\s*$', 0, regex=True)
    .astype(float)
)

henry-abraham
kani-acree
ivan-alipiev
barlow-alleruzzoiv
adam-anhold
jaxson-baker
tariq-balogun
daniel-begovich
zion-bethea
dahmir-bishop
justin-bofenkamp
rashad-bolden
troy-boynton
logan-bracamonte
connor-braun
ledarrius-brewer
amir-britt
hayden-brown
jomaru-brown
robert-brown
robert-brown
darius-brownii
paul-bruns
paul-bruns
caleb-burgess
kenny-burns
braelon-bush
laquan-butler
gedeon-buzangu
jonah-carrasco
marsei-caston
chris-childs
chris-childs
tanner-christensen
ej-clark
kareem-clark
tyzhaun-claude
jeriah-coleman
kaleb-coleman
maurice-commander
tasos-cook
kvonn-cramer
keishawn-davidson
brent-davis
mike-depersia
sean-duke
devon-dunn
sean-durugordon
grehlon-easter
greg-eboigbodin
nicolas-elame
david-elliottiv
quirin-emanga
alsean-evans
isaac-farah
elijah-farr
omar-figueroa
jaquavian-florence
jj-flores
mason-forbes
marco-foster
xavier-foster
otis-frazieriii
spencer-freedman
andrew-funk
jaylan-gainey
jackson-gammons
deandre-gholston
jaylon-gibson
mason-gibson
zeb-graham
james-grahamiii

In [28]:
# Display the float-valued stats table, indexed by SRCBB slug.
prospects

Unnamed: 0,g,gs,mp_per_g,fg_per_g,fta_per_g,trb_per_g,ast_per_g,stl_per_g,blk_per_g,tov_per_g,...,efg_pct,pprod,trb_pct,ast_pct,stl_pct,blk_pct,tov_pct,usg_pct,ows,dws
henry-abraham,38.0,25.0,27.5,2.0,0.7,1.8,2.0,0.8,0.1,1.4,...,0.493,221.0,3.8,15.1,1.6,0.2,19.0,13.6,0.2,0.4
kani-acree,56.0,3.0,15.8,1.2,1.1,3.0,1.1,0.4,0.1,0.9,...,0.395,236.0,10.4,12.7,1.5,0.5,17.7,15.7,0.4,1.2
ivan-alipiev,70.0,28.0,20.3,2.3,1.0,2.8,1.0,0.5,0.2,1.1,...,0.499,425.0,9.1,10.5,1.5,1.2,15.6,18.9,1.5,1.4
barlow-alleruzzoiv,26.0,8.0,15.8,1.0,1.0,2.6,0.4,0.2,0.2,0.7,...,0.396,87.0,9.5,4.4,0.6,1.1,16.8,13.8,0.0,0.2
adam-anhold,28.0,0.0,9.3,1.4,1.5,2.5,0.4,0.1,0.5,0.6,...,0.456,113.0,14.7,8.0,0.8,5.2,13.9,23.3,0.4,0.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zion-young,37.0,6.0,17.6,2.3,1.4,2.6,0.4,0.3,0.1,0.8,...,0.463,234.0,8.1,4.9,1.0,0.5,10.3,22.6,0.7,0.0
brandon-younger,52.0,37.0,22.3,2.3,1.1,3.7,0.6,1.0,0.4,1.2,...,0.480,309.0,10.1,5.8,2.8,2.1,16.0,16.8,0.9,1.8
ryan-zambie,13.0,0.0,1.3,0.0,0.2,0.2,0.1,0.2,0.0,0.1,...,0.000,2.0,10.2,9.5,7.1,0.0,25.3,12.6,-0.1,0.0
levelle-zeigler,7.0,4.0,31.0,3.9,3.1,1.9,2.4,0.6,0.0,1.9,...,0.436,74.0,3.5,20.9,1.0,0.0,13.9,22.2,0.0,-0.3


In [29]:
# Write the stats table to transfers.csv (relative to the current working directory).
prospects.to_csv(r'transfers.csv')

In [26]:
# Ad-hoc check: fetch a single SRCBB player page with uniqueness suffix 2 and
# look at the first <table> element it contains.
srcbb = "https://www.sports-reference.com/cbb/players/brandon-hall-2.html"
r_player = requests.get(srcbb)
player = bs(r_player.content, features="html.parser")

# find() returns None when the page has no <table>; the recorded output of this cell is None.
latest = player.find("table")

# The bare expression below displays nothing because it is not the cell's last statement.
latest
print(latest)



None


In [None]:

# Today's year formatted as a string; the same expression is used in agg's season-age check.
datetime.date.today().strftime("%Y")


In [None]:
def agg(name, uniqueness=1, max_uniqueness=25):
    """Collect the stats listed in `metrics` for one player from SRCBB.

    This cell redefines `agg`; once it is run it replaces the definition
    from the earlier cell. Unlike that one, a page without a per-game table
    does not end the search: the next numeric suffix is tried instead. The
    search stops at a "Page Not Found" page or after `max_uniqueness`, so a
    slug with no usable page can no longer recurse without end.

    Parameters
    ----------
    name : str
        SRCBB slug without the numeric suffix, e.g. "henry-abraham".
    uniqueness : int, optional
        Numeric suffix to start from (default 1).
    max_uniqueness : int, optional
        Highest suffix that is tried before giving up (default 25).

    Returns
    -------
    list
        One value per entry of `metrics`, in `metrics` order: the cell text
        for each stat, or NaN for a stat missing from the row. A list of
        NaN of length metrics_len() is returned when no page is accepted or
        one of the tables in `tables` is missing or empty on the accepted
        page.

    Notes
    -----
    Performs one HTTP request and prints `name` once per page tried.
    NOTE(review): the stats come from the last <tr> of each whole table,
    not only of its <tbody>; if the table has a footer row, that row is
    what gets collected - confirm this is intended.
    """
    stats_class = "stats_table sortable row_summable"
    max_season_age = 6
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    no_data = [np.nan] * metrics_len()
    this_year = datetime.date.today().year

    for candidate in range(uniqueness, max_uniqueness + 1):
        srcbb = "https://www.sports-reference.com/cbb/players/" + name + "-" + str(candidate) + ".html"
        r_player = requests.get(srcbb, timeout=30)
        player = bs(r_player.content, features="html.parser")

        print(name)
        # Ran past the last existing page for this slug: nothing left to try.
        heading = player.find("h1")
        page_missing = heading is not None and heading.text == 'Page Not Found (404 error)'
        if r_player.status_code == 404 or page_missing:
            return no_data

        per_game = player.find("table", attrs={"class": stats_class, "id": "players_per_game"})
        if per_game is None:
            # The page exists but carries no per-game table; try the next suffix.
            continue

        # Season labels look like "2021-22"; the digits before the "-" are the start year.
        body = per_game.find("tbody")
        season_rows = body.find_all("tr") if body is not None else []
        label = season_rows[-1].find("th") if season_rows else None
        found = re.match(r"\s*(\d+)\s*-", label.text) if label is not None else None
        if found is None or this_year >= int(found.group(1)) + max_season_age:
            # Unreadable or too-old last season: presumably another player
            # with the same slug, so try the next suffix.
            continue

        data = []
        # `wanted` deliberately does not reuse the name `metrics`, so the
        # module-level list is not shadowed inside this function.
        for table_id, wanted in tables.items():
            desired_tbl = player.find("table", attrs={"class": stats_class, "id": table_id})
            if desired_tbl is None:
                return no_data
            rows = desired_tbl.find_all("tr")
            if not rows:
                return no_data
            # Look values up by their data-stat name so each one lands in the
            # column it belongs to even if a cell is missing.
            by_stat = {cell.get("data-stat"): cell.text for cell in rows[-1].find_all("td")}
            data += [by_stat.get(stat, np.nan) for stat in wanted]
        return data

    return no_data

In [None]:
# TODO: check for a "page not found" response
# TODO: check for an empty player page (no stats)