In [3]:
import datetime
import io
import re

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from unidecode import unidecode

In [4]:
#Verbal Commits as NCAA 2022 Transfer Portal database
url = "https://www.verbalcommits.com/transfers/2022"
r = requests.get(url)
# Fail fast on an HTTP error (4xx/5xx) instead of parsing an error page and
# hitting a confusing failure later when the transfers table is missing.
r.raise_for_status()
webpage = bs(r.content, features="html.parser")

In [5]:
# Locate the transfers table by its class attribute. `attrs` must be a dict that
# maps attribute name -> value; the previous version passed a set literal
# ({"class", "..."}), which is not an attribute mapping.
table = webpage.find("table", attrs={"class": "table full table-hover tablesorter"})
if table is None:
    # Without this guard the next line fails with an opaque AttributeError on None.
    raise ValueError("transfers table not found on the Verbal Commits page")

# Column names come from the table's <th> cells.
headers = [header.getText() for header in table.find_all("th")]

In [6]:
# The first <tr> holds the column headers (already read into `headers`), so skip it.
all_rows = table.find_all("tr")
raw_data = all_rows[1:]

In [7]:
def _row_to_strings(row):
    """Return the text of every <td> cell in one table row, in document order."""
    return [cell.getText() for cell in row.find_all("td")]


# One list of cell strings per player row.
data = [_row_to_strings(player_row) for player_row in raw_data]

In [8]:
# Assemble the scraped rows into a DataFrame, labelling columns with the <th> header texts.
# Every value is still the raw cell string at this point.
df = pd.DataFrame(data, columns=headers)
df

Unnamed: 0,Stars,Position,Name,Class,Ht,Wt,Immediately Eligible,January Eligible,Previous School,New School,Source
0,2,SG,Abee FletcherFletcher Abee,SO,756-3,180,,,The Citadel,UNC Asheville,"Justin Byerly, HoopSeen"
1,2,PF,Abercrombie RileyRiley Abercrombie,RS SO,826-10,210,Yes,,Rice,Northern Colorado,
2,2,PF,Abii MicaiahMicaiah Abii,FR,796-7,225,,,Liberty,Dallas Baptist,
3,2,PG,Abraham HenryHenry Abraham,FR,726-0,175,,,Eastern Illinois,Coastal Carolina,
4,2,PF,Acliese III LintonLinton Acliese III,RS SR,786-6,235,Yes,,San Francisco State,Eastern Washington,
...,...,...,...,...,...,...,...,...,...,...,...
1753,2,SG,Zimonjić BogdanBogdan Zimonjić,FR,776-5,190,,,Florida Atlantic,,
1754,3,PF,van der Heijden EricEric van der Heijden,FR,806-8,205,,,Ole Miss,UNCW,
1755,3,PG,Álvarez NeftalíNeftalí Álvarez,RS SO,746-2,165,Yes,,Mercer,Southern Miss,
1756,2,SG,Čubrilo VitoVito Čubrilo,SO,766-4,195,Yes,,Northeastern,,


In [9]:
#changes form of FIRST LAST to first-last that corresponds to SRCBB URL
#fuses a trailing suffix onto the surname, replaces whitespace with "-" and removes accented letters
def to_srcbb(name):
    """Convert a display name into the slug used in Sports Reference CBB player URLs.

    Steps, in order:
      1. lowercase the name and drop periods, commas and apostrophes;
      2. if the name ends in a generational suffix (jr, sr, ii, iii, iv), remove
         the single whitespace character before it so the suffix is fused onto
         the preceding word (the suffix itself is kept);
      3. replace the remaining spaces with hyphens;
      4. transliterate non-ASCII characters with ``unidecode``.

    Example: ``"Kim Aiken, Jr."`` -> ``"kim-aikenjr"``.

    Args:
        name: Player name in "First Last" form.

    Returns:
        The slug as a string. The function does not check that a page exists
        for the slug.
    """
    name = re.sub(r"[.,']", '', name.lower())

    # Raw string: "\s" and "\Z" are regex escapes, not Python string escapes.
    suffix = re.search(r"\s(jr|sr|ii|iii|iv)\Z", name)
    if suffix:
        # Drop only the whitespace in front of the suffix: "aiken jr" -> "aikenjr".
        name = name[:suffix.start()] + name[suffix.start() + 1:]

    name = unidecode(name.replace(' ', "-"))

    return name

# Each scraped Name cell contains the name twice, back to back
# ("Abee Fletcher" + "Fletcher Abee"); keep only the second half ("First Last").
# Run this once per freshly built `df`: running it again would halve the names again.
df['Name'] = df['Name'].map(lambda doubled: doubled[len(doubled) // 2:])
#df['Ht'] = df['Ht'].apply(lambda ht: ht[ht.index("-") - 1:])

In [10]:
# Keep only players with no new school listed, and drop the two columns
# that are no longer needed for them.
uncommitted = df['New School'] == ""
transfers = df.loc[uncommitted].drop(['New School', 'Source'], axis=1).copy()

# Ht is scraped as total inches fused with the feet-inches form (e.g. "756-3").
# Keep everything up to, but excluding, the single feet digit before the "-".
transfers['Ht'] = transfers['Ht'].map(lambda ht: ht[:ht.index("-") - 1])
transfers

Unnamed: 0,Stars,Position,Name,Class,Ht,Wt,Immediately Eligible,January Eligible,Previous School
5,2,SG,Kani Acree,RS SO,78,185,,,Ball State
10,2,SG,Max Adelman,FR,77,205,,,Vanderbilt
12,2,PF,Emmanuel Adeoye,SO,81,230,,,Texas A&M–Commerce
20,2,SF,"Kim Aiken, Jr.",RS JR,79,215,Yes,,Arizona
24,2,PF,Daniel Akin,RS SR,81,225,Yes,,California Baptist
...,...,...,...,...,...,...,...,...,...
1747,2,PG,Ryan Zambie,FR,75,195,,,Lafayette
1751,2,PG,Levelle Zeigler,JR,73,175,,,Chicago State
1753,2,SG,Bogdan Zimonjić,FR,77,190,,,Florida Atlantic
1756,2,SG,Vito Čubrilo,SO,76,195,Yes,,Northeastern


In [11]:
# Derive the Sports Reference URL slug for every remaining transfer.
transfers["srcbb_name"] = transfers["Name"].map(to_srcbb)
transfers

Unnamed: 0,Stars,Position,Name,Class,Ht,Wt,Immediately Eligible,January Eligible,Previous School,srcbb_name
5,2,SG,Kani Acree,RS SO,78,185,,,Ball State,kani-acree
10,2,SG,Max Adelman,FR,77,205,,,Vanderbilt,max-adelman
12,2,PF,Emmanuel Adeoye,SO,81,230,,,Texas A&M–Commerce,emmanuel-adeoye
20,2,SF,"Kim Aiken, Jr.",RS JR,79,215,Yes,,Arizona,kim-aikenjr
24,2,PF,Daniel Akin,RS SR,81,225,Yes,,California Baptist,daniel-akin
...,...,...,...,...,...,...,...,...,...,...
1747,2,PG,Ryan Zambie,FR,75,195,,,Lafayette,ryan-zambie
1751,2,PG,Levelle Zeigler,JR,73,175,,,Chicago State,levelle-zeigler
1753,2,SG,Bogdan Zimonjić,FR,77,190,,,Florida Atlantic,bogdan-zimonjic
1756,2,SG,Vito Čubrilo,SO,76,195,Yes,,Northeastern,vito-cubrilo


In [12]:
# Disabled experiment: the line below would reduce the frame to Position, Ht and srcbb_name.
#data = transfers.copy().drop(['Stars', 'Name', 'Class', 'Wt', 'Immediately Eligible', 'January Eligible', 'Previous School'], axis=1)
transfers

Unnamed: 0,Stars,Position,Name,Class,Ht,Wt,Immediately Eligible,January Eligible,Previous School,srcbb_name
5,2,SG,Kani Acree,RS SO,78,185,,,Ball State,kani-acree
10,2,SG,Max Adelman,FR,77,205,,,Vanderbilt,max-adelman
12,2,PF,Emmanuel Adeoye,SO,81,230,,,Texas A&M–Commerce,emmanuel-adeoye
20,2,SF,"Kim Aiken, Jr.",RS JR,79,215,Yes,,Arizona,kim-aikenjr
24,2,PF,Daniel Akin,RS SR,81,225,Yes,,California Baptist,daniel-akin
...,...,...,...,...,...,...,...,...,...,...
1747,2,PG,Ryan Zambie,FR,75,195,,,Lafayette,ryan-zambie
1751,2,PG,Levelle Zeigler,JR,73,175,,,Chicago State,levelle-zeigler
1753,2,SG,Bogdan Zimonjić,FR,77,190,,,Florida Atlantic,bogdan-zimonjic
1756,2,SG,Vito Čubrilo,SO,76,195,Yes,,Northeastern,vito-cubrilo


In [13]:
# List of slugs to scrape, skipping any empty/None entries.
players = [slug for slug in transfers["srcbb_name"].tolist() if slug]
players

['kani-acree',
 'max-adelman',
 'emmanuel-adeoye',
 'kim-aikenjr',
 'daniel-akin',
 'mayowa-akinsanya',
 'dominique-alexander',
 'jonathan-alexandre',
 'logan-alters',
 'andrew-anderson',
 'jackson-anderson',
 'adam-anhold',
 'david-appelgren',
 'yigit-arcan',
 'andrei-arion',
 'bryon-armstrong',
 'austin-ash',
 'ata-atsuren',
 'chuma-azinge',
 'victor-baffuto',
 'myles-baker',
 'junior-ballard',
 'rob-banks',
 'elijah-barnes',
 'timothy-barnes',
 'jaylen-bartley',
 'dmarco-baucum',
 'darius-beane',
 'james-beck',
 'daniel-begovich',
 'myles-belyeu',
 'james-berryiii',
 'quavon-blackwood',
 'quinn-blair',
 'justin-bofenkamp',
 'za-ontay-boothman',
 'trey-boston',
 'andre-bottoms',
 'elijah-bowens',
 'johnny-braggs',
 'connor-braun',
 'elijah-bridgers',
 'jerroda-briscoe',
 'amir-britt',
 'isaiah-broady',
 'alex-brodsky',
 'austin-brown',
 'johnny-brown',
 'justin-brown',
 'wynton-brown',
 'jordan-burge',
 'antwuan-butler',
 'rashamel-butler',
 'jaylen-butz',
 'jaden-byers',
 'hakim-byr

In [40]:
#organizes desired metrics into SRCBB TABLE: METRIC
#eliminates players with no data by returning None
#uniqueness addresses duplicate player names by checking correct timeline for player career with current time
tables = ["players_per_game", "players_per_poss", "players_advanced"]


def agg(name, uniqueness=1):
    """Scrape a player's latest-season stats from Sports Reference CBB.

    Player pages live at ``.../cbb/players/<name>-<uniqueness>.html``. Starting at
    ``uniqueness``, pages are tried in increasing order until one is found whose
    most recent per-game season started fewer than 6 years before the current
    year. Older pages are presumed to belong to a different player with the same
    name.

    Args:
        name: Player slug, e.g. ``"alec-woodard"`` (see ``to_srcbb``).
        uniqueness: Numeric page suffix to start from (default 1).

    Returns:
        A tuple ``(per_game, per_poss, advanced)`` of DataFrames, one per entry in
        ``tables``, each filtered to the rows of the latest season and carrying an
        extra ``Name`` column holding ``name``. Returns ``None`` when no suitable
        page is found or when any of the tables is missing from the page.

    Note:
        The HTTP status is not inspected, so any response without the per-game
        table (404, error page, ...) is treated as "no data".
    """
    stats_class = "stats_table sortable row_summable"
    max_years_since_latest_season = 6
    current_year = datetime.date.today().year

    while True:
        srcbb = "https://www.sports-reference.com/cbb/players/" + name + "-" + str(uniqueness) + ".html"
        r_player = requests.get(srcbb)
        player = bs(r_player.content, features="html.parser")

        per_game_tbl = player.find("table", attrs={"class": stats_class, "id": "players_per_game"})
        if per_game_tbl is None:
            # No page for this suffix, or a page without stats: nothing to return.
            return None

        body = per_game_tbl.find("tbody")
        season_rows = body.find_all("tr") if body is not None else []
        if not season_rows or season_rows[-1].find("th") is None:
            # Table is present but has no season rows to read.
            return None

        # The last body row's header cell is the latest season label, e.g. "2021-22".
        latest_year = season_rows[-1].find("th").text
        season_start = int(latest_year[:latest_year.index("-")])
        if current_year < season_start + max_years_since_latest_season:
            break

        # Career ended too long ago; try the next player with the same slug.
        uniqueness += 1

    frames = []
    for table in tables:
        desired_tbl = player.find("table", attrs={"class": stats_class, "id": table})
        # Check for a missing table BEFORE parsing; read_html cannot parse None.
        if desired_tbl is None:
            return None

        # Wrap the markup in StringIO: passing literal HTML strings to read_html is deprecated.
        season_stats = pd.read_html(io.StringIO(str(desired_tbl)))[0]
        # assign() returns a new frame, avoiding SettingWithCopyWarning on the filtered slice.
        season_stats = season_stats[season_stats['Season'] == latest_year].assign(Name=name)
        frames.append(season_stats)

    return tuple(frames)

In [41]:
# Try the scraper on a single player before looping over everyone.
alec_woodard_tables = agg('alec-woodard')
print(alec_woodard_tables)

(    Season       School Conf   G  GS    MP   FG  FGA    FG%   2P  ...  TRB  \
2  2021-22  Austin Peay  OVC  24  13  19.7  1.6  4.5  0.364  0.8  ...  3.0   

   AST  STL  BLK  TOV   PF  PTS  Unnamed: 27   SOS          Name  
2  1.8  0.8  0.1  1.0  2.3  4.7          NaN -3.39  alec-woodard  

[1 rows x 30 columns],     Season       School Conf   G  GS   MP   FG   FGA    FG%   2P  ...  AST  \
2  2021-22  Austin Peay  OVC  24  13  472  5.0  13.7  0.364  2.4  ...  5.4   

   STL  BLK  TOV   PF   PTS  Unnamed: 25   ORtg   DRtg          Name  
2  2.3  0.4  3.1  7.1  14.5          NaN  100.9  103.4  alec-woodard  

[1 rows x 29 columns],     Season       School Conf   G  GS   MP   PER    TS%   eFG%   3PAr  ...  \
2  2021-22  Austin Peay  OVC  24  13  472  11.1  0.483  0.458  0.636  ...   

   Unnamed: 20  OWS  DWS   WS  WS/40  Unnamed: 25  OBPM  DBPM  BPM  \
2          NaN  0.4  0.6  1.0  0.081          NaN  -3.1   0.3 -2.8   

           Name  
2  alec-woodard  

[1 rows x 30 columns])


In [42]:
# Scrape every player and gather the three per-table frames in plain lists.
# The frames are concatenated once at the end: DataFrame.append was removed in
# pandas 2.0, and growing a frame inside a loop re-copies it on every iteration.
per_game_frames = []
per_poss_frames = []
advanced_frames = []

for player in players:
    # Use a dedicated name so the `data` list built while scraping is not overwritten.
    player_tables = agg(player)
    if player_tables is not None:
        per_game_frames.append(player_tables[0])
        per_poss_frames.append(player_tables[1])
        advanced_frames.append(player_tables[2])

# pd.concat raises on an empty list, so fall back to an empty frame as before.
per_game = pd.concat(per_game_frames) if per_game_frames else pd.DataFrame()
per_poss = pd.concat(per_poss_frames) if per_poss_frames else pd.DataFrame()
advanced = pd.concat(advanced_frames) if advanced_frames else pd.DataFrame()

In [43]:
# Per-game rows collected from agg() for every player that returned data.
per_game

Unnamed: 0,Season,School,Conf,G,GS,MP,FG,FGA,FG%,2P,...,TRB,AST,STL,BLK,TOV,PF,PTS,Unnamed: 27,SOS,Name
2,2021-22,Ball State,MAC,8,0,12.5,0.6,2.9,0.217,0.0,...,2.4,1.4,0.4,0.0,0.8,1.1,2.6,,-4.56,kani-acree
1,2021-22,Vanderbilt,SEC,4,0,2.3,0.0,0.5,0.0,0.0,...,0.5,0.5,0.0,0.0,0.0,0.0,0.0,,8.05,max-adelman
3,2021-22,Arizona,Pac-12,7,0,13.6,1.9,3.9,0.481,0.6,...,3.4,2.0,1.0,0.3,0.4,1.1,5.0,,6.84,kim-aikenjr
4,2021-22,California Baptist,WAC,34,33,26.7,3.3,6.1,0.541,3.3,...,8.1,1.5,0.7,1.0,2.4,3.2,10.8,,-3.34,daniel-akin
0,2021-22,Chicago State,WAC,21,10,26.9,2.3,6.6,0.353,1.2,...,3.4,2.6,1.2,0.0,2.0,2.3,7.8,,-1.96,dominique-alexander
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,2021-22,Lafayette,Patriot,9,0,1.7,0.0,0.2,0.0,0.0,...,0.2,0.1,0.2,0.0,0.1,0.2,0.0,,-6.67,ryan-zambie
2,2020-21,Chicago State,WAC,7,4,31.0,3.9,10.0,.386,2.9,...,1.9,2.4,0.6,0.0,1.9,1.1,10.9,,1.41,levelle-zeigler
0,2021-22,Florida Atlantic,CUSA,18,0,5.6,0.7,1.7,0.419,0.3,...,0.8,0.2,0.3,0.0,0.1,0.3,2.1,,-1.82,bogdan-zimonjic
2,2021-22,Northeastern,CAA,16,7,13.8,1.1,2.6,0.439,0.6,...,0.9,0.4,0.4,0.1,0.9,1.1,3.1,,-1.19,vito-cubrilo


In [44]:
# Rows from the "players_per_poss" table collected from agg() for every player that returned data.
per_poss

Unnamed: 0,Season,School,Conf,G,GS,MP,FG,FGA,FG%,2P,...,AST,STL,BLK,TOV,PF,PTS,Unnamed: 25,ORtg,DRtg,Name
2,2021-22,Ball State,MAC,8,0,100,2.8,12.9,0.217,0.0,...,6.1,1.7,0.0,3.4,5.0,11.7,,95.5,109.5,kani-acree
1,2021-22,Vanderbilt,SEC,4,0,9,0.0,12.9,0.000,0.0,...,12.9,0.0,0.0,0.0,0.0,0.0,,58.3,100.9,max-adelman
3,2021-22,Arizona,Pac-12,7,0,95,7.5,15.6,0.481,2.3,...,8.1,4.1,1.2,1.7,4.6,20.3,,139.0,88.7,kim-aikenjr
4,2021-22,California Baptist,WAC,34,33,908,7.2,13.3,0.541,7.2,...,3.2,1.6,2.2,5.1,6.9,23.4,,109.0,91.9,daniel-akin
0,2021-22,Chicago State,WAC,21,10,564,5.2,14.8,0.353,2.8,...,5.7,2.8,0.1,4.5,5.1,17.3,,100.7,111.8,dominique-alexander
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,2021-22,Lafayette,Patriot,9,0,15,0.0,8.1,0.000,0.0,...,4.0,8.1,0.0,4.0,8.1,0.0,,44.2,101.2,ryan-zambie
0,2020-21,Chicago State,WAC,7,4,217,7.0,18.1,0.386,5.2,...,4.4,1.0,0.0,3.4,2.1,19.7,,89.0,129.9,levelle-zeigler
0,2021-22,Florida Atlantic,CUSA,18,0,101,7.5,17.9,0.419,2.9,...,2.3,3.5,0.0,1.2,3.5,21.9,,124.5,99.8,bogdan-zimonjic
2,2021-22,Northeastern,CAA,16,7,221,5.0,11.3,0.439,2.5,...,1.9,1.9,0.3,4.1,5.0,13.6,,89.7,108.9,vito-cubrilo


In [45]:
# Rows from the "players_advanced" table collected from agg() for every player that returned data.
advanced

Unnamed: 0,Season,School,Conf,G,GS,MP,PER,TS%,eFG%,3PAr,...,Unnamed: 20,OWS,DWS,WS,WS/40,Unnamed: 25,OBPM,DBPM,BPM,Name
2,2021-22,Ball State,MAC,8,0,100,8.9,0.399,0.326,0.391,...,,0.0,0.1,0.1,0.042,,-2.0,-0.6,-2.6,kani-acree
1,2021-22,Vanderbilt,SEC,4,0,9,2.5,0.000,0.000,0.500,...,,0.0,0.0,0.0,-0.018,,-6.8,1.5,-5.2,max-adelman
3,2021-22,Arizona,Pac-12,7,0,95,25.1,0.626,0.648,0.593,...,,0.4,0.2,0.6,0.259,,6.5,6.4,12.9,kim-aikenjr
4,2021-22,California Baptist,WAC,34,33,908,19.4,0.602,0.541,0.005,...,,1.8,2.1,3.9,0.170,,-0.8,1.1,0.3,daniel-akin
0,2021-22,Chicago State,WAC,21,10,564,12.6,0.502,0.435,0.518,...,,0.5,0.2,0.8,0.054,,-1.6,-1.0,-2.6,dominique-alexander
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,2021-22,Lafayette,Patriot,9,0,15,1.5,0.000,0.000,0.000,...,,-0.1,0.0,0.0,-0.092,,-11.1,-1.5,-12.7,ryan-zambie
0,2020-21,Chicago State,WAC,7,4,217,9.9,0.472,0.436,0.371,...,,0.0,-0.3,-0.3,-0.060,,-2.7,-6.3,-9.0,levelle-zeigler
0,2021-22,Florida Atlantic,CUSA,18,0,101,18.8,0.561,0.548,0.548,...,,0.3,0.2,0.4,0.173,,-1.1,1.0,-0.1,bogdan-zimonjic
2,2021-22,Northeastern,CAA,16,7,221,6.7,0.565,0.549,0.659,...,,0.0,0.1,0.1,0.026,,-4.2,0.4,-3.8,vito-cubrilo


In [51]:
# Write each result frame to a CSV file in the working directory (index included).
csv_exports = (
    (per_game, 'per_game.csv'),
    (per_poss, 'per_poss.csv'),
    (advanced, 'advanced.csv'),
    (transfers, 'transfers.csv'),
)
for frame, filename in csv_exports:
    frame.to_csv(filename)

In [None]:
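# TODO: explicitly detect "Page Not Found" responses when requesting a player page
# TODO: handle players whose page exists but contains no stats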