In [1]:
import requests
from bs4 import BeautifulSoup as bs
import re
import pandas as pd
import numpy as np

In [2]:
# Verbal Commits is used as the NCAA 2022 transfer-portal database.
url = "https://www.verbalcommits.com/transfers/2022"
# Without a timeout, requests can block the kernel indefinitely on a stalled connection.
r = requests.get(url, timeout=30)
# Stop here on a 4xx/5xx response instead of parsing an error page as if it were the table.
r.raise_for_status()
webpage = bs(r.content, features="html.parser")

In [3]:
# attrs must be a dict mapping attribute name -> value. The previous set literal
# ({"class", "..."}) only worked because BeautifulSoup treats a non-dict attrs
# value as a list of candidate CSS classes, which would also match any table
# whose class happened to be the literal string "class".
table = webpage.find("table", attrs={"class": "table full table-hover tablesorter"})
# Column names come from the text of every <th> cell in the table.
headers = [header.getText() for header in table.find_all("th")]

In [4]:
# The first <tr> holds the column headers, so drop it and keep the remaining rows.
raw_data = table.find_all("tr")
raw_data = raw_data[1:]

In [5]:
# One inner list per table row, holding the text of each <td> cell in that row.
data = [
    [cell.get_text() for cell in row.find_all("td")]
    for row in raw_data
]

In [6]:
# Assemble the scraped rows into a frame labelled with the scraped header text.
df = pd.DataFrame(data=data, columns=headers)
df

Unnamed: 0,Stars,Position,Name,Class,Ht,Wt,Immediately Eligible,January Eligible,Previous School,New School,Source
0,2,SG,Abee FletcherFletcher Abee,SO,756-3,180,,,The Citadel,UNC Asheville,"Justin Byerly, HoopSeen"
1,2,PG,Abraham HenryHenry Abraham,FR,726-0,175,,,Eastern Illinois,,
2,2,PF,Acliese III LintonLinton Acliese III,RS SR,786-6,235,Yes,,San Francisco State,Eastern Washington,
3,2,SG,Acree KaniKani Acree,RS SO,786-6,185,,,Ball State,,
4,2,PF,Acunzo MattiaMattia Acunzo,RS FR,806-8,225,,,Robert Morris,Youngstown State,"Emiliano Carchia, Sportando"
...,...,...,...,...,...,...,...,...,...,...,...
140,2,PG,Williams ShaunShaun Williams,RS SO,756-3,175,Yes,,Cal State Bakersfield,,
141,2,SF,Witt SaiSai Witt,JR,806-8,230,,,Texas–Rio Grande Valley,,
142,4,PG,Wood ElijahElijah Wood,FR,776-5,175,,,Eastern Illinois,,
143,2,PG,Zambie RyanRyan Zambie,FR,756-3,195,,,Lafayette,,


In [7]:
#changes form of LAST FIRSTFIRST LAST to first-last that corresponds to SRCBB URL
def to_srcbb(name):
    """Turn a scraped, doubled player name into a lowercase, hyphenated slug.

    The scraped text has the form "LAST FIRSTFIRST LAST"; only the second half
    ("FIRST LAST") is used. Periods, commas and apostrophes are removed, a
    trailing generational suffix (jr, sr, ii, iii, iv) is glued onto the
    preceding word, and the remaining spaces become hyphens.

    Parameters
    ----------
    name : str
        Doubled name string, e.g. "Abraham HenryHenry Abraham".

    Returns
    -------
    str
        Slug such as "henry-abraham" or "linton-aclieseiii".
    """
    # The display name is the second half of the doubled string.
    display = name[len(name) // 2:].lower()
    slug = re.sub(r"[.,']", '', display)

    # Raw string: "\s" and "\Z" are regex escapes, not valid Python string
    # escapes, and a plain literal triggers an invalid-escape warning.
    suffix = re.search(r"\s(jr|sr|ii|iii|iv)\Z", slug)
    if suffix:
        # Drop only the space in front of the suffix so it attaches to the last name.
        slug = slug[:suffix.start()] + slug[suffix.start() + 1:]

    return slug.replace(' ', "-")
# Build the slugs first: to_srcbb expects the raw doubled name, which the next line overwrites.
srcbb_name = df['Name'].map(to_srcbb)
# Keep only the second half ("First Last") as the display name.
# NOTE(review): this overwrites df['Name'] in place, so re-running this cell
# halves the already-halved names; rebuild df before re-running.
df['Name'] = df['Name'].map(lambda full_name: full_name[len(full_name) // 2:])
#df['Ht'] = df['Ht'].apply(lambda ht: ht[ht.index("-") - 1:])


In [8]:
# Keep the rows whose "New School" is empty, drop the two destination columns,
# and shorten Ht to the text that ends one character before the dash
# (e.g. "726-0" -> "72").
transfers = (
    df[df['New School'] == ""]
    .copy()
    .drop(columns=['New School', 'Source'])
    .assign(Ht=lambda frame: frame['Ht'].map(lambda ht: ht[:ht.index("-") - 1]))
)
# Place the URL slug right after the Name column; values are aligned on the row index.
transfers.insert(3, "SRCBB Name", srcbb_name)

In [9]:
# Disabled experiment, left for reference: a copy of `transfers` with the
# descriptive columns dropped. Not executed.
#data = transfers.copy().drop(['Stars', 'Name', 'Class', 'Wt', 'Immediately Eligible', 'January Eligible', 'Previous School'], axis=1)
#data
# Show the filtered frame built in the previous cell.
transfers

Unnamed: 0,Stars,Position,Name,SRCBB Name,Class,Ht,Wt,Immediately Eligible,January Eligible,Previous School
1,2,PG,Henry Abraham,henry-abraham,FR,72,175,,,Eastern Illinois
3,2,SG,Kani Acree,kani-acree,RS SO,78,185,,,Ball State
6,2,SG,Daniel Begovich,daniel-begovich,SR,77,205,,,Stanford
8,2,SG,Zion Bethea,zion-bethea,FR,75,205,,,Hofstra
9,2,PG,Troy Boynton,troy-boynton,FR,76,175,,,Evansville
...,...,...,...,...,...,...,...,...,...,...
140,2,PG,Shaun Williams,shaun-williams,RS SO,75,175,Yes,,Cal State Bakersfield
141,2,SF,Sai Witt,sai-witt,JR,80,230,,,Texas–Rio Grande Valley
142,4,PG,Elijah Wood,elijah-wood,FR,77,175,,,Eastern Illinois
143,2,PG,Ryan Zambie,ryan-zambie,FR,75,195,,,Lafayette


In [10]:
# Fetch a single player's page; the slug is hard-coded here as a trial run.
srcbb = "https://www.sports-reference.com/cbb/players/" + 'henry-abraham' +"-1.html"
# Without a timeout, requests can block the kernel indefinitely on a stalled connection.
r_player = requests.get(srcbb, timeout=30)
# Surface HTTP errors (missing player page, rate limiting) here rather than as a
# confusing "NoneType has no attribute" failure when the tables are looked up later.
r_player.raise_for_status()
player = bs(r_player.content, features="html.parser")

# Empty frame with the intended stat columns.
# NOTE(review): this rebinds `data`, which earlier held the scraped list of rows.
data = pd.DataFrame(columns=['Ht', 'Mins/G', '2PA', '3PA', '3P%', 'PTS', 'SOS', 'ORtg', 'DRtg', 'TS%', 'eFG%', 'PProd', 'TRB%', 'AST%'])
data

Unnamed: 0,Ht,Mins/G,2PA,3PA,3P%,PTS,SOS,ORtg,DRtg,TS%,eFG%,PProd,TRB%,AST%


In [76]:
#collect data from specific table, reference based on SRC_BB name from transfers table

# Season whose row is read from each stats table.
SEASON = 2022


def _to_float(text):
    """Parse the text of a stat cell; a blank cell becomes NaN instead of raising."""
    text = text.strip()
    return float(text) if text else float("nan")


def _season_stats(page, table_id, metrics, season=SEASON):
    """Return one float per entry of `metrics` from a single season row.

    Parameters
    ----------
    page : parsed player page (BeautifulSoup object)
    table_id : id attribute of the stats table, e.g. "players_per_game"
    metrics : list of `data-stat` names to extract
    season : season used in the row id "<table_id>.<season>"

    Returns
    -------
    list of float
        Values in the same order as `metrics`; a metric that is absent from
        the row, or whose cell is blank, is returned as NaN so the result
        always has len(metrics) entries.

    Raises
    ------
    ValueError
        If the table or the season row is not present in `page`.
    """
    # The id alone identifies the table; matching the exact class string as
    # well would break as soon as the site adds or reorders a CSS class.
    table = page.find("table", id=table_id)
    if table is None:
        raise ValueError("table {!r} not found on the player page".format(table_id))

    row = table.find("tr", id="{}.{}".format(table_id, season))
    if row is None:
        raise ValueError("no {} row in table {!r}".format(season, table_id))

    text_by_stat = {cell.get("data-stat"): cell.text for cell in row.find_all("td")}
    return [_to_float(text_by_stat.get(metric, "")) for metric in metrics]


#Per Game:
m_per_game = ["g", "gs", "mp_per_g", "fg_pct", "pts_per_g", "sos"]
d_per_game = _season_stats(player, "players_per_game", m_per_game)

#Per 100 POSS:
m_per_hundred = ["off_rtg", "def_rtg"]
d_per_hundred = _season_stats(player, "players_per_poss", m_per_hundred)

#Advanced:
m_advanced = ["per", "ts_pct", "efg_pct", "usg_pct", "ows", "dws"]
d_advanced = _season_stats(player, "players_advanced", m_advanced)

#join lists together for each player and then append to DATA
d_player = d_per_game + d_per_hundred + d_advanced
d_player

[16.0,
 16.0,
 33.8,
 0.372,
 6.3,
 -2.7,
 86.9,
 111.5,
 6.6,
 0.509,
 0.505,
 13.0,
 -0.1,
 0.2]