In [1]:
import requests
from bs4 import BeautifulSoup as bs
import re
import pandas as pd
import numpy as np
from unidecode import unidecode

In [2]:
#Verbal Commits as NCAA 2022 Transfer Portal database
url = "https://www.verbalcommits.com/transfers/2022"
r = requests.get(url)
webpage = bs(r.content, features="html.parser")

In [3]:
table = webpage.find("table", attrs={"class", "table full table-hover tablesorter"})
headers = [header.getText() for header in table.find_all("th")]

In [4]:
#first indice was headers
raw_data = table.find_all("tr")[1:]

In [5]:
#collects string stat for each player into a list
data = [[stat.getText() for stat in player.find_all("td")] for player in raw_data]
data

[['2',
  'SG',
  'Abee FletcherFletcher Abee',
  'SO',
  '756-3',
  '180',
  '',
  '',
  'The Citadel',
  'UNC Asheville',
  'Justin Byerly, HoopSeen'],
 ['2',
  'PG',
  'Abraham HenryHenry Abraham',
  'FR',
  '726-0',
  '175',
  '',
  '',
  'Eastern Illinois',
  '',
  ''],
 ['2',
  'PF',
  'Acliese III LintonLinton Acliese III',
  'RS SR',
  '786-6',
  '235',
  'Yes',
  '',
  'San Francisco State',
  'Eastern Washington',
  ''],
 ['2',
  'SG',
  'Acree KaniKani Acree',
  'RS SO',
  '786-6',
  '185',
  '',
  '',
  'Ball State',
  '',
  ''],
 ['2',
  'PF',
  'Acunzo MattiaMattia Acunzo',
  'RS FR',
  '806-8',
  '225',
  '',
  '',
  'Robert Morris',
  'Youngstown State',
  'Emiliano Carchia, Sportando'],
 ['2',
  'PG',
  "Ali Fah'mirFah'mir Ali",
  'FR',
  '705-10',
  '180',
  '',
  '',
  'College of Charleston',
  'Delaware State',
  'Tobias Bass, NXTPRO'],
 ['2',
  'SG',
  'Begovich DanielDaniel Begovich',
  'SR',
  '776-5',
  '205',
  '',
  '',
  'Stanford',
  '',
  ''],
 ['2',
  'C',

In [6]:
df = pd.DataFrame(data, columns=headers)
df

Unnamed: 0,Stars,Position,Name,Class,Ht,Wt,Immediately Eligible,January Eligible,Previous School,New School,Source
0,2,SG,Abee FletcherFletcher Abee,SO,756-3,180,,,The Citadel,UNC Asheville,"Justin Byerly, HoopSeen"
1,2,PG,Abraham HenryHenry Abraham,FR,726-0,175,,,Eastern Illinois,,
2,2,PF,Acliese III LintonLinton Acliese III,RS SR,786-6,235,Yes,,San Francisco State,Eastern Washington,
3,2,SG,Acree KaniKani Acree,RS SO,786-6,185,,,Ball State,,
4,2,PF,Acunzo MattiaMattia Acunzo,RS FR,806-8,225,,,Robert Morris,Youngstown State,"Emiliano Carchia, Sportando"
...,...,...,...,...,...,...,...,...,...,...,...
140,2,PG,Williams ShaunShaun Williams,RS SO,756-3,175,Yes,,Cal State Bakersfield,,
141,2,SF,Witt SaiSai Witt,JR,806-8,230,,,Texas–Rio Grande Valley,,
142,4,PG,Wood ElijahElijah Wood,FR,776-5,175,,,Eastern Illinois,,
143,2,PG,Zambie RyanRyan Zambie,FR,756-3,195,,,Lafayette,,


In [12]:
unidecode.unidecode("32 rue d'Athènes Paris France")

"32 rue d'Athenes Paris France"

In [13]:
#changes form of LAST FIRSTFIRST LAST to first-last that corresponds to SRCBB URL
def to_srcbb(name):
    name = re.sub(r'[\.,\']', '', name[int(len(name)/2):].lower())
    
    suffix = re.search("\s(jr|sr|ii|iii|iv)\Z", name)
    if suffix:
        name = name[:suffix.start()] + name[suffix.start() + 1:]
        
    name = name.replace(' ', "-")
    return unidecode.unidecode(name)
srcbb_name = df['Name'].apply(to_srcbb)
df['Name'] = df['Name'].apply(lambda name: name[int(len(name)/2):])
#df['Ht'] = df['Ht'].apply(lambda ht: ht[ht.index("-") - 1:])


In [14]:
#filters out players that already transferred to new school
transfers = df.loc[df['New School'] == ""].copy().drop(['New School', 'Source'], axis=1)

transfers['Ht'] = transfers['Ht'].apply(lambda ht: ht[:ht.index("-") - 1])
transfers.insert(3, "SRCBB Name", srcbb_name)

In [15]:
#data = transfers.copy().drop(['Stars', 'Name', 'Class', 'Wt', 'Immediately Eligible', 'January Eligible', 'Previous School'], axis=1)
transfers

Unnamed: 0,Stars,Position,Name,SRCBB Name,Class,Ht,Wt,Immediately Eligible,January Eligible,Previous School
1,2,PG,Henry Abraham,henry-abraham,FR,72,175,,,Eastern Illinois
3,2,SG,Kani Acree,kani-acree,RS SO,78,185,,,Ball State
6,2,SG,Daniel Begovich,daniel-begovich,SR,77,205,,,Stanford
8,2,SG,Zion Bethea,zion-bethea,FR,75,205,,,Hofstra
9,2,PG,Troy Boynton,troy-boynton,FR,76,175,,,Evansville
...,...,...,...,...,...,...,...,...,...,...
140,2,PG,Shaun Williams,shaun-williams,RS SO,75,175,Yes,,Cal State Bakersfield
141,2,SF,Sai Witt,sai-witt,JR,80,230,,,Texas–Rio Grande Valley
142,4,PG,Elijah Wood,elijah-wood,FR,77,175,,,Eastern Illinois
143,2,PG,Ryan Zambie,ryan-zambie,FR,75,195,,,Lafayette


In [16]:
players = transfers["SRCBB Name"].tolist()
players

['henry-abraham',
 'kani-acree',
 'daniel-begovich',
 'zion-bethea',
 'troy-boynton',
 'amir-britt',
 'kenny-burns',
 'braelon-bush',
 'tahj-malik-campbell',
 'marsei-caston',
 'chris-childs',
 'maurice-commander',
 'sean-duke',
 'sean-durugordon',
 'greg-eboigbodin',
 'alsean-evans',
 'elijah-farr',
 'xavier-foster',
 'james-grahamiii',
 'richie-greaves',
 'mason-grigg',
 'ismail-habib',
 'gary-harris',
 'noah-haynesworth',
 'dj-heath',
 'vante-hendrix',
 'blake-henry',
 'keaton-hervey',
 'blake-hinson',
 'mckay-howell',
 'robert-hutchens',
 'donovan-ivory',
 'tegra-izay',
 'delveion-jackson',
 'zeb-jackson',
 'dakari-johnson',
 'javan-johnson',
 'tavon-jones',
 'joel-kabimba',
 'martin-kawa',
 'naseem-khaalid',
 'kai-kostmayer',
 'chase-lane',
 'bryce-laskey',
 'zach-light',
 'angel-lopez',
 'david-loville',
 'marko-lukic',
 'dusan-mahorcic',
 'valdir-manuel',
 'travion-mccray',
 'tyce-mcnair',
 'chuka-mekkam',
 'cj-meredith',
 'miles-miller',
 'rayquawndis-mitchell',
 'jules-moor',


In [43]:

#collect data for player into array
m_per_game = ["g", "gs", "mp_per_g", "fg_pct", "pts_per_g", "sos"]
m_per_hundred = ["off_rtg", "def_rtg"]
m_advanced = ["per", "ts_pct", "efg_pct", "usg_pct", "ows", "dws"]
metrics = m_per_game + m_per_hundred + m_advanced

srcbb = "https://www.sports-reference.com/cbb/players/" + 'zion-bethea' +"-1.html"
r_player = requests.get(srcbb)
player = bs(r_player.content, features="html.parser")

# if player.find("table") is None:
#     return None

#Per Game: 
per_game = player.find("table", attrs= {"class": "stats_table sortable row_summable", "id":"players_per_game"})
print(per_game("tr")[-1])
# per_game_2022 = per_game.find("tr", id="players_per_game.2022").find_all("td")
per_game_2022 = per_game("tr")[-1]
d_per_game = [float(cell.text) for cell in per_game_2022 if cell["data-stat"] in m_per_game]

#Per 100 POSS:
per_hundred = player.find("table", attrs= {"class": "stats_table sortable row_summable", "id":"players_per_poss"})
# per_hundred_2022 = per_hundred.find("tr", id="players_per_poss.2022").find_all("td")
per_hundred_2022 = per_hundred("tr")
d_per_hundred = [float(cell.text) for cell in per_hundred_2022 if cell["data-stat"] in m_per_hundred]

#Advanced:
advanced = player.find("table", attrs= {"class": "stats_table sortable row_summable", "id":"players_advanced"})
advanced_2022 = advanced.find("tr", id="players_advanced.2022").find_all("td")
d_advanced = [float(cell.text) for cell in advanced_2022 if cell["data-stat"] in m_advanced]

#join lists together for each player and then append to DATA 
print('ye')
#return d_per_game + d_per_hundred + d_advanced


# pd.DataFrame([data], columns=metrics, index=['player'])

<tr><th class="left" data-stat="season" scope="row">Career</th><td class="left" data-stat="school_name">Hofstra</td><td class="left iz" data-stat="conf_abbr"></td><td class="right" data-stat="g">3</td><td class="right iz" data-stat="gs">0</td><td class="right" data-stat="mp_per_g">3.0</td><td class="right iz" data-stat="fg_per_g">0.0</td><td class="right" data-stat="fga_per_g">1.0</td><td class="right iz" data-stat="fg_pct">.000</td><td class="right iz" data-stat="fg2_per_g">0.0</td><td class="right" data-stat="fg2a_per_g">0.7</td><td class="right iz" data-stat="fg2_pct">.000</td><td class="right iz" data-stat="fg3_per_g">0.0</td><td class="right" data-stat="fg3a_per_g">0.3</td><td class="right iz" data-stat="fg3_pct">.000</td><td class="right iz" data-stat="ft_per_g">0.0</td><td class="right iz" data-stat="fta_per_g">0.0</td><td class="right iz" data-stat="ft_pct"></td><td class="right iz" data-stat="orb_per_g">0.0</td><td class="right iz" data-stat="drb_per_g">0.0</td><td class="righ

AttributeError: 'NoneType' object has no attribute 'find_all'

In [135]:
#collect data for player into array
m_per_game = ["g", "gs", "mp_per_g", "fg_pct", "pts_per_g", "sos"]
m_per_hundred = ["off_rtg", "def_rtg"]
m_advanced = ["per", "ts_pct", "efg_pct", "usg_pct", "ows", "dws"]
metrics = m_per_game + m_per_hundred + m_advanced


tables = {"players_per_game": m_per_game, "players_per_poss": m_per_hundred, "players_advanced": m_advanced}
def agg(name):
    print(name)
    srcbb = "https://www.sports-reference.com/cbb/players/" + name +"-1.html"
    r_player = requests.get(srcbb)
    player = bs(r_player.content, features="html.parser")
    
    if player.find("table") is None:
        return None
    data = []
    for table, metrics in tables.items():
        desired = player.find("table", attrs= {"class": "stats_table sortable row_summable", "id": table})
        most_recent_year = desired.find("tbody").find_all("tr")[-1].find_all("td")
        data += [cell.text for cell in most_recent_year if cell["data-stat"] in metrics]
        
    return data

[agg(player) for player in players]

# pd.DataFrame([data], columns=metrics, index=['player'])

henry-abraham
kani-acree
daniel-begovich
zion-bethea
troy-boynton
amir-britt
kenny-burns
braelon-bush
tahj-malik-campbell
marsei-caston
chris-childs


AttributeError: 'NoneType' object has no attribute 'find'

In [131]:
#collect data for player into array
m_per_game = ["g", "gs", "mp_per_g", "fg_pct", "pts_per_g", "sos"]
m_per_hundred = ["off_rtg", "def_rtg"]
m_advanced = ["per", "ts_pct", "efg_pct", "usg_pct", "ows", "dws"]
metrics = m_per_game + m_per_hundred + m_advanced


tables = {"players_per_game": m_per_game, "players_per_poss": m_per_hundred, "players_advanced": m_advanced}
srcbb = "https://www.sports-reference.com/cbb/players/" + 'maurice-commander' +"-1.html"
r_player = requests.get(srcbb)
player = bs(r_player.content, features="html.parser")

data = []
for table, metrics in tables.items():
    desired = player.find("table", attrs= {"class": "stats_table sortable row_summable", "id": table})
    most_recent_year = desired.find("tbody").find_all("tr")[-1].find_all("td")
    data += [cell.text for cell in most_recent_year if cell["data-stat"] in metrics]
print(data)




# pd.DataFrame([data], columns=metrics, index=['player'])

['15', '12', '32.2', '.413', '10.5', '-7.42', '109.9', '107.3', '12.4', '.571', '.540', '16.7', '0.8', '0.4']


In [130]:
m_per_game = ["g", "gs", "mp_per_g", "fg_pct", "pts_per_g", "sos"]
m_per_hundred = ["off_rtg", "def_rtg"]
m_advanced = ["per", "ts_pct", "efg_pct", "usg_pct", "ows", "dws"]
metrics = m_per_game + m_per_hundred + m_advanced


tables = {"players_per_game": m_per_game, "players_per_poss": m_per_hundred, "players_advanced": m_advanced}
srcbb = "https://www.sports-reference.com/cbb/players/" + 'maurice-commander' +"-1.html"
r_player = requests.get(srcbb)
player = bs(r_player.content, features="html.parser")

data = []

desired = player.find("table", attrs= {"class": "stats_table sortable row_summable", "id": "players_advanced"})
most_recent_year = desired.find("tbody").find_all("tr")[-1].find_all("td")
print(most_recent_year)
data += [cell.text for cell in most_recent_year if cell["data-stat"] in metrics]
print(data)

[<td class="left" data-stat="school_name"><a href="/cbb/schools/illinois-chicago/2021.html">UIC</a></td>, <td class="left" data-stat="conf_abbr"><a href="/cbb/conferences/horizon/2021.html">Horizon</a></td>, <td class="right" data-stat="g">15</td>, <td class="right" data-stat="gs">12</td>, <td class="right" data-stat="mp">483</td>, <td class="right" data-stat="per">12.4</td>, <td class="right" data-stat="ts_pct">.571</td>, <td class="right" data-stat="efg_pct">.540</td>, <td class="right" data-stat="fg3a_per_fga_pct">.587</td>, <td class="right" data-stat="fta_per_fga_pct">.206</td>, <td class="right" data-stat="pprod">147</td>, <td class="right iz" data-stat="orb_pct">0.0</td>, <td class="right" data-stat="drb_pct">7.6</td>, <td class="right" data-stat="trb_pct">3.8</td>, <td class="right" data-stat="ast_pct">13.6</td>, <td class="right" data-stat="stl_pct">1.3</td>, <td class="right iz" data-stat="blk_pct">0.0</td>, <td class="right" data-stat="tov_pct">12.1</td>, <td class="right" d