In [None]:
import requests
import csv
import json

# ESPN per-athlete NBA statistics endpoint used as a str.format template:
# `{page}` and `{year}` are filled in by the callers. The query string fixes
# 50 athletes per page, qualified athletes only, sorted by average minutes
# (descending), and seasontype=2 (presumably the regular season — TODO confirm).
url = "https://site.web.api.espn.com/apis/common/v3/sports/basketball/nba/statistics/byathlete?region=us&lang=en&contentorigin=espn&isqualified=true&page={page}&limit=50&sort=general.avgMinutes%3Adesc&season={year}&seasontype=2"

In [None]:
# Fetch page 1 of the 2023 season as a sample payload; its `categories`
# section is what the following cells use to build the CSV header.
# A timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() surfaces HTTP errors before we try to decode JSON.
resp = requests.get(url.format(year=2023, page=1), timeout=30)
resp.raise_for_status()
data = resp.json()

In [None]:
# Keep a copy of the category metadata (stat names per category) next to the data.
with open('./meta.json', 'w') as meta_file:
  json.dump(data['categories'], meta_file)

In [None]:
# Fixed identity columns first, then the stat names of each category in the
# same general -> offensive -> defensive order that get_lines() emits values.
header = ["year","name", "playerId", "playerSlug", "position", "team", "status"]
for category_name in ('general', 'offensive', 'defensive'):
  header.extend(next(x['names'] for x in data['categories'] if x['name'] == category_name))

In [None]:
header

['year',
 'name',
 'playerId',
 'playerSlug',
 'position',
 'team',
 'status',
 'gamesPlayed',
 'avgMinutes',
 'avgFouls',
 'flagrantFouls',
 'technicalFouls',
 'ejections',
 'doubleDouble',
 'tripleDouble',
 'minutes',
 'rebounds',
 'fouls',
 'avgRebounds',
 'avgPoints',
 'avgFieldGoalsMade',
 'avgFieldGoalsAttempted',
 'fieldGoalPct',
 'avgThreePointFieldGoalsMade',
 'avgThreePointFieldGoalsAttempted',
 'threePointFieldGoalPct',
 'avgFreeThrowsMade',
 'avgFreeThrowsAttempted',
 'freeThrowPct',
 'avgAssists',
 'avgTurnovers',
 'points',
 'fieldGoalsMade',
 'fieldGoalsAttempted',
 'threePointFieldGoalsMade',
 'threePointFieldGoalsAttempted',
 'freeThrowsMade',
 'freeThrowsAttempted',
 'assists',
 'turnovers',
 'avgSteals',
 'avgBlocks',
 'steals',
 'blocks']

In [None]:
def request_data(year, page, timeout=30):
  """Fetch one page of the ESPN per-athlete statistics listing.

  Args:
    year: Value substituted into the ``{year}`` placeholder of ``url``.
    page: Value substituted into the ``{page}`` placeholder of ``url``.
    timeout: Seconds to wait for the server before giving up, so a stalled
      connection cannot hang a multi-season scrape indefinitely.

  Returns:
    The decoded JSON body of the response.

  Raises:
    requests.HTTPError: If the server answers with a 4xx/5xx status.
    requests.Timeout: If no response arrives within ``timeout`` seconds.
  """
  resp = requests.get(url.format(year=year, page=page), timeout=timeout)
  # Fail loudly on an error status instead of decoding an error payload
  # and crashing later on a missing key.
  resp.raise_for_status()
  return resp.json()

In [None]:
def get_lines(data: dict, year: int):
  """Flatten one statistics page into CSV-ready rows, one per athlete.

  Args:
    data: Decoded page payload; must contain an ``'athletes'`` list whose
      items hold an ``'athlete'`` dict and a ``'categories'`` list.
    year: Season value stored as the first cell of every row.

  Returns:
    A list of rows. Each row is the year, six identity fields, then the
    ``'values'`` of the general, offensive and defensive categories.
  """
  category_order = ('general', 'offensive', 'defensive')
  rows = []
  for entry in data['athletes']:
    info = entry['athlete']
    row = [
      year,
      info['displayName'],
      info['id'],
      info['slug'],
      info['position']['abbreviation'],
      info['teamShortName'],
      info['status']['type'],
    ]
    # Categories are looked up by name rather than taken in payload order,
    # so the value order is always general -> offensive -> defensive.
    for wanted in category_order:
      matching = next(c for c in entry['categories'] if c['name'] == wanted)
      row.extend(matching['values'])
    rows.append(row)
  return rows

In [None]:
def process_season(from_year: int, to_year: int)-> list:
  """Collect athlete stat rows for every season in ``[from_year, to_year]``.

  For each season the pages are requested in order, starting at page 1, and
  every page is flattened with ``get_lines``. Paging stops once the page
  counter exceeds the ``pagination.pages`` value of the latest response.

  Args:
    from_year: First season to fetch (inclusive).
    to_year: Last season to fetch (inclusive). If it is smaller than
      ``from_year`` nothing is requested.

  Returns:
    A flat list with the rows of all pages of all requested seasons.
  """
  content = []

  for year in range(from_year, to_year + 1):
    page = 1
    # The first response supplies page 1's rows AND the page count, so it is
    # downloaded once and reused rather than requested a second time.
    data = request_data(year, page)
    while page <= data['pagination']['pages']:
      content.extend(get_lines(data, year))
      page += 1
      # Only hit the network again if there really is a next page.
      if page <= data['pagination']['pages']:
        data = request_data(year, page)

  return content

In [None]:
%%time
linhas = process_season(1999, 2023)


CPU times: user 6.07 s, sys: 327 ms, total: 6.4 s
Wall time: 2min 20s


In [None]:
def write_csv(header: list, content: list, file_name: str = 'athletes'):
  """Write ``header`` followed by ``content`` to ``./<file_name>.csv``.

  Args:
    header: Column names, written as the first row.
    content: Iterable of rows; each row is an iterable of cell values.
    file_name: File name without the ``.csv`` extension; the file is
      created in (or overwritten in) the current working directory.
  """
  # newline='' lets the csv module own the line endings (otherwise Windows
  # gets a blank line after every row). UTF-8 is set explicitly so names with
  # non-ASCII characters are written the same way on every platform and match
  # the default encoding pandas.read_csv uses when the file is loaded back.
  with open("./{file_name}.csv".format(file_name=file_name), 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerows(content)


In [None]:
write_csv(header, linhas, "nba_athletes")

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Reload the scraped season stats from disk.
df = pd.read_csv('./nba_athletes.csv')
# Spot-check one player's rows (playerId 3202) to confirm the CSV round-trips.
df.loc[df['playerId'] == 3202].head()

Unnamed: 0,year,name,playerId,playerSlug,position,team,status,gamesPlayed,avgMinutes,avgFouls,...,threePointFieldGoalsMade,threePointFieldGoalsAttempted,freeThrowsMade,freeThrowsAttempted,assists,turnovers,avgSteals,avgBlocks,steals,blocks
3612,2008,Kevin Durant,3202,kevin-durant,PF,SEA,active,80.0,34.6,1.525,...,59.0,205.0,391.0,448.0,192.0,232.0,0.975,0.9375,78.0,75.0
4006,2009,Kevin Durant,3202,kevin-durant,PF,OKC,active,74.0,38.98649,1.810811,...,97.0,230.0,452.0,524.0,205.0,225.0,1.297297,0.716216,96.0,53.0
4450,2010,Kevin Durant,3202,kevin-durant,PF,OKC,active,82.0,39.5,2.085366,...,128.0,351.0,756.0,840.0,231.0,271.0,1.365854,1.02439,112.0,84.0
4892,2011,Kevin Durant,3202,kevin-durant,PF,OKC,active,78.0,38.94872,2.038461,...,145.0,414.0,594.0,675.0,214.0,218.0,1.128205,0.974359,88.0,76.0
5342,2012,Kevin Durant,3202,kevin-durant,PF,OKC,active,66.0,38.575756,2.015151,...,133.0,344.0,431.0,501.0,231.0,248.0,1.333333,1.166667,88.0,77.0


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11121 entries, 0 to 11120
Data columns (total 44 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   year                              11121 non-null  int64  
 1   name                              11121 non-null  object 
 2   playerId                          11121 non-null  int64  
 3   playerSlug                        11121 non-null  object 
 4   position                          11111 non-null  object 
 5   team                              11121 non-null  object 
 6   status                            11121 non-null  object 
 7   gamesPlayed                       11121 non-null  float64
 8   avgMinutes                        11121 non-null  float64
 9   avgFouls                          11121 non-null  float64
 10  flagrantFouls                     11121 non-null  float64
 11  technicalFouls                    11121 non-null  float64
 12  ejec

In [None]:
# One row per distinct (playerId, playerSlug) pair plus how many season rows
# each pair has; reset_index() already yields a DataFrame.
id_slug_pairs = df[['playerId', 'playerSlug']]
players = id_slug_pairs.value_counts().reset_index()

In [None]:
# Preview the first few id/slug pairs that the biography crawler will receive.
preview = players.head()
for player_id, player_slug in zip(preview['playerId'], preview['playerSlug']):
  print(player_id, player_slug)

136 vince-carter
609 dirk-nowitzki
165 jamal-crawford
2184 udonis-haslem
1966 lebron-james


In [None]:
players.shape

(2275, 3)

In [None]:


import requests
from bs4 import BeautifulSoup
import re

def format_label(x):
    """Normalise a bio label into a lowercase, letters-only key.

    Surrounding whitespace is stripped, the text is lower-cased and every
    character outside ``a-z`` is removed, e.g. ``"HT/WT"`` -> ``"htwt"``.
    """
    return re.sub(r'[^a-z]', '', x.strip().lower())

# Bio items seen on the page include HT/WT, birthdate, college, birthplace and draft info.
def crawler_biography(playerId, playerSlug, timeout=30):
    """Scrape the ESPN bio page of one player into a flat dict.

    Args:
        playerId: ESPN player id, placed in the bio URL.
        playerSlug: ESPN player slug, placed in the bio URL.
        timeout: Seconds to wait for the server before giving up, so one
            stalled request cannot hang a crawl over many players.

    Returns:
        A dict that always holds ``'playerId'`` and ``'playerSlug'``, plus one
        entry per bio item found on the page, keyed by the label normalised
        with ``format_label``. If the page has no ``Card Bio`` section only
        the two id fields are returned; bio items lacking a label or a value
        element are skipped.
    """
    # Named bio_url so it does not shadow the module-level statistics `url`.
    bio_url = "https://www.espn.com/nba/player/bio/_/id/{id}/{slug}".format(id=playerId, slug=playerSlug)
    response = requests.get(bio_url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    record = {'playerId': playerId, 'playerSlug': playerSlug}

    biography_section = soup.find(class_='Card Bio')
    if biography_section is None:
        # No bio card on the page: keep the player with empty bio fields
        # instead of aborting the whole crawl with an AttributeError.
        return record

    for item in biography_section.find_all('div', class_='Bio__Item'):
        label_element = item.find('span', class_='Bio__Label')
        value_element = item.find('span', class_='flex-uniform')
        if label_element is None or value_element is None:
            # Incomplete item: nothing meaningful to record.
            continue
        record[format_label(label_element.text)] = value_element.text.strip()

    return record



In [None]:
# Scrape one biography per row of `players`. Every call performs an HTTP
# request, so this cell is slow; appending inside an explicit loop keeps the
# bios collected so far in `plist` if the crawl is interrupted.
plist = []
for _, p in players.iterrows():
  plist.append(crawler_biography(p['playerId'], p['playerSlug']))

In [None]:
len(plist)

417

In [None]:
plist[1]

{'playerId': 609,
 'playerSlug': 'dirk-nowitzki',
 'position': 'Forward',
 'birthdate': '6/19/1978',
 'draftinfo': '1998: Rd 1, Pk 9 (MIL)',
 'birthplace': 'West Germany'}

In [None]:
import json

# Persist the scraped biographies so later cells can reload them from disk.
with open('atletas.json', 'w') as bios_file:
  json.dump(plist, bios_file)

In [None]:
dfb = pd.read_json('atletas.json')

In [None]:
dfb.shape

(2275, 11)

In [None]:
# NOTE(review): no `index=False` here, so the row index is written as an extra
# leading column, unlike the final `nba_infos.csv` export. Confirm this is intended.
dfb.to_csv('nba_bios.csv')

In [None]:
# First-pass join of bios onto season stats (pd.merge defaults to an inner join).
# NOTE(review): at this point `dfb` still carries its own position/team/status
# columns, which also exist in `df`, so pandas suffixes the overlapping names
# (_x/_y). The join is redone with cleaned-up columns further down.
merged = pd.merge(dfb, df, on=['playerId', 'playerSlug'])

In [None]:
merged.info()

Join dos DataFrames

In [None]:
import pandas as pd


In [None]:
dfb = pd.read_json('atletas.json')

In [None]:
dfa = pd.read_csv('nba_athletes.csv')

In [None]:
# Drop the bio-side `status`; the stats frame already has its own `status` column.
# Reassigning with errors='ignore' (instead of inplace=True) keeps the cell safe
# to re-run: a second run no longer raises KeyError because the column is gone.
dfb = dfb.drop(columns=['status'], errors='ignore')

In [None]:
# Rename the stats-side abbreviations so they do not clash with the bio-side
# `team` / `position` columns when the two frames are merged.
dfa = dfa.rename(columns={"team": "teamId", "position": "positionId"})

In [None]:
dfa.columns

Index(['year', 'name', 'playerId', 'playerSlug', 'positionId', 'teamId',
       'status', 'gamesPlayed', 'avgMinutes', 'avgFouls', 'flagrantFouls',
       'technicalFouls', 'ejections', 'doubleDouble', 'tripleDouble',
       'minutes', 'rebounds', 'fouls', 'avgRebounds', 'avgPoints',
       'avgFieldGoalsMade', 'avgFieldGoalsAttempted', 'fieldGoalPct',
       'avgThreePointFieldGoalsMade', 'avgThreePointFieldGoalsAttempted',
       'threePointFieldGoalPct', 'avgFreeThrowsMade', 'avgFreeThrowsAttempted',
       'freeThrowPct', 'avgAssists', 'avgTurnovers', 'points',
       'fieldGoalsMade', 'fieldGoalsAttempted', 'threePointFieldGoalsMade',
       'threePointFieldGoalsAttempted', 'freeThrowsMade',
       'freeThrowsAttempted', 'assists', 'turnovers', 'avgSteals', 'avgBlocks',
       'steals', 'blocks'],
      dtype='object')

In [None]:
# Attach each player's biography to every one of their season rows.
# The join type is spelled out; 'inner' is what pd.merge uses by default.
merged = pd.merge(
    dfa,
    dfb,
    how='inner',
    on=['playerId', 'playerSlug'],
)

In [None]:
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11121 entries, 0 to 11120
Data columns (total 52 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   year                              11121 non-null  int64  
 1   name                              11121 non-null  object 
 2   playerId                          11121 non-null  int64  
 3   playerSlug                        11121 non-null  object 
 4   positionId                        11111 non-null  object 
 5   teamId                            11121 non-null  object 
 6   status                            11121 non-null  object 
 7   gamesPlayed                       11121 non-null  float64
 8   avgMinutes                        11121 non-null  float64
 9   avgFouls                          11121 non-null  float64
 10  flagrantFouls                     11121 non-null  float64
 11  technicalFouls                    11121 non-null  float64
 12  ejec

In [None]:
dfb.head()

Unnamed: 0,playerId,playerSlug,position,birthdate,college,draftinfo,birthplace,team,htwt,experience
0,136,vince-carter,Guard,1/26/1977,North Carolina,"1998: Rd 1, Pk 5 (GS)","Daytona Beach, FL",,,
1,609,dirk-nowitzki,Forward,6/19/1978,,"1998: Rd 1, Pk 9 (MIL)",West Germany,,,
2,165,jamal-crawford,Guard,3/20/1980,Michigan,"2000: Rd 1, Pk 8 (CLE)","Seattle, WA",,,
3,2184,udonis-haslem,Power Forward,6/9/1980 (43),Florida,,"Miami, FL",Miami Heat,"6' 8"", 235 lbs",19th Season
4,1966,lebron-james,Small Forward,12/30/1984 (38),,"2003: Rd 1, Pk 1 (CLE)","Akron, OH",Los Angeles Lakers,"6' 9"", 250 lbs",19th Season


In [None]:
dfs = pd.read_csv('nba_salaries.csv')

In [None]:
# Keep only the join keys and the salary fields; name/position/team already
# come from the stats and bio frames.
# Reassigning with errors='ignore' (instead of inplace=True) keeps the cell safe
# to re-run: a second run no longer raises KeyError because the columns are gone.
dfs = dfs.drop(columns=['name','position','team'], errors='ignore')

In [None]:
# Make it explicit that this ranking is the salary ranking.
dfs = dfs.rename(columns={"ranking": "rankingSalary"})

In [None]:
dfs

Unnamed: 0,year,rankingSalary,playerId,playerSlug,salary
0,1999,1,614,shaquille-o'neal,"$17,142,000"
1,1999,2,261,kevin-garnett,"$16,806,000"
2,1999,3,580,alonzo-mourning,"$15,004,000"
3,1999,4,351,juwan-howard,"$15,000,000"
4,1999,5,663,scottie-pippen,"$14,795,000"
...,...,...,...,...,...
11308,2023,332,4687718,josh-minott,"$1,719,864"
11309,2023,333,4432582,max-christie,"$1,719,864"
11310,2023,334,4591725,ryan-rollins,"$1,719,864"
11311,2023,335,4868423,jaden-hardy,"$1,719,864"


In [None]:
# Inner join (pd.merge default): only (year, player) pairs present in BOTH the
# stats/bio frame and the salary table end up in the final data set.
# NOTE(review): the salary table's slugs keep punctuation (e.g. "shaquille-o'neal");
# if the stats-side slug is spelled differently those rows are dropped silently.
# Consider joining on ['year', 'playerId'] only, after verifying.
# NOTE(review): `salary` is still a formatted string such as "$17,142,000" here.
fullDataSet =  pd.merge(merged, dfs, on=['year', 'playerId', 'playerSlug'])

In [None]:
fullDataSet.to_csv('nba_infos.csv', index=False)