#### Técnica de web scraping para obtener estadísticas de la web de Basketball-Reference y formar los conjuntos de datos.

In [1]:
# imports necesarios
import os
from urllib.request import urlopen

import pandas as pd
from bs4 import BeautifulSoup

#### URLs de datos

Se van a obtener las estadísticas de la web Basketball-Reference.
Las urls generales con las estadísticas tanto de liga regular como de playoffs de cada temporada y cada jugador son las siguientes:
- https://www.basketball-reference.com/leagues/BAA_1947.html hasta 1949.
- https://www.basketball-reference.com/playoffs/BAA_1947.html hasta 1949.
- https://www.basketball-reference.com/leagues/ABA_1968.html hasta 1976.
- https://www.basketball-reference.com/playoffs/ABA_1968.html hasta 1976.
- https://www.basketball-reference.com/leagues/NBA_1950.html hasta 2020.
- https://www.basketball-reference.com/playoffs/NBA_1950.html hasta 2019.

Para cada una de las urls anteriores se pueden obtener las estadísticas por partido, por 36 minutos, avanzadas y por 100 posesiones (estas últimas solo están disponibles a partir de 1974).
Para obtener todas estas estadísticas se usan las siguientes urls:
- https://www.basketball-reference.com/leagues/BAA_1947_per_game.html hasta 1949.
- https://www.basketball-reference.com/leagues/BAA_1947_per_minute.html hasta 1949.
- https://www.basketball-reference.com/leagues/BAA_1947_advanced.html hasta 1949.
- https://www.basketball-reference.com/playoffs/BAA_1947_per_game.html hasta 1949.
- https://www.basketball-reference.com/playoffs/BAA_1947_per_minute.html hasta 1949.
- https://www.basketball-reference.com/playoffs/BAA_1947_advanced.html hasta 1949.
- https://www.basketball-reference.com/leagues/ABA_1968_per_game.html hasta 1976.
- https://www.basketball-reference.com/leagues/ABA_1968_per_minute.html hasta 1976.
- https://www.basketball-reference.com/leagues/ABA_1968_advanced.html hasta 1976.
- https://www.basketball-reference.com/playoffs/ABA_1968_per_game.html hasta 1976.
- https://www.basketball-reference.com/playoffs/ABA_1968_per_minute.html hasta 1976.
- https://www.basketball-reference.com/playoffs/ABA_1968_advanced.html hasta 1976.
- https://www.basketball-reference.com/leagues/NBA_1950_per_game.html hasta 2020.
- https://www.basketball-reference.com/leagues/NBA_1950_per_minute.html hasta 2020.
- https://www.basketball-reference.com/leagues/NBA_1950_advanced.html hasta 2020.
- https://www.basketball-reference.com/leagues/NBA_1974_per_poss.html hasta 2020.
- https://www.basketball-reference.com/playoffs/NBA_1950_per_game.html hasta 2019.
- https://www.basketball-reference.com/playoffs/NBA_1950_per_minute.html hasta 2019.
- https://www.basketball-reference.com/playoffs/NBA_1950_advanced.html hasta 2019.
- https://www.basketball-reference.com/playoffs/NBA_1974_per_poss.html hasta 2019.

In [2]:
# funciones necesarias

def conseguir_datos(periodo, liga, year, option):
    """Scrape one Basketball-Reference statistics page into a DataFrame.

    Args:
        periodo: URL section; the callers in this notebook pass 'leagues'
            (regular season) or 'playoffs'.
        liga: league code used in the URL ('BAA', 'ABA' or 'NBA' in this notebook).
        year: season year used in the URL; also stored in the 'Year' column.
        option: statistic type used in the URL ('per_game', 'per_minute',
            'advanced' or 'per_poss' in this notebook).

    Returns:
        A DataFrame built from the page's table rows, with a 'Year' column
        added and the rows whose 'Player' value is missing removed.
    """
    url = "https://www.basketball-reference.com/{}/{}_{}_{}.html".format(periodo, liga, year, option)
    # Close the HTTP response as soon as the page is parsed instead of leaking it,
    # and name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed.
    with urlopen(url) as html:
        soup = BeautifulSoup(html, 'html.parser')

    # Collect the table rows once; the first row carries the column headers.
    filas = soup.findAll('tr')
    # The first header is skipped: data rows are built from <td> cells only and,
    # per the original author, that header has no matching value.
    headers = [th.getText() for th in filas[0].findAll('th')][1:]
    # Every remaining row becomes a list with the text of its <td> cells.
    player_stats = [[td.getText() for td in fila.findAll('td')] for fila in filas[1:]]

    # Rows without <td> cells end up with a missing 'Player' and are dropped here.
    df = pd.DataFrame(player_stats, columns=headers)
    df['Year'] = year
    df = df.dropna(subset=['Player'])
    # Drop by position the columns that, according to the original author,
    # come back empty in these two table types.
    if option == 'per_poss':
        df = df.drop(df.columns[[28]], axis='columns')
    elif option == 'advanced':
        df = df.drop(df.columns[[18, 23]], axis='columns')

    return df


def datos_ABA_season(year_1, year_2):
    """Fetch the ABA regular-season statistics for every year in [year_1, year_2].

    The scraping is split into one function per league (ABA / NBA) and per
    period (regular season / playoffs).

    Returns:
        A tuple (game, minute, advanced) of dictionaries keyed by year, each
        value being the DataFrame returned by conseguir_datos for that year.
    """
    game, minute, advanced = {}, {}, {}
    # Pair every result dictionary with the statistic type it stores.
    destinos = ((game, 'per_game'), (minute, 'per_minute'), (advanced, 'advanced'))

    for temporada in range(year_1, year_2 + 1):
        for tabla, tipo in destinos:
            tabla[temporada] = conseguir_datos('leagues', 'ABA', temporada, tipo)

    return game, minute, advanced


def datos_ABA_playoffs(year_1, year_2):
    """Fetch the ABA playoff statistics for every year in [year_1, year_2].

    Returns:
        A tuple (po_game, po_minute, po_advanced) of dictionaries keyed by
        year, each value being the DataFrame returned by conseguir_datos.
    """
    po_game, po_minute, po_advanced = {}, {}, {}
    # Pair every result dictionary with the statistic type it stores.
    destinos = ((po_game, 'per_game'), (po_minute, 'per_minute'), (po_advanced, 'advanced'))

    for temporada in range(year_1, year_2 + 1):
        for tabla, tipo in destinos:
            tabla[temporada] = conseguir_datos('playoffs', 'ABA', temporada, tipo)

    return po_game, po_minute, po_advanced


def datos_NBA_season(year_1, year_2):
    """Fetch the NBA regular-season statistics for every year in [year_1, year_2].

    Years before 1950 are requested with the 'BAA' league code; per-possession
    tables are only requested for years after 1973.

    Returns:
        A tuple (game, minute, advanced, poss) of dictionaries keyed by year;
        poss only has entries for years after 1973.
    """
    game, minute, advanced, poss = {}, {}, {}, {}

    for temporada in range(year_1, year_2 + 1):
        # Per-possession data is requested first, and only from 1974 onwards.
        if temporada > 1973:
            poss[temporada] = conseguir_datos('leagues', 'NBA', temporada, 'per_poss')

        # Seasons before 1950 are published under the BAA code.
        liga = 'BAA' if temporada < 1950 else 'NBA'
        game[temporada] = conseguir_datos('leagues', liga, temporada, 'per_game')
        minute[temporada] = conseguir_datos('leagues', liga, temporada, 'per_minute')
        advanced[temporada] = conseguir_datos('leagues', liga, temporada, 'advanced')

    return game, minute, advanced, poss


def datos_NBA_playoffs(year_1, year_2):
    """Fetch the NBA playoff statistics for every year in [year_1, year_2].

    Years before 1950 are requested with the 'BAA' league code; per-possession
    tables are only requested for years after 1973.

    Returns:
        A tuple (po_game, po_minute, po_advanced, po_poss) of dictionaries
        keyed by year; po_poss only has entries for years after 1973.
    """
    po_game, po_minute, po_advanced, po_poss = {}, {}, {}, {}

    for temporada in range(year_1, year_2 + 1):
        # Per-possession data is requested first, and only from 1974 onwards.
        if temporada > 1973:
            po_poss[temporada] = conseguir_datos('playoffs', 'NBA', temporada, 'per_poss')

        # Seasons before 1950 are published under the BAA code.
        liga = 'BAA' if temporada < 1950 else 'NBA'
        po_game[temporada] = conseguir_datos('playoffs', liga, temporada, 'per_game')
        po_minute[temporada] = conseguir_datos('playoffs', liga, temporada, 'per_minute')
        po_advanced[temporada] = conseguir_datos('playoffs', liga, temporada, 'advanced')

    return po_game, po_minute, po_advanced, po_poss


def generar_csv(diccionario, liga, opcion):
    """Write one CSV file per DataFrame stored in ``diccionario``.

    Args:
        diccionario: mapping whose values are DataFrames; each key (a year in
            this notebook) is converted with str() and embedded in the file name.
        liga: league label used as the file-name prefix.
        opcion: statistic label used as the file-name suffix.

    Files are written to './Datasets/csv/<liga>_<key>_<opcion>.csv' without
    the index column.
    """
    carpeta = './Datasets/csv/'
    # Create the output folder if needed so the notebook also runs on a fresh checkout.
    os.makedirs(carpeta, exist_ok=True)
    for key, df in diccionario.items():
        # The key goes into the file name to tell the files apart.
        df.to_csv(carpeta + liga + '_' + str(key) + '_' + opcion + '.csv', index=False)


def generar_json(diccionario, liga, opcion):
    """Write one JSON file per DataFrame stored in ``diccionario``.

    Args:
        diccionario: mapping whose values are DataFrames; each key (a year in
            this notebook) is converted with str() and embedded in the file name.
        liga: league label used as the file-name prefix.
        opcion: statistic label used as the file-name suffix.

    Files are written to './Datasets/json/<liga>_<key>_<opcion>.json' using
    the 'table' orientation and without the index.
    """
    carpeta = './Datasets/json/'
    # Create the output folder if needed so the notebook also runs on a fresh checkout.
    os.makedirs(carpeta, exist_ok=True)
    for key, df in diccionario.items():
        # The key goes into the file name to tell the files apart.
        df.to_json(carpeta + liga + '_' + str(key) + '_' + opcion + '.json',
                   orient='table', index=False)


def generar_excel(diccionario, liga, opcion):
    """Write one Excel file per DataFrame stored in ``diccionario``.

    Args:
        diccionario: mapping whose values are DataFrames; each key (a year in
            this notebook) is converted with str() and embedded in the file name.
        liga: league label used as the file-name prefix.
        opcion: statistic label used as the file-name suffix.

    Files are written to './Datasets/excel/<liga>_<key>_<opcion>.xlsx'
    without the index column.
    """
    carpeta = './Datasets/excel/'
    # Create the output folder if needed so the notebook also runs on a fresh checkout.
    os.makedirs(carpeta, exist_ok=True)
    for key, df in diccionario.items():
        # The key goes into the file name to tell the files apart.
        df.to_excel(carpeta + liga + '_' + str(key) + '_' + opcion + '.xlsx', index=False)


def concatenar_dataframes(diccionario, liga, opcion):
    """Concatenate every DataFrame stored in ``diccionario`` into a single one.

    The notebook uses it to build one overall DataFrame per statistic type
    (regular season and playoffs) covering all years, from which further
    partitions (for example per team) can be derived.

    Args:
        diccionario: mapping whose values are the DataFrames to concatenate,
            taken in the dictionary's iteration order.
        liga: not used by this function.
        opcion: not used by this function.

    Returns:
        The result of pd.concat over the stored DataFrames; each original
        index is kept as-is.
    """
    return pd.concat([diccionario[key] for key in diccionario])


def generar_ficheros_totales(lista_dfs, liga, opcion):
    """Write each all-seasons DataFrame to CSV, JSON and Excel files.

    Args:
        lista_dfs: iterable of DataFrames to save.
        liga: league label used as the file-name prefix.
        opcion: sequence of labels; opcion[k] names the files of the k-th DataFrame.
    """
    for posicion, df in enumerate(lista_dfs):
        # Same base name for the three formats; only folder and extension change.
        nombre = liga + '_' + opcion[posicion]
        df.to_csv('./Datasets/csv/' + nombre + '.csv', index=False)
        df.to_json('./Datasets/json/' + nombre + '.json', orient='table', index=False)
        df.to_excel('./Datasets/excel/' + nombre + '.xlsx', index=False)


def generar_dataframes_epocas(lista_dfs, sufijos=None):
    """Split each DataFrame by era and save the parts via generar_ficheros_epocas.

    Four subsets are produced from the 'Year' column: without shot clock
    (Year < 1955), with shot clock (Year > 1954), without 3-point line
    (Year < 1980) and with 3-point shots (Year > 1979).

    Args:
        lista_dfs: iterable of DataFrames that contain a 'Year' column.
        sufijos: optional sequence of labels, one per DataFrame. When given,
            sufijos[k] is appended to every era label of the k-th DataFrame
            (e.g. 'no_reloj_game'), so each DataFrame gets its own files.
            When omitted, the original labels are used unchanged, which means
            every DataFrame is written to the same four file names and only
            the last one survives on disk.
    """
    for posicion, df in enumerate(lista_dfs):
        epocas = (
            ('no_reloj', df[df['Year'] < 1955]),
            ('reloj', df[df['Year'] > 1954]),
            ('no_3pts', df[df['Year'] < 1980]),
            ('3pts', df[df['Year'] > 1979]),
        )
        lista_dataframes = tuple(parte for _, parte in epocas)
        lista_option = tuple(nombre for nombre, _ in epocas)
        if sufijos is not None:
            # Make the file names unique per input DataFrame.
            lista_option = tuple(nombre + '_' + sufijos[posicion] for nombre in lista_option)

        generar_ficheros_epocas(lista_dataframes, lista_option)

        
def generar_ficheros_epocas(lista_dataframes, lista_option):
    """Save each DataFrame as CSV, JSON and Excel files prefixed with 'NBA_'.

    Args:
        lista_dataframes: iterable of DataFrames to save.
        lista_option: sequence of labels; lista_option[k] names the files of
            the k-th DataFrame.
    """
    for posicion, df in enumerate(lista_dataframes):
        etiqueta = lista_option[posicion]
        df.to_csv('./Datasets/csv/NBA_' + etiqueta + '.csv', index=False)
        df.to_json('./Datasets/json/NBA_' + etiqueta + '.json', orient='table', index=False)
        df.to_excel('./Datasets/excel/NBA_' + etiqueta + '.xlsx', index=False)


def generar_dataframes_team(lista_dfs, team, option):
    """Save, for one team, the rows of each DataFrame that belong to it.

    Args:
        lista_dfs: iterable of DataFrames that contain a 'Tm' column.
        team: value matched against 'Tm'; also the file-name prefix.
        option: sequence of labels; option[k] names the files of the k-th DataFrame.
    """
    for posicion, df in enumerate(lista_dfs):
        # Keep only the rows of the requested team.
        df_equipo = df[df['Tm'] == team]

        nombre = team + '_' + option[posicion]
        df_equipo.to_csv('./Datasets/csv/' + nombre + '.csv', index=False)
        df_equipo.to_json('./Datasets/json/' + nombre + '.json', orient='table', index=False)
        df_equipo.to_excel('./Datasets/excel/' + nombre + '.xlsx', index=False)

        
def generar_dataframes_teams(lista_dfs, lista_teams, option):
    """Save the per-team datasets for every team in ``lista_teams``.

    Args:
        lista_dfs: DataFrames that contain a 'Tm' column.
        lista_teams: iterable of team values to export.
        option: sequence of labels; option[k] names the files of the k-th DataFrame.
    """
    # Delegate to the single-team writer instead of duplicating its body.
    for team in lista_teams:
        generar_dataframes_team(lista_dfs, team, option)

In [5]:
# fetch the ABA regular-season data (years 1968 through 1976)
aba_game, aba_minute, aba_advanced = datos_ABA_season(1968, 1976)

In [6]:
# fetch the ABA playoff data (years 1968 through 1976)
aba_po_game, aba_po_minute, aba_po_advanced = datos_ABA_playoffs(1968, 1976)

In [7]:
# fetch the NBA regular-season data (years 1947 through 2020)
nba_game, nba_minute, nba_advanced, nba_poss = datos_NBA_season(1947, 2020)

In [9]:
# fetch the NBA playoff data (years 1947 through 2019)
nba_po_game, nba_po_minute, nba_po_advanced, nba_po_poss = datos_NBA_playoffs(1947, 2019)

In [10]:
# write the per-year ABA CSV files
# regular season
generar_csv(aba_game, 'ABA', 'game')
generar_csv(aba_minute, 'ABA', 'minute')
generar_csv(aba_advanced, 'ABA', 'advanced')
# playoffs
generar_csv(aba_po_game, 'ABA', 'po_game')
generar_csv(aba_po_minute, 'ABA', 'po_minute')
generar_csv(aba_po_advanced, 'ABA', 'po_advanced')

In [11]:
# write the per-year NBA CSV files
# regular season
generar_csv(nba_game, 'NBA', 'game')
generar_csv(nba_minute, 'NBA', 'minute')
generar_csv(nba_advanced, 'NBA', 'advanced')
generar_csv(nba_poss, 'NBA', 'poss')
# playoffs
generar_csv(nba_po_game, 'NBA', 'po_game')
generar_csv(nba_po_minute, 'NBA', 'po_minute')
generar_csv(nba_po_advanced, 'NBA', 'po_advanced')
generar_csv(nba_po_poss, 'NBA', 'po_poss')

In [12]:
# write the per-year ABA Excel files
# regular season
generar_excel(aba_game, 'ABA', 'game')
generar_excel(aba_minute, 'ABA', 'minute')
generar_excel(aba_advanced, 'ABA', 'advanced')
# playoffs
generar_excel(aba_po_game, 'ABA', 'po_game')
generar_excel(aba_po_minute, 'ABA', 'po_minute')
generar_excel(aba_po_advanced, 'ABA', 'po_advanced')

In [13]:
# write the per-year NBA Excel files
# regular season
generar_excel(nba_game, 'NBA', 'game')
generar_excel(nba_minute, 'NBA', 'minute')
generar_excel(nba_advanced, 'NBA', 'advanced')
generar_excel(nba_poss, 'NBA', 'poss')
# playoffs
generar_excel(nba_po_game, 'NBA', 'po_game')
generar_excel(nba_po_minute, 'NBA', 'po_minute')
generar_excel(nba_po_advanced, 'NBA', 'po_advanced')
generar_excel(nba_po_poss, 'NBA', 'po_poss')

In [14]:
# write the per-year ABA JSON files
# regular season
generar_json(aba_game, 'ABA', 'game')
generar_json(aba_minute, 'ABA', 'minute')
generar_json(aba_advanced, 'ABA', 'advanced')
# playoffs
generar_json(aba_po_game, 'ABA', 'po_game')
generar_json(aba_po_minute, 'ABA', 'po_minute')
generar_json(aba_po_advanced, 'ABA', 'po_advanced')

In [15]:
# write the per-year NBA JSON files
# regular season
generar_json(nba_game, 'NBA', 'game')
generar_json(nba_minute, 'NBA', 'minute')
generar_json(nba_advanced, 'NBA', 'advanced')
generar_json(nba_poss, 'NBA', 'poss')
# playoffs
generar_json(nba_po_game, 'NBA', 'po_game')
generar_json(nba_po_minute, 'NBA', 'po_minute')
generar_json(nba_po_advanced, 'NBA', 'po_advanced')
generar_json(nba_po_poss, 'NBA', 'po_poss')

In [16]:
# listar directorios para comprobar los ficheros generados
from os import scandir, getcwd

def ls(ruta=None):
    """Return the names of the entries in ``ruta`` for which is_file() is true.

    Args:
        ruta: directory to scan. Defaults to the current working directory,
            resolved when the function is called.

    Returns:
        A list of entry names, in the order yielded by scandir.
    """
    if ruta is None:
        # A `getcwd()` default argument is evaluated once, when the function is
        # defined, so it would keep pointing at a stale directory after a chdir.
        ruta = getcwd()
    return [arch.name for arch in scandir(ruta) if arch.is_file()]

ls('./Datasets/csv/')

['NBA_2014_po_game.csv',
 'NBA_2000_minute.csv',
 'NBA_1988_po_advanced.csv',
 'NBA_1953_po_minute.csv',
 'NBA_1964_po_game.csv',
 'NBA_1999_po_game.csv',
 'NBA_2011_po_advanced.csv',
 'NBA_1994_minute.csv',
 'NBA_2019_advanced.csv',
 'NBA_2002_game.csv',
 'NBA_2003_game.csv',
 'NBA_1967_game.csv',
 'NBA_1969_po_advanced.csv',
 'NBA_1966_game.csv',
 'NBA_1954_po_advanced.csv',
 'NBA_2020_poss.csv',
 'NBA_1980_po_poss.csv',
 'NBA_1957_po_minute.csv',
 'NBA_1970_po_advanced.csv',
 'NBA_1982_game.csv',
 'NBA_1983_game.csv',
 'NBA_1956_advanced.csv',
 'NBA_1985_po_game.csv',
 'NBA_1978_po_game.csv',
 'NBA_1977_minute.csv',
 'NBA_2008_po_advanced.csv',
 'NBA_1951_advanced.csv',
 'NBA_1988_minute.csv',
 'NBA_1991_po_advanced.csv',
 'NBA_2008_po_game.csv',
 'NBA_1956_po_game.csv',
 'ABA_1976_po_advanced.csv',
 'NBA_2011_po_poss.csv',
 'ABA_1973_po_game.csv',
 'NBA_1979_po_minute.csv',
 'NBA_1959_game.csv',
 'NBA_1958_game.csv',
 'NBA_2007_po_advanced.csv',
 'NBA_2006_po_minute.csv',
 'NBA_196

In [17]:
# load a sample CSV file and print its number of rows and columns
df_csv = pd.read_csv('./Datasets/csv/NBA_2019_game.csv', sep=',')
print("Dataset NBA 2019 per game: num_rows: %d\tColumnas: %d\n" % (df_csv.shape[0], df_csv.shape[1]))

Dataset NBA 2019 per game: num_rows: 708	Columnas: 30



In [18]:
# show the first 10 rows of the sample loaded from CSV
df_csv.head(10)

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
0,Álex Abrines,SG,25,OKC,31,2,19.0,1.8,5.1,0.357,...,0.2,1.4,1.5,0.6,0.5,0.2,0.5,1.7,5.3,2019
1,Quincy Acy,PF,28,PHO,10,0,12.3,0.4,1.8,0.222,...,0.3,2.2,2.5,0.8,0.1,0.4,0.4,2.4,1.7,2019
2,Jaylen Adams,PG,22,ATL,34,1,12.6,1.1,3.2,0.345,...,0.3,1.4,1.8,1.9,0.4,0.1,0.8,1.3,3.2,2019
3,Steven Adams,C,25,OKC,80,80,33.4,6.0,10.1,0.595,...,4.9,4.6,9.5,1.6,1.5,1.0,1.7,2.6,13.9,2019
4,Bam Adebayo,C,21,MIA,82,28,23.3,3.4,5.9,0.576,...,2.0,5.3,7.3,2.2,0.9,0.8,1.5,2.5,8.9,2019
5,Deng Adel,SF,21,CLE,19,3,10.2,0.6,1.9,0.306,...,0.2,0.8,1.0,0.3,0.1,0.2,0.3,0.7,1.7,2019
6,DeVaughn Akoon-Purcell,SG,25,DEN,7,0,3.1,0.4,1.4,0.3,...,0.1,0.4,0.6,0.9,0.3,0.0,0.3,0.6,1.0,2019
7,LaMarcus Aldridge,C,33,SAS,81,81,33.2,8.4,16.3,0.519,...,3.1,6.1,9.2,2.4,0.5,1.3,1.8,2.2,21.3,2019
8,Rawle Alkins,SG,21,CHI,10,1,12.0,1.3,3.9,0.333,...,1.1,1.5,2.6,1.3,0.1,0.0,0.8,0.7,3.7,2019
9,Grayson Allen,SG,23,UTA,38,2,10.9,1.8,4.7,0.376,...,0.1,0.5,0.6,0.7,0.2,0.2,0.9,1.2,5.6,2019


In [19]:
# load a sample Excel file
df_excel = pd.read_excel('./Datasets/excel/NBA_2018_game.xlsx')

In [20]:
# show the first 10 rows of the sample loaded from Excel
df_excel.head(10)

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
0,Álex Abrines,SG,24,OKC,75,8,15.1,1.5,3.9,0.395,...,0.3,1.2,1.5,0.4,0.5,0.1,0.3,1.7,4.7,2018
1,Quincy Acy,PF,27,BRK,70,8,19.4,1.9,5.2,0.356,...,0.6,3.1,3.7,0.8,0.5,0.4,0.9,2.1,5.9,2018
2,Steven Adams,C,24,OKC,76,76,32.7,5.9,9.4,0.629,...,5.1,4.0,9.0,1.2,1.2,1.0,1.7,2.8,13.9,2018
3,Bam Adebayo,C,20,MIA,69,19,19.8,2.5,4.9,0.512,...,1.7,3.8,5.5,1.5,0.5,0.6,1.0,2.0,6.9,2018
4,Arron Afflalo,SG,32,ORL,53,3,12.9,1.2,3.1,0.401,...,0.1,1.2,1.2,0.6,0.1,0.2,0.4,1.1,3.4,2018
5,Cole Aldrich,C,29,MIN,21,0,2.3,0.2,0.7,0.333,...,0.1,0.6,0.7,0.1,0.1,0.0,0.0,0.5,0.6,2018
6,LaMarcus Aldridge,C,32,SAS,75,75,33.5,9.2,18.0,0.51,...,3.3,5.2,8.5,2.0,0.6,1.2,1.5,2.1,23.1,2018
7,Jarrett Allen,C,19,BRK,72,31,20.0,3.3,5.5,0.589,...,2.0,3.4,5.4,0.7,0.4,1.2,1.1,2.0,8.2,2018
8,Kadeem Allen,PG,25,BOS,18,1,5.9,0.3,1.2,0.273,...,0.2,0.4,0.6,0.7,0.2,0.1,0.5,0.8,1.1,2018
9,Tony Allen,SF,36,NOP,22,0,12.4,2.0,4.1,0.484,...,0.9,1.2,2.1,0.4,0.5,0.1,0.9,2.2,4.7,2018


In [21]:
# load a sample JSON file (written with the 'table' orientation)
df_json = pd.read_json('./Datasets/json/NBA_2017_game.json', orient='table')

In [22]:
# show the first 10 rows of the sample loaded from JSON
df_json.head(10)

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
0,Álex Abrines,SG,23,OKC,68,6,15.5,2.0,5.0,0.393,...,0.3,1.0,1.3,0.6,0.5,0.1,0.5,1.7,6.0,2017
1,Quincy Acy,PF,26,TOT,38,1,14.7,1.8,4.5,0.412,...,0.5,2.5,3.0,0.5,0.4,0.4,0.6,1.8,5.8,2017
2,Quincy Acy,PF,26,DAL,6,0,8.0,0.8,2.8,0.294,...,0.3,1.0,1.3,0.0,0.0,0.0,0.3,1.5,2.2,2017
3,Quincy Acy,PF,26,BRK,32,1,15.9,2.0,4.8,0.425,...,0.6,2.8,3.3,0.6,0.4,0.5,0.6,1.8,6.5,2017
4,Steven Adams,C,23,OKC,80,80,29.9,4.7,8.2,0.571,...,3.5,4.2,7.7,1.1,1.1,1.0,1.8,2.4,11.3,2017
5,Arron Afflalo,SG,31,SAC,61,45,25.9,3.0,6.9,0.44,...,0.1,1.9,2.0,1.3,0.3,0.1,0.7,1.7,8.4,2017
6,Alexis Ajinça,C,28,NOP,39,15,15.0,2.3,4.6,0.5,...,1.2,3.4,4.5,0.3,0.5,0.6,0.8,2.0,5.3,2017
7,Cole Aldrich,C,28,MIN,62,0,8.6,0.7,1.4,0.523,...,0.8,1.7,2.5,0.4,0.4,0.4,0.3,1.4,1.7,2017
8,LaMarcus Aldridge,PF,31,SAS,72,72,32.4,6.9,14.6,0.477,...,2.4,4.9,7.3,1.9,0.6,1.2,1.4,2.2,17.3,2017
9,Lavoy Allen,PF,27,IND,61,5,14.3,1.3,2.8,0.458,...,1.7,1.9,3.6,0.9,0.3,0.4,0.5,1.3,2.9,2017


#### Ver columnas que tiene un dataset por cada tipo de estadística: per game, 36 minutes, advanced y por posesión.

In [23]:
# load one 2019 sample per statistic type and inspect its columns
df_csv_games = pd.read_csv('./Datasets/csv/NBA_2019_game.csv', sep=',')
df_csv_minutes = pd.read_csv('./Datasets/csv/NBA_2019_minute.csv', sep=',')
df_csv_advanced = pd.read_csv('./Datasets/csv/NBA_2019_advanced.csv', sep=',')
df_csv_posesion = pd.read_csv('./Datasets/csv/NBA_2019_poss.csv', sep=',')

# per-game sample: column names, then the first 10 rows
print(df_csv_games.columns)
df_csv_games.head(10)

Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year'],
      dtype='object')


Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
0,Álex Abrines,SG,25,OKC,31,2,19.0,1.8,5.1,0.357,...,0.2,1.4,1.5,0.6,0.5,0.2,0.5,1.7,5.3,2019
1,Quincy Acy,PF,28,PHO,10,0,12.3,0.4,1.8,0.222,...,0.3,2.2,2.5,0.8,0.1,0.4,0.4,2.4,1.7,2019
2,Jaylen Adams,PG,22,ATL,34,1,12.6,1.1,3.2,0.345,...,0.3,1.4,1.8,1.9,0.4,0.1,0.8,1.3,3.2,2019
3,Steven Adams,C,25,OKC,80,80,33.4,6.0,10.1,0.595,...,4.9,4.6,9.5,1.6,1.5,1.0,1.7,2.6,13.9,2019
4,Bam Adebayo,C,21,MIA,82,28,23.3,3.4,5.9,0.576,...,2.0,5.3,7.3,2.2,0.9,0.8,1.5,2.5,8.9,2019
5,Deng Adel,SF,21,CLE,19,3,10.2,0.6,1.9,0.306,...,0.2,0.8,1.0,0.3,0.1,0.2,0.3,0.7,1.7,2019
6,DeVaughn Akoon-Purcell,SG,25,DEN,7,0,3.1,0.4,1.4,0.3,...,0.1,0.4,0.6,0.9,0.3,0.0,0.3,0.6,1.0,2019
7,LaMarcus Aldridge,C,33,SAS,81,81,33.2,8.4,16.3,0.519,...,3.1,6.1,9.2,2.4,0.5,1.3,1.8,2.2,21.3,2019
8,Rawle Alkins,SG,21,CHI,10,1,12.0,1.3,3.9,0.333,...,1.1,1.5,2.6,1.3,0.1,0.0,0.8,0.7,3.7,2019
9,Grayson Allen,SG,23,UTA,38,2,10.9,1.8,4.7,0.376,...,0.1,0.5,0.6,0.7,0.2,0.2,0.9,1.2,5.6,2019


In [24]:
# per-36-minutes sample: column names, then the first 10 rows
print(df_csv_minutes.columns)
df_csv_minutes.head(10)

Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB',
       'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year'],
      dtype='object')


Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
0,Álex Abrines,SG,25,OKC,31,2,588,3.4,9.6,0.357,...,0.3,2.6,2.9,1.2,1.0,0.4,0.9,3.2,10.1,2019
1,Quincy Acy,PF,28,PHO,10,0,123,1.2,5.3,0.222,...,0.9,6.4,7.3,2.3,0.3,1.2,1.2,7.0,5.0,2019
2,Jaylen Adams,PG,22,ATL,34,1,428,3.2,9.3,0.345,...,0.9,4.1,5.0,5.5,1.2,0.4,2.4,3.8,9.1,2019
3,Steven Adams,C,25,OKC,80,80,2669,6.5,10.9,0.595,...,5.3,5.0,10.3,1.7,1.6,1.0,1.8,2.8,14.9,2019
4,Bam Adebayo,C,21,MIA,82,28,1913,5.3,9.1,0.576,...,3.1,8.1,11.2,3.5,1.3,1.2,2.3,3.8,13.7,2019
5,Deng Adel,SF,21,CLE,19,3,194,2.0,6.7,0.306,...,0.6,3.0,3.5,0.9,0.2,0.7,1.1,2.4,5.9,2019
6,DeVaughn Akoon-Purcell,SG,25,DEN,7,0,22,4.9,16.4,0.3,...,1.6,4.9,6.5,9.8,3.3,0.0,3.3,6.5,11.5,2019
7,LaMarcus Aldridge,C,33,SAS,81,81,2687,9.2,17.7,0.519,...,3.4,6.6,10.0,2.6,0.6,1.4,1.9,2.4,23.1,2019
8,Rawle Alkins,SG,21,CHI,10,1,120,3.9,11.7,0.333,...,3.3,4.5,7.8,3.9,0.3,0.0,2.4,2.1,11.1,2019
9,Grayson Allen,SG,23,UTA,38,2,416,5.8,15.4,0.376,...,0.3,1.7,2.0,2.2,0.5,0.5,2.9,4.1,18.3,2019


In [25]:
# advanced-statistics sample: column names, then the first 10 rows
print(df_csv_advanced.columns)
df_csv_advanced.head(10)

Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'MP', 'PER', 'TS%', '3PAr', 'FTr',
       'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS',
       'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP', 'Year'],
      dtype='object')


Unnamed: 0,Player,Pos,Age,Tm,G,MP,PER,TS%,3PAr,FTr,...,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,Year
0,Álex Abrines,SG,25,OKC,31,588,6.3,0.507,0.809,0.083,...,12.2,0.1,0.6,0.6,0.053,-2.4,-0.9,-3.4,-0.2,2019
1,Quincy Acy,PF,28,PHO,10,123,2.9,0.379,0.833,0.556,...,9.2,-0.1,0.0,-0.1,-0.022,-5.7,-0.3,-5.9,-0.1,2019
2,Jaylen Adams,PG,22,ATL,34,428,7.6,0.474,0.673,0.082,...,13.5,-0.1,0.2,0.1,0.011,-3.1,-1.3,-4.4,-0.3,2019
3,Steven Adams,C,25,OKC,80,2669,18.5,0.591,0.002,0.361,...,16.4,5.1,4.0,9.1,0.163,0.6,2.1,2.7,3.2,2019
4,Bam Adebayo,C,21,MIA,82,1913,17.9,0.623,0.031,0.465,...,15.8,3.4,3.4,6.8,0.171,-0.6,3.6,3.0,2.4,2019
5,Deng Adel,SF,21,CLE,19,194,2.7,0.424,0.639,0.111,...,9.9,-0.2,0.0,-0.2,-0.054,-5.3,-2.0,-7.3,-0.3,2019
6,DeVaughn Akoon-Purcell,SG,25,DEN,7,22,8.2,0.322,0.4,0.2,...,25.0,-0.1,0.0,0.0,-0.051,-4.4,0.8,-3.6,0.0,2019
7,LaMarcus Aldridge,C,33,SAS,81,2687,22.9,0.576,0.032,0.312,...,26.9,6.4,2.9,9.3,0.167,1.2,0.5,1.6,2.5,2019
8,Rawle Alkins,SG,21,CHI,10,120,8.1,0.418,0.308,0.308,...,19.0,-0.1,0.0,-0.1,-0.042,-4.1,-2.5,-6.5,-0.1,2019
9,Grayson Allen,SG,23,UTA,38,416,7.5,0.516,0.556,0.337,...,24.4,-0.4,0.4,0.0,0.002,-3.7,-3.1,-6.8,-0.5,2019


In [26]:
# per-possession sample: column names, then the first 10 rows
print(df_csv_posesion.columns)
df_csv_posesion.head(10)

Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB',
       'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'ORtg', 'DRtg', 'Year'],
      dtype='object')


Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,TRB,AST,STL,BLK,TOV,PF,PTS,ORtg,DRtg,Year
0,Álex Abrines,SG,25,OKC,31,2,588,4.4,12.5,0.357,...,3.8,1.6,1.3,0.5,1.1,4.2,13.1,103.0,111,2019
1,Quincy Acy,PF,28,PHO,10,0,123,1.6,7.0,0.222,...,9.7,3.1,0.4,1.6,1.6,9.3,6.6,87.0,116,2019
2,Jaylen Adams,PG,22,ATL,34,1,428,4.1,11.9,0.345,...,6.5,7.0,1.5,0.5,3.0,4.9,11.7,99.0,115,2019
3,Steven Adams,C,25,OKC,80,80,2669,8.4,14.1,0.595,...,13.3,2.2,2.0,1.3,2.4,3.6,19.4,120.0,106,2019
4,Bam Adebayo,C,21,MIA,82,28,1913,7.2,12.4,0.576,...,15.2,4.7,1.8,1.7,3.1,5.2,18.6,120.0,104,2019
5,Deng Adel,SF,21,CLE,19,3,194,2.8,9.2,0.306,...,4.9,1.3,0.3,1.0,1.5,3.3,8.2,85.0,121,2019
6,DeVaughn Akoon-Purcell,SG,25,DEN,7,0,22,6.7,22.3,0.3,...,8.9,13.4,4.5,0.0,4.5,8.9,15.6,84.0,104,2019
7,LaMarcus Aldridge,C,33,SAS,81,81,2687,12.4,24.0,0.519,...,13.5,3.5,0.8,1.9,2.6,3.3,31.4,117.0,110,2019
8,Rawle Alkins,SG,21,CHI,10,1,120,5.3,15.8,0.333,...,10.5,5.3,0.4,0.0,3.2,2.8,15.0,93.0,117,2019
9,Grayson Allen,SG,23,UTA,38,2,416,7.7,20.5,0.376,...,2.6,2.9,0.7,0.7,3.8,5.4,24.3,95.0,111,2019


###### Datasets estadísticas por partido
- Pos -- Position
- Age -- Age of Player at the start of February 1st of that season.
- Tm -- Team
- G -- Games
- GS -- Games Started
- MP -- Minutes Played Per Game
- FG -- Field Goals Per Game
- FGA -- Field Goal Attempts Per Game
- FG% -- Field Goal Percentage
- 3P -- 3-Point Field Goals Per Game
- 3PA -- 3-Point Field Goal Attempts Per Game
- 3P% -- 3-Point Field Goal Percentage
- 2P -- 2-Point Field Goals Per Game
- 2PA -- 2-Point Field Goal Attempts Per Game
- 2P% -- 2-Point Field Goal Percentage
- eFG% -- Effective Field Goal Percentage
- FT -- Free Throws Per Game
- FTA -- Free Throw Attempts Per Game
- FT% -- Free Throw Percentage
- ORB -- Offensive Rebounds Per Game
- DRB -- Defensive Rebounds Per Game
- TRB -- Total Rebounds Per Game
- AST -- Assists Per Game
- STL -- Steals Per Game
- BLK -- Blocks Per Game
- TOV -- Turnovers Per Game
- PF -- Personal Fouls Per Game
- PTS -- Points Per Game


###### Datasets estadísticas por 36 minutos
- Pos -- Position
- Age -- Age of Player at the start of February 1st of that season.
- Tm -- Team
- G -- Games
- GS -- Games Started
- MP -- Minutes Played
- FG -- Field Goals Per 36 Minutes
- FGA -- Field Goal Attempts Per 36 Minutes
- FG% -- Field Goal Percentage
- 3P -- 3-Point Field Goals Per 36 Minutes
- 3PA -- 3-Point Field Goal Attempts Per 36 Minutes
- 3P% -- 3-Point Field Goal Percentage
- 2P -- 2-Point Field Goals Per 36 Minutes
- 2PA -- 2-Point Field Goal Attempts Per 36 Minutes
- 2P% -- 2-Point Field Goal Percentage
- FT -- Free Throws Per 36 Minutes
- FTA -- Free Throw Attempts Per 36 Minutes
- FT% -- Free Throw Percentage
- ORB -- Offensive Rebounds Per 36 Minutes
- DRB -- Defensive Rebounds Per 36 Minutes
- TRB -- Total Rebounds Per 36 Minutes
- AST -- Assists Per 36 Minutes
- STL -- Steals Per 36 Minutes
- BLK -- Blocks Per 36 Minutes
- TOV -- Turnovers Per 36 Minutes
- PF -- Personal Fouls Per 36 Minutes
- PTS -- Points Per 36 Minutes


###### Datasets estadísticas avanzadas
- Pos -- Position
- Age -- Age of Player at the start of February 1st of that season.
- Tm -- Team
- G -- Games
- MP -- Minutes Played
- PER -- Player Efficiency Rating. Una medida de producción por minuto estandarizada de modo que el promedio de la liga sea 15.
- TS% -- True Shooting Percentage. Porcentaje de tiro real. Una medida de eficiencia de tiro que tiene en cuenta los tiros de 2 puntos, los tiros de 3 puntos y los tiros libres.
- 3PAr -- 3-Point Attempt Rate. Porcentaje de intentos de FG desde el rango de 3 puntos.
- FTr -- Free Throw Attempt Rate. Número de intentos de FT por intento de FG.
- ORB% -- Offensive Rebound Percentage. Porcentaje de rebote ofensivo. Una estimación del porcentaje de rebotes ofensivos disponibles que un jugador agarró mientras estaba jugando.
- DRB% -- Defensive Rebound Percentage. Porcentaje de rebote defensivo. Una estimación del porcentaje de rebotes defensivos disponibles que un jugador agarró mientras estaba jugando.
- TRB% -- Total Rebound Percentage. Porcentaje de rebote total. Una estimación del porcentaje de rebotes disponibles que un jugador agarró mientras estaba jugando.
- AST% -- Assist Percentage. Porcentaje de asistencia. Una estimación del porcentaje de canastas de compañeros de equipo a los que ayudó un jugador mientras estaba jugando.
- STL% -- Steal Percentage. Porcentaje de robo. Una estimación del porcentaje de posesiones del oponente que termina con un robo por parte del jugador mientras estaba jugando.
- BLK% -- Block Percentage. Porcentaje de bloqueo. Una estimación del porcentaje de intentos de tiros de campo de dos puntos del oponente bloqueados por el jugador mientras estaba jugando.
- TOV% -- Turnover Percentage. Porcentaje de pérdidas. Una estimación de pérdidas de balón cometidas por cada 100 jugadas.
- USG% -- Usage Percentage. Porcentaje de uso. Una estimación del porcentaje de jugadas de equipo utilizadas por un jugador mientras estaba jugando.
- OWS -- Offensive Win Shares. Acciones ofensivas de victoria. Una estimación del número de victorias aportadas por un jugador debido a su ataque.
- DWS -- Defensive Win Shares. Acciones de victoria defensivas. Una estimación del número de victorias aportadas por un jugador debido a su defensa.
- WS -- Win Shares. Una estimación del número de victorias aportadas por un jugador.
- WS/48 -- Win Shares Per 48 Minutes. Una estimación del número de victorias aportadas por un jugador por 48 minutos (el promedio de la liga es aproximadamente .100)
- OBPM -- Offensive Box Plus/Minus. Una estimación de puntuación más/menos de los puntos ofensivos por 100 posesiones que un jugador contribuyó por encima de un jugador promedio de la liga, traducido a un equipo promedio.
- DBPM -- Defensive Box Plus/Minus. Una estimación de puntuación más/menos de los puntos defensivos por 100 posesiones que un jugador contribuyó por encima de un jugador promedio de la liga, traducido a un equipo promedio.
- BPM -- Box Plus/Minus. Una estimación de puntuación más/menos de los puntos por 100 posesiones que un jugador contribuyó por encima de un jugador promedio de la liga, traducido a un equipo promedio.
- VORP -- Value over Replacement Player. Valor sobre el jugador de reemplazo. Una estimación de puntuación más/menos de los puntos por 100 posesiones de EQUIPO que un jugador contribuyó por encima de un jugador de nivel de reemplazo (-2.0), traducido a un equipo promedio y prorrateado a una temporada de 82 juegos. Multiplique por 2.70 para convertir a victorias sobre reemplazo.


###### Datasets estadísticas por posesión
- Pos -- Position
- Age -- Age of Player at the start of February 1st of that season.
- Tm -- Team
- G -- Games
- GS -- Games Started
- MP -- Minutes Played
- FG -- Field Goals Per 100 Team Possessions
- FGA -- Field Goal Attempts Per 100 Team Possessions
- FG% -- Field Goal Percentage
- 3P -- 3-Point Field Goals Per 100 Team Possessions
- 3PA -- 3-Point Field Goal Attempts Per 100 Team Possessions
- 3P% -- 3-Point Field Goal Percentage
- 2P -- 2-Point Field Goals Per 100 Team Possessions
- 2PA -- 2-Point Field Goal Attempts Per 100 Team Possessions
- 2P% -- 2-Point Field Goal Percentage
- FT -- Free Throws Per 100 Team Possessions
- FTA -- Free Throw Attempts Per 100 Team Possessions
- FT% -- Free Throw Percentage
- ORB -- Offensive Rebounds Per 100 Team Possessions
- DRB -- Defensive Rebounds Per 100 Team Possessions
- TRB -- Total Rebounds Per 100 Team Possessions
- AST -- Assists Per 100 Team Possessions
- STL -- Steals Per 100 Team Possessions
- BLK -- Blocks Per 100 Team Possessions
- TOV -- Turnovers Per 100 Team Possessions
- PF -- Personal Fouls Per 100 Team Possessions
- PTS -- Points Per 100 Team Possessions
- ORtg -- Offensive Rating. An estimate of points produced (players) or scored (teams) per 100 possessions
- DRtg -- Defensive Rating. An estimate of points allowed per 100 possessions

#### Concatenar dataframes en unos generales, y posteriormente generar unos por cada equipo.

In [27]:
# Merge all the ABA data into one dataframe per statistic type, separately for the regular
# season and the playoffs, so that combined files can be generated later instead of keeping
# the data split by year.

# Regular season.
df_ABA_season_game, df_ABA_season_minute, df_ABA_season_advanced = (
    concatenar_dataframes(datos, 'ABA', tipo)
    for datos, tipo in (
        (aba_game, 'game'),
        (aba_minute, 'minute'),
        (aba_advanced, 'advanced'),
    )
)

# Playoffs.
df_ABA_playoffs_game, df_ABA_playoffs_minute, df_ABA_playoffs_advanced = (
    concatenar_dataframes(datos, 'ABA', tipo)
    for datos, tipo in (
        (aba_po_game, 'po_game'),
        (aba_po_minute, 'po_minute'),
        (aba_po_advanced, 'po_advanced'),
    )
)

In [28]:
# Merge all the NBA data into one dataframe per statistic type, separately for the regular
# season and the playoffs, so that combined files can be generated later instead of keeping
# the data split by year.

# Regular season.
df_NBA_season_game, df_NBA_season_minute, df_NBA_season_advanced, df_NBA_season_poss = (
    concatenar_dataframes(datos, 'NBA', tipo)
    for datos, tipo in (
        (nba_game, 'game'),
        (nba_minute, 'minute'),
        (nba_advanced, 'advanced'),
        (nba_poss, 'poss'),
    )
)

# Playoffs.
df_NBA_playoffs_game, df_NBA_playoffs_minute, df_NBA_playoffs_advanced, df_NBA_playoffs_poss = (
    concatenar_dataframes(datos, 'NBA', tipo)
    for datos, tipo in (
        (nba_po_game, 'po_game'),
        (nba_po_minute, 'po_minute'),
        (nba_po_advanced, 'po_advanced'),
        (nba_po_poss, 'po_poss'),
    )
)

In [29]:
# Report the size of one of the combined dataframes (NBA regular season, per game).
# DataFrame.shape is already the (rows, columns) pair that the two %d placeholders consume.
print("Dataset NBA per game: num_rows: %d\tColumnas: %d\n" % df_NBA_season_game.shape)

Dataset NBA per game: num_rows: 27069	Columnas: 30



In [30]:
# Preview the first 20 rows of the combined NBA regular-season per-game dataframe.
df_NBA_season_game.head(20)

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
0,John Abramovic,F,27,PIT,47,,,4.3,17.7,0.242,...,,,,0.7,,,,3.4,11.2,1947
1,Chet Aubuchon,G,30,DTF,30,,,0.8,3.0,0.253,...,,,,0.7,,,,1.5,2.2,1947
2,Norm Baker,G,23,CHS,4,,,0.0,0.3,0.0,...,,,,0.0,,,,0.0,0.0,1947
3,Herschel Baltimore,F,25,STB,58,,,0.9,4.5,0.202,...,,,,0.3,,,,1.7,2.4,1947
4,John Barr,F,28,STB,58,,,2.1,7.6,0.283,...,,,,0.9,,,,2.8,5.1,1947
5,Frankie Baumholtz,G,28,CLR,45,,,5.7,19.0,0.298,...,,,,1.2,,,,2.1,14.0,1947
6,Moe Becker,G-F,29,TOT,43,,,1.6,8.3,0.196,...,,,,0.7,,,,2.3,3.8,1947
7,Moe Becker,G-F,29,PIT,17,,,2.7,13.5,0.201,...,,,,0.8,,,,2.9,6.4,1947
8,Moe Becker,G-F,29,BOS,6,,,0.8,3.7,0.227,...,,,,0.2,,,,2.5,2.2,1947
9,Moe Becker,G-F,29,DTF,20,,,1.0,5.4,0.178,...,,,,0.8,,,,1.7,2.1,1947


In [31]:
# Inspect the last 20 rows of the dataframe.
df_NBA_season_game.tail(20)

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
415,PJ Washington,PF,21,CHO,7,7,32.3,5.7,10.0,0.571,...,0.9,5.3,6.1,1.4,1.1,0.4,2.0,2.6,15.1,2020
416,Russell Westbrook,PG,31,HOU,7,7,32.9,7.9,17.1,0.458,...,1.9,7.7,9.6,8.7,1.9,0.6,4.0,3.1,20.7,2020
417,Coby White,PG,19,CHI,9,0,23.6,4.2,11.8,0.358,...,0.2,3.1,3.3,2.6,0.6,0.0,1.2,0.9,11.2,2020
418,Derrick White,PG,25,SAS,7,1,23.0,4.3,8.3,0.517,...,0.9,1.9,2.7,1.9,0.6,0.6,0.7,2.9,12.4,2020
420,Hassan Whiteside,C,30,POR,6,6,25.8,5.8,8.8,0.66,...,3.3,8.7,12.0,0.7,0.5,1.3,2.3,3.5,14.3,2020
421,Andrew Wiggins,SF,24,MIN,7,7,32.7,8.4,18.9,0.447,...,1.6,3.1,4.7,2.3,0.6,0.9,1.7,2.4,22.4,2020
422,Grant Williams,PF,21,BOS,6,0,16.8,1.3,4.0,0.333,...,1.2,1.5,2.7,1.0,0.7,0.5,0.7,1.7,3.8,2020
423,Kenrich Williams,PF,25,NOP,7,0,18.1,1.7,4.1,0.414,...,1.4,2.7,4.1,1.4,1.0,0.7,0.4,2.4,5.1,2020
424,Lou Williams,SG,33,LAC,8,2,31.9,7.0,17.5,0.4,...,0.6,3.1,3.8,5.5,0.4,0.0,3.0,1.6,22.0,2020
425,Marvin Williams,PF,33,CHO,7,0,17.7,2.6,5.0,0.514,...,0.6,1.4,2.0,0.7,0.4,0.1,0.6,1.4,7.3,2020


In [32]:
# Save the combined dataframes to csv, json and excel, like the earlier ones.

# ABA totals. The option labels are listed in the same order as the dataframes.
option_ABA = ('game', 'minute', 'advanced', 'po_game', 'po_minute', 'po_advanced')
lista_dfs_ABA = (
    df_ABA_season_game,
    df_ABA_season_minute,
    df_ABA_season_advanced,
    df_ABA_playoffs_game,
    df_ABA_playoffs_minute,
    df_ABA_playoffs_advanced,
)
generar_ficheros_totales(lista_dfs_ABA, 'ABA', option_ABA)

# NBA totals. Same layout, with the per-possession tables added.
option_NBA = ('game', 'minute', 'advanced', 'poss', 'po_game', 'po_minute', 'po_advanced', 'po_poss')
lista_dfs_NBA = (
    df_NBA_season_game,
    df_NBA_season_minute,
    df_NBA_season_advanced,
    df_NBA_season_poss,
    df_NBA_playoffs_game,
    df_NBA_playoffs_minute,
    df_NBA_playoffs_advanced,
    df_NBA_playoffs_poss,
)
generar_ficheros_totales(lista_dfs_NBA, 'NBA', option_NBA)

In [33]:
# From the complete datasets covering every NBA season, generate the following: one with the
# data without a shot clock, another with the data with a shot clock, and also taking into
# account whether the 3-point shot exists.
generar_dataframes_epocas(lista_dfs_NBA)

In [36]:
# Generate the per-team NBA dataset, trying it out with Boston ('BOS').
generar_dataframes_team(lista_dfs_NBA, 'BOS', option_NBA)

In [37]:
# Load one of the generated datasets that contains only Boston Celtics data (per-game stats)
# and report its size.
df_BOS_game = pd.read_csv('./Datasets/csv/BOS_game.csv', sep=',')

# Label the output with the team so it is not mistaken for the league-wide NBA dataframe.
print("Dataset Boston Celtics per game: num_rows: %d\tColumnas: %d\n"
      % (df_BOS_game.shape[0], df_BOS_game.shape[1]))

Dataset NBA per game: num_rows: 1113	Columnas: 30



In [39]:
# Inspect the last 20 rows of the Boston Celtics per-game dataset.
df_BOS_game.tail(20)

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
1093,Daniel Theis,C,26,BOS,66.0,2.0,13.8,2.2,4.0,0.549,...,1.3,2.1,3.4,1.0,0.3,0.6,0.5,2.4,5.7,2019
1094,Brad Wanamaker,PG,29,BOS,36.0,0.0,9.5,1.4,2.9,0.476,...,0.1,1.1,1.1,1.6,0.3,0.1,0.5,0.9,3.9,2019
1095,Robert Williams,C,21,BOS,32.0,2.0,8.8,1.1,1.6,0.706,...,0.8,1.7,2.5,0.2,0.3,1.3,0.3,1.1,2.5,2019
1096,Guerschon Yabusele,PF,23,BOS,41.0,1.0,6.1,0.9,1.9,0.455,...,0.6,0.7,1.3,0.4,0.2,0.2,0.4,0.8,2.3,2019
1097,Jaylen Brown,SG,23,BOS,3.0,3.0,28.7,6.7,13.3,0.5,...,0.3,6.3,6.7,2.0,0.7,1.0,1.3,3.7,17.3,2020
1098,Carsen Edwards,PG,21,BOS,5.0,0.0,13.2,2.0,5.6,0.357,...,0.4,1.0,1.4,1.0,0.4,0.2,0.0,1.0,5.8,2020
1099,Tacko Fall,C,24,BOS,1.0,0.0,4.0,2.0,4.0,0.5,...,0.0,3.0,3.0,0.0,0.0,0.0,1.0,2.0,4.0,2020
1100,Javonte Green,SG,26,BOS,4.0,0.0,2.3,0.0,0.0,,...,0.0,0.5,0.5,0.3,0.0,0.0,0.0,0.0,0.0,2020
1101,Gordon Hayward,SF,29,BOS,6.0,6.0,34.2,8.0,14.2,0.565,...,0.8,6.7,7.5,4.3,0.7,0.2,2.3,2.2,20.3,2020
1102,Enes Kanter,C,27,BOS,1.0,1.0,25.0,5.0,8.0,0.625,...,3.0,3.0,6.0,2.0,0.0,0.0,0.0,3.0,12.0,2020


In [40]:
# Collect every distinct value of the 'Tm' column of the NBA regular-season per-game
# dataframe, in order of first appearance, as a plain Python list.
# NOTE(review): the result includes 'TOT', which looks like the combined row for players who
# changed teams mid-season rather than an actual team — confirm whether a 'TOT' dataset is wanted.
lista_teams = pd.unique(df_NBA_season_game['Tm']).tolist()
print(lista_teams)

['PIT', 'DTF', 'CHS', 'STB', 'CLR', 'TOT', 'BOS', 'PRO', 'TRH', 'NYK', 'PHW', 'WSC', 'BLB', 'FTW', 'INJ', 'MNL', 'ROC', 'INO', 'DNN', 'TRI', 'AND', 'WAT', 'SHE', 'SYR', 'MLH', 'STL', 'DET', 'CIN', 'LAL', 'CHP', 'SFW', 'CHZ', 'BAL', 'PHI', 'CHI', 'SDR', 'SEA', 'MIL', 'ATL', 'PHO', 'POR', 'CLE', 'BUF', 'HOU', 'GSW', 'KCO', 'CAP', 'NOJ', 'WSB', 'KCK', 'IND', 'NYN', 'DEN', 'SAS', 'NJN', 'SDC', 'UTA', 'DAL', 'LAC', 'SAC', 'CHH', 'MIA', 'ORL', 'MIN', 'VAN', 'TOR', 'WAS', 'MEM', 'NOH', 'CHA', 'NOK', 'OKC', 'BRK', 'NOP', 'CHO']


In [43]:
# Generate datasets for each of the NBA teams, using lista_teams, the list of every team
# that has data in the NBA dataframes.
generar_dataframes_teams(lista_dfs_NBA, lista_teams, option_NBA)

In [44]:
# Load one of the generated per-team datasets (Seattle, per-game stats) and report its size.
df_SEA_game = pd.read_csv('./Datasets/csv/SEA_game.csv', sep=',')

# DataFrame.shape is already the (rows, columns) pair that the two %d placeholders consume.
print("Dataset Seattle Supersonics per game: num_rows: %d\tColumnas: %d\n" % df_SEA_game.shape)

Dataset Seattle Supersonics per game: num_rows: 606	Columnas: 30



In [45]:
# Inspect the last 20 rows of the Seattle per-game dataset.
df_SEA_game.tail(20)

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
586,Ronald Dupree,SF,27,SEA,4,0.0,4.5,0.3,0.8,0.333,...,0.3,1.8,2.0,0.3,0.3,0.0,0.0,0.0,1.0,2008
587,Kevin Durant,SG,19,SEA,80,80.0,34.6,7.3,17.1,0.43,...,0.9,3.5,4.4,2.4,1.0,0.9,2.9,1.5,20.3,2008
588,Francisco Elson,C,31,SEA,22,2.0,12.7,1.4,4.0,0.341,...,0.8,2.2,3.0,0.4,0.3,0.3,0.8,1.5,3.0,2008
589,Mickaël Gelabale,SG,24,SEA,39,0.0,11.9,1.8,4.0,0.439,...,0.4,1.1,1.5,0.8,0.3,0.2,0.7,0.8,4.3,2008
590,Eddie Gill,PG,29,SEA,1,0.0,5.0,0.0,2.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,2008
591,Jeff Green,SF,21,SEA,80,52.0,28.2,4.0,9.4,0.427,...,1.3,3.5,4.7,1.5,0.6,0.6,2.0,2.5,10.5,2008
592,Adrian Griffin,SF,33,SEA,13,0.0,6.5,0.5,1.2,0.375,...,0.7,1.0,1.7,0.4,0.4,0.1,0.4,0.6,1.1,2008
593,Donyell Marshall,PF,34,SEA,15,0.0,12.3,1.3,3.6,0.352,...,1.0,2.1,3.1,0.3,0.3,0.5,0.6,1.0,3.8,2008
594,Ira Newble,SF,33,SEA,2,0.0,8.5,1.0,3.5,0.286,...,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,2.0,2008
595,Johan Petro,C,22,SEA,72,28.0,18.2,2.6,6.1,0.419,...,1.4,3.8,5.1,0.4,0.5,0.6,1.0,2.4,6.0,2008
