In [1]:
# Load IPython's autoreload extension; mode 2 re-imports modules before each
# cell is executed, so edits made to imported modules are picked up live.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import math
import pyarrow as pa
import pyarrow.parquet as pq
from fastparquet import write 

Cargamos los datos escogidos para realizar la práctica. Los datos corresponden a la liga de baloncesto NBA y se dividen en 3 ficheros csv: "players.csv", con información de los jugadores que han pasado por la liga, como el año de nacimiento, la universidad o la altura y el peso del jugador; "player_data.csv", que también contiene información de los jugadores y que, respecto al anterior, añade el año de inicio y de fin de la carrera del jugador en la liga y la posición en la que juega; y, por último, "Seasons_stats.csv", que contiene todas las estadísticas en los distintos apartados del juego de cada jugador en cada temporada desde el inicio de la liga.

Lo que se pretende analizar en la práctica es la influencia de la introducción de la línea de 3 puntos en la liga en la temporada 1979-1980, y cómo pudo haber cambiado el propio juego. Para ello nos vamos a quedar con el último csv, "Seasons_stats.csv", que es el que contiene más información y, sobre todo, la información que pensamos que puede ser útil para analizar y llegar a alguna conclusión.

In [3]:
# Load the datasets chosen for the assignment.
# First file: players data (comma-separated, which is read_csv's default).
df_players = pd.read_csv('players.csv')

# Report the frame's dimensions (rows, columns) and list its column names.
print("num_rows: %d\tColumnas: %d\n" % df_players.shape)
print("Columnas:\n", list(df_players.columns))

num_rows: 3922	Columnas: 8

Columnas:
 ['Unnamed: 0', 'Player', 'height', 'weight', 'collage', 'born', 'birth_city', 'birth_state']


In [4]:
# First look at the players data: display the first five rows.
df_players.head(5)

Unnamed: 0.1,Unnamed: 0,Player,height,weight,collage,born,birth_city,birth_state
0,0,Curly Armstrong,180.0,77.0,Indiana University,1918.0,,
1,1,Cliff Barker,188.0,83.0,University of Kentucky,1921.0,Yorktown,Indiana
2,2,Leo Barnhorst,193.0,86.0,University of Notre Dame,1924.0,,
3,3,Ed Bartels,196.0,88.0,North Carolina State University,1925.0,,
4,4,Ralph Beard,178.0,79.0,University of Kentucky,1927.0,Hardinsburg,Kentucky


In [5]:
# Load the datasets chosen for the assignment.
# Second players file (comma-separated, which is read_csv's default).
df_player_data = pd.read_csv('player_data.csv')

# Report the frame's dimensions (rows, columns) and list its column names.
print("num_rows: %d\tColumnas: %d\n" % df_player_data.shape)
print("Columnas:\n", list(df_player_data.columns))

num_rows: 4550	Columnas: 8

Columnas:
 ['name', 'year_start', 'year_end', 'position', 'height', 'weight', 'birth_date', 'college']


In [6]:
# First look at the second players dataset (player_data.csv): first five rows.
df_player_data.head(5)

Unnamed: 0,name,year_start,year_end,position,height,weight,birth_date,college
0,Alaa Abdelnaby,1991,1995,F-C,6-10,240.0,"June 24, 1968",Duke University
1,Zaid Abdul-Aziz,1969,1978,C-F,6-9,235.0,"April 7, 1946",Iowa State University
2,Kareem Abdul-Jabbar,1970,1989,C,7-2,225.0,"April 16, 1947","University of California, Los Angeles"
3,Mahmoud Abdul-Rauf,1991,2001,G,6-1,162.0,"March 9, 1969",Louisiana State University
4,Tariq Abdul-Wahad,1998,2003,F,6-6,223.0,"November 3, 1974",San Jose State University


In [7]:
# Load the datasets chosen for the assignment.
# Third file: per-season player statistics (comma-separated, read_csv's default).
df_season_stats = pd.read_csv('Seasons_stats.csv')

# Report the frame's dimensions (rows, columns) and list its column names.
print("num_rows: %d\tColumnas: %d\n" % df_season_stats.shape)
print("Columnas:\n", list(df_season_stats.columns))

num_rows: 24691	Columnas: 53

Columnas:
 ['Unnamed: 0', 'Year', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'blanl', 'OWS', 'DWS', 'WS', 'WS/48', 'blank2', 'OBPM', 'DBPM', 'BPM', 'VORP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']


In [8]:
# First look at the season statistics: display the first five rows.
df_season_stats.head(5)

Unnamed: 0.1,Unnamed: 0,Year,Player,Pos,Age,Tm,G,GS,MP,PER,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,0,1950.0,Curly Armstrong,G-F,31.0,FTW,63.0,,,,...,0.705,,,,176.0,,,,217.0,458.0
1,1,1950.0,Cliff Barker,SG,29.0,INO,49.0,,,,...,0.708,,,,109.0,,,,99.0,279.0
2,2,1950.0,Leo Barnhorst,SF,25.0,CHS,67.0,,,,...,0.698,,,,140.0,,,,192.0,438.0
3,3,1950.0,Ed Bartels,F,24.0,TOT,15.0,,,,...,0.559,,,,20.0,,,,29.0,63.0
4,4,1950.0,Ed Bartels,F,24.0,DNN,13.0,,,,...,0.548,,,,20.0,,,,27.0,59.0


Para el análisis que se quiere realizar, en base a los objetivos de la práctica, nos quedamos únicamente con el tercer DataFrame, que contiene las estadísticas de cada jugador en cada temporada y será más que suficiente para probar la hipótesis planteada.

Para ello vamos a eliminar las columnas que, a priori, no aporten información relevante para el análisis, y a tratar los datos vacíos o NaN (eliminando las columnas completamente vacías y rellenando el resto de valores vacíos).

In [9]:
# The first column of the CSV only acts as a row index, so build a new
# DataFrame that leaves it out (the original frame is not modified).
df_season_stats_1 = df_season_stats.drop(columns=df_season_stats.columns[0])
df_season_stats_1.head()

Unnamed: 0,Year,Player,Pos,Age,Tm,G,GS,MP,PER,TS%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,1950.0,Curly Armstrong,G-F,31.0,FTW,63.0,,,,0.368,...,0.705,,,,176.0,,,,217.0,458.0
1,1950.0,Cliff Barker,SG,29.0,INO,49.0,,,,0.435,...,0.708,,,,109.0,,,,99.0,279.0
2,1950.0,Leo Barnhorst,SF,25.0,CHS,67.0,,,,0.394,...,0.698,,,,140.0,,,,192.0,438.0
3,1950.0,Ed Bartels,F,24.0,TOT,15.0,,,,0.312,...,0.559,,,,20.0,,,,29.0,63.0
4,1950.0,Ed Bartels,F,24.0,DNN,13.0,,,,0.308,...,0.548,,,,20.0,,,,27.0,59.0


In [10]:
# Given what we want to analyse, drop every column that does not carry
# information needed for the planned analyses. Labels are grouped by kind;
# "blanl" and "blank2" are spelled exactly as they appear in the CSV header.
df_season_stats_2 = df_season_stats_1.drop(
    columns=[
        # player / team context
        "Age", "Tm", "Pos", "G", "GS",
        # percentage-based advanced stats
        "ORB%", "DRB%", "TRB%", "AST%", "STL%", "BLK%", "TOV%",
        # placeholder columns from the source file
        "blanl", "blank2",
        # win shares
        "OWS", "DWS", "WS", "WS/48",
        # box plus/minus and VORP
        "OBPM", "DBPM", "BPM", "VORP",
        # offensive / defensive rebound totals
        "ORB", "DRB",
    ]
)

Como se quiere analizar cómo pudo influir la inclusión de la línea de 3 puntos, separamos el DF en dos: uno con las estadísticas anteriores a la línea de 3 (Year < 1980) y otro con los datos posteriores (Year >= 1980).

In [11]:
# Split the statistics at Year 1980: "NO3" holds the seasons before it and
# "SI3" the seasons from 1980 onwards. Rows whose Year is NaN satisfy neither
# comparison, so they end up in neither frame.
df_season_stats_NO3 = df_season_stats_2.loc[df_season_stats_2['Year'].lt(1980)]
df_season_stats_SI3 = df_season_stats_2.loc[df_season_stats_2['Year'].ge(1980)]

# Report the dimensions (rows, columns) of each half.
print("num_rows: %d\tColumnas: %d\n" % df_season_stats_NO3.shape)
print("num_rows: %d\tColumnas: %d\n" % df_season_stats_SI3.shape)

num_rows: 5697	Columnas: 28

num_rows: 18927	Columnas: 28



In [12]:
# Inspect the non-null count of every column to decide what to do with the
# missing values, or with the columns that contain them.
df_season_stats_NO3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5697 entries, 0 to 5725
Data columns (total 28 columns):
Year      5697 non-null float64
Player    5697 non-null object
MP        5211 non-null float64
PER       5179 non-null float64
TS%       5687 non-null float64
3PAr      0 non-null float64
FTr       5686 non-null float64
USG%      718 non-null float64
FG        5697 non-null float64
FGA       5697 non-null float64
FG%       5686 non-null float64
3P        0 non-null float64
3PA       0 non-null float64
3P%       0 non-null float64
2P        5697 non-null float64
2PA       5697 non-null float64
2P%       5686 non-null float64
eFG%      5686 non-null float64
FT        5697 non-null float64
FTA       5697 non-null float64
FT%       5584 non-null float64
TRB       5385 non-null float64
AST       5697 non-null float64
STL       1870 non-null float64
BLK       1870 non-null float64
TOV       718 non-null float64
PF        5697 non-null float64
PTS       5697 non-null float64
dtypes: floa

In [13]:
# Drop the columns whose values are all missing. In the pre-three-point-line
# frame, the columns describing three-point shooting are empty, so they go.
df_season_NO3 = df_season_stats_NO3.drop(columns=["3PAr", "3P", "3PA", "3P%"])
df_season_NO3.head()

Unnamed: 0,Year,Player,MP,PER,TS%,FTr,USG%,FG,FGA,FG%,...,FT,FTA,FT%,TRB,AST,STL,BLK,TOV,PF,PTS
0,1950.0,Curly Armstrong,,,0.368,0.467,,144.0,516.0,0.279,...,170.0,241.0,0.705,,176.0,,,,217.0,458.0
1,1950.0,Cliff Barker,,,0.435,0.387,,102.0,274.0,0.372,...,75.0,106.0,0.708,,109.0,,,,99.0,279.0
2,1950.0,Leo Barnhorst,,,0.394,0.259,,174.0,499.0,0.349,...,90.0,129.0,0.698,,140.0,,,,192.0,438.0
3,1950.0,Ed Bartels,,,0.312,0.395,,22.0,86.0,0.256,...,19.0,34.0,0.559,,20.0,,,,29.0,63.0
4,1950.0,Ed Bartels,,,0.308,0.378,,21.0,82.0,0.256,...,17.0,31.0,0.548,,20.0,,,,27.0,59.0


In [14]:
# Summary statistics per column, transposed so each statistic is a column.
df_season_NO3.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Year,5697.0,1967.088468,9.319069,1950.0,1959.0,1970.0,1975.0,1979.0
MP,5211.0,1383.029553,981.636666,1.0,473.0,1308.0,2200.0,3882.0
PER,5179.0,12.783626,5.39852,-60.3,10.3,12.8,15.6,77.0
TS%,5687.0,0.457002,0.084939,0.0,0.419,0.468,0.507,1.042
FTr,5686.0,0.344889,0.174058,0.0,0.244,0.323,0.41775,3.0
USG%,718.0,19.401532,4.817401,0.0,16.225,19.3,22.4,40.0
FG,5697.0,229.227839,205.801521,0.0,60.0,179.0,341.0,1597.0
FGA,5697.0,528.626119,441.615315,0.0,156.0,435.0,795.0,3159.0
FG%,5686.0,0.406173,0.088446,0.0,0.365,0.419,0.458,1.0
2P,5697.0,229.227839,205.801521,0.0,60.0,179.0,341.0,1597.0


In [15]:
# Replace the remaining missing values with the most suitable value. The
# choice made here is the median of each attribute, on the reasoning that it
# will distort the later analyses the least.
#
# numeric_only=True is required because the frame still contains the string
# column 'Player': a median is undefined for it, and recent pandas versions
# raise a TypeError instead of silently skipping non-numeric columns.
# fillna() then only touches the columns present in the medians Series.
df_season_NO3_final = df_season_NO3.fillna(df_season_NO3.median(numeric_only=True))
df_season_NO3_final.head()

Unnamed: 0,Year,Player,MP,PER,TS%,FTr,USG%,FG,FGA,FG%,...,FT,FTA,FT%,TRB,AST,STL,BLK,TOV,PF,PTS
0,1950.0,Curly Armstrong,1308.0,12.8,0.368,0.467,19.3,144.0,516.0,0.279,...,170.0,241.0,0.705,221.0,176.0,41.0,14.0,95.5,217.0,458.0
1,1950.0,Cliff Barker,1308.0,12.8,0.435,0.387,19.3,102.0,274.0,0.372,...,75.0,106.0,0.708,221.0,109.0,41.0,14.0,95.5,99.0,279.0
2,1950.0,Leo Barnhorst,1308.0,12.8,0.394,0.259,19.3,174.0,499.0,0.349,...,90.0,129.0,0.698,221.0,140.0,41.0,14.0,95.5,192.0,438.0
3,1950.0,Ed Bartels,1308.0,12.8,0.312,0.395,19.3,22.0,86.0,0.256,...,19.0,34.0,0.559,221.0,20.0,41.0,14.0,95.5,29.0,63.0
4,1950.0,Ed Bartels,1308.0,12.8,0.308,0.378,19.3,21.0,82.0,0.256,...,17.0,31.0,0.548,221.0,20.0,41.0,14.0,95.5,27.0,59.0


In [16]:
# Repeat the same steps for the frame holding the data from after the
# three-point line was introduced in the league.
df_season_stats_SI3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18927 entries, 5727 to 24690
Data columns (total 28 columns):
Year      18927 non-null float64
Player    18927 non-null object
MP        18927 non-null float64
PER       18922 non-null float64
TS%       18851 non-null float64
3PAr      18839 non-null float64
FTr       18839 non-null float64
USG%      18922 non-null float64
FG        18927 non-null float64
FGA       18927 non-null float64
FG%       18839 non-null float64
3P        18927 non-null float64
3PA       18927 non-null float64
3P%       15416 non-null float64
2P        18927 non-null float64
2PA       18927 non-null float64
2P%       18810 non-null float64
eFG%      18839 non-null float64
FT        18927 non-null float64
FTA       18927 non-null float64
FT%       18182 non-null float64
TRB       18927 non-null float64
AST       18927 non-null float64
STL       18927 non-null float64
BLK       18927 non-null float64
TOV       18927 non-null float64
PF        18927 non-null float6

In [17]:
# Summary statistics per column (transposed), reviewed before the missing
# values in some columns are replaced with a suitable value.
df_season_stats_SI3.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Year,18927.0,2000.272415,10.691977,1980.0,1992.0,2001.0,2010.0,2017.0
MP,18927.0,1162.004649,924.026516,0.0,313.0,985.0,1894.0,3533.0
PER,18922.0,12.395714,6.200326,-90.6,9.7,12.7,15.6,129.1
TS%,18851.0,0.503862,0.094507,0.0,0.473,0.516,0.551,1.136
3PAr,18839.0,0.158604,0.187495,0.0,0.005,0.064,0.288,1.0
FTr,18839.0,0.31959,0.230499,0.0,0.197,0.286,0.395,6.0
USG%,18922.0,18.887707,5.469891,0.0,15.3,18.6,22.1,100.0
FG,18927.0,185.121361,181.223343,0.0,36.0,130.0,286.0,1098.0
FGA,18927.0,401.153801,378.474645,0.0,86.0,291.0,620.0,2279.0
FG%,18839.0,0.438255,0.096844,0.0,0.401,0.444,0.487,1.0


In [18]:
# Fill the missing values of the post-1980 frame with each column's median.
#
# numeric_only=True is required because the frame still contains the string
# column 'Player': a median is undefined for it, and recent pandas versions
# raise a TypeError instead of silently skipping non-numeric columns.
df_season_SI3_final = df_season_stats_SI3.fillna(df_season_stats_SI3.median(numeric_only=True))
df_season_SI3_final.head()

Unnamed: 0,Year,Player,MP,PER,TS%,3PAr,FTr,USG%,FG,FGA,...,FT,FTA,FT%,TRB,AST,STL,BLK,TOV,PF,PTS
5727,1980.0,Kareem Abdul-Jabbar*,3143.0,25.3,0.639,0.001,0.344,24.1,835.0,1383.0,...,364.0,476.0,0.765,886.0,371.0,81.0,280.0,297.0,216.0,2034.0
5728,1980.0,Tom Abernethy,1222.0,11.0,0.511,0.003,0.258,13.3,153.0,318.0,...,56.0,82.0,0.683,191.0,87.0,35.0,12.0,39.0,118.0,362.0
5729,1980.0,Alvan Adams,2168.0,19.2,0.571,0.002,0.27,21.9,465.0,875.0,...,188.0,236.0,0.797,609.0,322.0,108.0,55.0,218.0,237.0,1118.0
5730,1980.0,Tiny Archibald*,2864.0,15.3,0.574,0.023,0.548,17.0,383.0,794.0,...,361.0,435.0,0.83,197.0,671.0,106.0,10.0,242.0,218.0,1131.0
5731,1980.0,Dennis Awtrey,560.0,7.4,0.524,0.0,0.833,7.9,27.0,60.0,...,32.0,50.0,0.64,115.0,40.0,12.0,15.0,27.0,66.0,86.0


Una vez tenemos los dos DF finales (datos anteriores a 1980 y datos de 1980 en adelante), los exportamos en formato CSV y Parquet para subirlos al HDFS, almacenarlos y analizarlos.

In [22]:
# CSV exports of both final frames (to_csv writes the index by default).
df_season_NO3_final.to_csv('df_season_NO3_final.csv')
df_season_SI3_final.to_csv('df_season_SI3_final.csv')

# Parquet exports through pyarrow: convert each frame to an Arrow table,
# keeping the DataFrame index, then write one file per table.
table = pa.Table.from_pandas(df_season_NO3_final, preserve_index=True)
table_si = pa.Table.from_pandas(df_season_SI3_final, preserve_index=True)
pq.write_table(table, 'df_season_NO3_final.parquet')
pq.write_table(table_si, 'df_season_SI3_final.parquet')