### Data Cleaning

In [57]:
import pandas as pd
import numpy as np

In [7]:
pd.set_option("display.max_columns", None)

# Games.csv

In [15]:
games = pd.read_csv("data/games_raw.csv", index_col="id")

In [16]:
games.head()

Unnamed: 0_level_0,date,home_team_score,period,postseason,season,status,time,visitor_team_score,home_team.id,home_team.abbreviation,home_team.city,home_team.conference,home_team.division,home_team.full_name,home_team.name,visitor_team.id,visitor_team.abbreviation,visitor_team.city,visitor_team.conference,visitor_team.division,visitor_team.full_name,visitor_team.name
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
47179,2019-01-30T00:00:00.000Z,126,4,False,2018,Final,,94,2,BOS,Boston,East,Atlantic,Boston Celtics,Celtics,4,CHA,Charlotte,East,Southeast,Charlotte Hornets,Hornets
48751,2019-02-09T00:00:00.000Z,112,4,False,2018,Final,,123,2,BOS,Boston,East,Atlantic,Boston Celtics,Celtics,13,LAC,LA,West,Pacific,LA Clippers,Clippers
48739,2019-02-08T00:00:00.000Z,117,4,False,2018,Final,,110,23,PHI,Philadelphia,East,Atlantic,Philadelphia 76ers,76ers,8,DEN,Denver,West,Northwest,Denver Nuggets,Nuggets
48740,2019-02-08T00:00:00.000Z,119,4,False,2018,Final,,106,30,WAS,Washington,East,Southeast,Washington Wizards,Wizards,6,CLE,Cleveland,East,Central,Cleveland Cavaliers,Cavaliers
48746,2019-02-08T00:00:00.000Z,102,4,False,2018,Final,,96,26,SAC,Sacramento,West,Pacific,Sacramento Kings,Kings,16,MIA,Miami,East,Southeast,Miami Heat,Heat


In [17]:
# time column is empty
games.drop("time", axis=1, inplace=True)

In [21]:
# Insufficient data to use 2021 games
# Also some of these games were 'in progress' when pulled from the api, so stats are incomplete
games_2021 = games[games["season"].eq(2021)].index
games.drop(games_2021, axis=0, inplace=True)

In [25]:
# dropping a random unfinished game
games.drop(games[games["status"].eq("8:30 PM ET")].index, axis=0, inplace=True)

In [52]:
# dropping unplayed games
games.drop(games[games["home_team_score"].eq(0)].index, axis=0, inplace=True)

In [55]:
# dropping unecessary name info
games.drop(["home_team.city", "visitor_team.city", "home_team.name", "visitor_team.name"], axis=1, inplace=True)

In [56]:
games.head()

Unnamed: 0_level_0,date,home_team_score,period,postseason,season,status,visitor_team_score,home_team.id,home_team.abbreviation,home_team.conference,home_team.division,home_team.full_name,visitor_team.id,visitor_team.abbreviation,visitor_team.conference,visitor_team.division,visitor_team.full_name
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
47179,2019-01-30T00:00:00.000Z,126,4,False,2018,Final,94,2,BOS,East,Atlantic,Boston Celtics,4,CHA,East,Southeast,Charlotte Hornets
48751,2019-02-09T00:00:00.000Z,112,4,False,2018,Final,123,2,BOS,East,Atlantic,Boston Celtics,13,LAC,West,Pacific,LA Clippers
48739,2019-02-08T00:00:00.000Z,117,4,False,2018,Final,110,23,PHI,East,Atlantic,Philadelphia 76ers,8,DEN,West,Northwest,Denver Nuggets
48740,2019-02-08T00:00:00.000Z,119,4,False,2018,Final,106,30,WAS,East,Southeast,Washington Wizards,6,CLE,East,Central,Cleveland Cavaliers
48746,2019-02-08T00:00:00.000Z,102,4,False,2018,Final,96,26,SAC,West,Pacific,Sacramento Kings,16,MIA,East,Southeast,Miami Heat


In [58]:
games.to_csv("data/games_clean.csv")

# Players.csv

In [73]:
players = pd.read_csv("data/players_raw.csv", index_col="id")

In [74]:
# drop superfluous team info
players.drop(["team.name", "team.division", "team.conference", "team.city", "team.abbreviation"], axis=1, inplace=True)

In [77]:
# reorganize columns
players = players[['first_name', 'last_name', 'position', 'team.id', 'team.full_name',
                   'height_feet', 'height_inches', 'weight_pounds']]

In [78]:
players

Unnamed: 0_level_0,first_name,last_name,position,team.id,team.full_name,height_feet,height_inches,weight_pounds
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
14,Ike,Anigbogu,C,12,Indiana Pacers,,,
25,Ron,Baker,G,20,New York Knicks,,,
47,Jabari,Bird,G,2,Boston Celtics,,,
67,MarShon,Brooks,G,15,Memphis Grizzlies,,,
71,Lorenzo,Brown,G,28,Toronto Raptors,,,
...,...,...,...,...,...,...,...,...
17896079,Rokas,Jokubaitis,G,20,New York Knicks,,,
17896080,EJ,Onu,F-C,7,Dallas Mavericks,,,
17896116,Terry,Taylor,G,12,Indiana Pacers,,,
17896118,Marcus,Zegarowski,G,3,Brooklyn Nets,,,


In [79]:
players.isnull().sum()

first_name           0
last_name            1
position          2549
team.id              0
team.full_name       0
height_feet       3255
height_inches     3255
weight_pounds     3255
dtype: int64

# Stats.csv

In [184]:
stats = pd.read_csv("data/stats2020.csv", index_col="id")

In [185]:
stats.drop(["game.period", "game.postseason", "game.status", "game.time", "player.height_feet", "player.height_inches",
            "player.weight_pounds", "team.abbreviation", "team.city", "team.conference", "team.division", "team.name",
            "player", "player.first_name", "player.last_name", "player.position", "team.full_name", "player.team_id"],
          axis=1, inplace=True)

In [186]:
stats.dropna(axis=0, how="any", inplace=True)

In [187]:
played_0min = stats[stats["min"].eq("0:00") | stats["min"].eq("0")].index
stats.drop(played_0min, axis=0, inplace=True)

In [188]:
stats.columns = ['ast', 'blk', 'dreb', 'fg3_pct', 'fg3a', 'fg3m', 'fg_pct', 'fga', 'fgm',
       'ft_pct', 'fta', 'ftm', 'min', 'oreb', 'pf', 'pts', 'reb', 'stl',
       'turnover', 'game_id', 'game_date', 'home_team_id',
       'home_team_score', 'season', 'away_team_id',
       'away_team_score', 'player_id', 'player_team_id']

In [189]:
stats = stats[['ast', 'blk', 'dreb', 'fg3_pct', 'fg3a', 'fg3m', 'fg_pct', 'fga', 'fgm',
       'ft_pct', 'fta', 'ftm', 'min', 'oreb', 'pf', 'pts', 'reb', 'stl',
       'turnover', 'player_id', 'player_team_id', 'game_id', 'game_date', 'season', 'home_team_id',
       'home_team_score','away_team_id',
       'away_team_score']]

In [190]:
stats

Unnamed: 0_level_0,ast,blk,dreb,fg3_pct,fg3a,fg3m,fg_pct,fga,fgm,ft_pct,fta,ftm,min,oreb,pf,pts,reb,stl,turnover,player_id,player_team_id,game_id,game_date,season,home_team_id,home_team_score,away_team_id,away_team_score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
2848769,3,1,4,50.0,2,1,43.8,16,7,100.0,7,7,24:56,1,3,22,5,3,1,140.0,3,127502,2020-12-22T00:00:00.000Z,2020,3,125,10,99
2848775,1,0,0,50.0,2,1,75.0,4,3,0.0,0,0,17:17,0,3,7,0,1,1,188.0,3,127502,2020-12-22T00:00:00.000Z,2020,3,125,10,99
2848770,1,1,9,0.0,0,0,66.7,3,2,0.0,2,0,17:03,2,2,4,11,1,3,250.0,3,127502,2020-12-22T00:00:00.000Z,2020,3,125,10,99
2848759,10,0,1,20.0,10,2,33.3,21,7,100.0,4,4,30:19,3,1,20,4,2,3,115.0,10,127502,2020-12-22T00:00:00.000Z,2020,3,125,10,99
2848772,4,0,3,57.1,7,4,62.5,16,10,100.0,2,2,25:18,1,3,26,4,0,1,228.0,3,127502,2020-12-22T00:00:00.000Z,2020,3,125,10,99
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6966959,5,0,5,25.0,4,1,46.2,13,6,100.0,4,4,40:38,0,3,17,5,4,5,315.0,17,448337,2021-07-20T00:00:00.000Z,2020,17,105,24,98
6966960,11,0,8,28.6,7,2,21.1,19,4,100.0,2,2,46:17,1,1,12,9,4,3,214.0,17,448337,2021-07-20T00:00:00.000Z,2020,17,105,24,98
6966968,0,1,3,40.0,5,2,60.0,10,6,100.0,2,2,22:47,0,5,16,3,0,0,377.0,17,448337,2021-07-20T00:00:00.000Z,2020,17,105,24,98
6966962,1,0,6,0.0,4,0,0.0,4,0,0.0,2,0,22:55,2,2,0,8,0,2,105.0,17,448337,2021-07-20T00:00:00.000Z,2020,17,105,24,98


In [191]:
stats.to_csv("data/stats2020_clean.csv")