### Data Cleaning

In [1]:
import pandas as pd
import numpy as np

In [9]:
pd.set_option("display.max_columns", None)

# Games.csv

In [15]:
games = pd.read_csv("data/games_raw.csv", index_col="id")

In [16]:
games.head()

Unnamed: 0_level_0,date,home_team_score,period,postseason,season,status,time,visitor_team_score,home_team.id,home_team.abbreviation,home_team.city,home_team.conference,home_team.division,home_team.full_name,home_team.name,visitor_team.id,visitor_team.abbreviation,visitor_team.city,visitor_team.conference,visitor_team.division,visitor_team.full_name,visitor_team.name
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
47179,2019-01-30T00:00:00.000Z,126,4,False,2018,Final,,94,2,BOS,Boston,East,Atlantic,Boston Celtics,Celtics,4,CHA,Charlotte,East,Southeast,Charlotte Hornets,Hornets
48751,2019-02-09T00:00:00.000Z,112,4,False,2018,Final,,123,2,BOS,Boston,East,Atlantic,Boston Celtics,Celtics,13,LAC,LA,West,Pacific,LA Clippers,Clippers
48739,2019-02-08T00:00:00.000Z,117,4,False,2018,Final,,110,23,PHI,Philadelphia,East,Atlantic,Philadelphia 76ers,76ers,8,DEN,Denver,West,Northwest,Denver Nuggets,Nuggets
48740,2019-02-08T00:00:00.000Z,119,4,False,2018,Final,,106,30,WAS,Washington,East,Southeast,Washington Wizards,Wizards,6,CLE,Cleveland,East,Central,Cleveland Cavaliers,Cavaliers
48746,2019-02-08T00:00:00.000Z,102,4,False,2018,Final,,96,26,SAC,Sacramento,West,Pacific,Sacramento Kings,Kings,16,MIA,Miami,East,Southeast,Miami Heat,Heat


In [17]:
# time column is empty
games.drop("time", axis=1, inplace=True)

In [21]:
# Insufficient data to use 2021 games
# Also some of these games were 'in progress' when pulled from the api, so stats are incomplete
games_2021 = games[games["season"].eq(2021)].index
games.drop(games_2021, axis=0, inplace=True)

In [25]:
# dropping a random unfinished game
games.drop(games[games["status"].eq("8:30 PM ET")].index, axis=0, inplace=True)

In [52]:
# dropping unplayed games
games.drop(games[games["home_team_score"].eq(0)].index, axis=0, inplace=True)

In [55]:
# dropping unecessary name info
games.drop(["home_team.city", "visitor_team.city", "home_team.name", "visitor_team.name"], axis=1, inplace=True)

In [56]:
games.head()

Unnamed: 0_level_0,date,home_team_score,period,postseason,season,status,visitor_team_score,home_team.id,home_team.abbreviation,home_team.conference,home_team.division,home_team.full_name,visitor_team.id,visitor_team.abbreviation,visitor_team.conference,visitor_team.division,visitor_team.full_name
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
47179,2019-01-30T00:00:00.000Z,126,4,False,2018,Final,94,2,BOS,East,Atlantic,Boston Celtics,4,CHA,East,Southeast,Charlotte Hornets
48751,2019-02-09T00:00:00.000Z,112,4,False,2018,Final,123,2,BOS,East,Atlantic,Boston Celtics,13,LAC,West,Pacific,LA Clippers
48739,2019-02-08T00:00:00.000Z,117,4,False,2018,Final,110,23,PHI,East,Atlantic,Philadelphia 76ers,8,DEN,West,Northwest,Denver Nuggets
48740,2019-02-08T00:00:00.000Z,119,4,False,2018,Final,106,30,WAS,East,Southeast,Washington Wizards,6,CLE,East,Central,Cleveland Cavaliers
48746,2019-02-08T00:00:00.000Z,102,4,False,2018,Final,96,26,SAC,West,Pacific,Sacramento Kings,16,MIA,East,Southeast,Miami Heat


In [58]:
games.to_csv("data/games_clean.csv")

# Players.csv

In [73]:
players = pd.read_csv("data/players_raw.csv", index_col="id")

In [74]:
# drop superfluous team info
players.drop(["team.name", "team.division", "team.conference", "team.city", "team.abbreviation"], axis=1, inplace=True)

In [77]:
# reorganize columns
players = players[['first_name', 'last_name', 'position', 'team.id', 'team.full_name',
                   'height_feet', 'height_inches', 'weight_pounds']]

In [78]:
players

Unnamed: 0_level_0,first_name,last_name,position,team.id,team.full_name,height_feet,height_inches,weight_pounds
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
14,Ike,Anigbogu,C,12,Indiana Pacers,,,
25,Ron,Baker,G,20,New York Knicks,,,
47,Jabari,Bird,G,2,Boston Celtics,,,
67,MarShon,Brooks,G,15,Memphis Grizzlies,,,
71,Lorenzo,Brown,G,28,Toronto Raptors,,,
...,...,...,...,...,...,...,...,...
17896079,Rokas,Jokubaitis,G,20,New York Knicks,,,
17896080,EJ,Onu,F-C,7,Dallas Mavericks,,,
17896116,Terry,Taylor,G,12,Indiana Pacers,,,
17896118,Marcus,Zegarowski,G,3,Brooklyn Nets,,,


In [79]:
players.isnull().sum()

first_name           0
last_name            1
position          2549
team.id              0
team.full_name       0
height_feet       3255
height_inches     3255
weight_pounds     3255
dtype: int64

# Stats.csv

In [161]:
stats = pd.read_csv("data/all_stats_raw.csv", index_col="id")

  exec(code_obj, self.user_global_ns, self.user_ns)
  mask |= (ar1 == a)


In [162]:
# drop columns with superfluous information
stats.drop(["game.period", "game.postseason", "game.status", "game.time", "player.height_feet", "player.height_inches",
            "player.weight_pounds", "team.abbreviation", "team.city", "team.conference", "team.division", "team.name",
            "player", "player.first_name", "player.last_name", "player.position", "team.full_name", "player.team_id"],
          axis=1, inplace=True)

In [163]:
# a null value generally indicates that the player did not play in that game
stats.dropna(axis=0, how="any", inplace=True)

In [164]:
# clean time column to get a consitent format. ("mm:ss" or "m:ss")
stats["min"] = stats["min"].astype(str)

# drop the row if the player didn't play in the game
played_0min = stats[stats["min"].eq("0:00") | stats["min"].str.startswith("0")].index
stats.drop(played_0min, axis=0, inplace=True)

# Convert times like "27.0" to "27:0"
stats["min"] = stats["min"].str.replace(".",":")

# convert times like "27" to "27:00"
minutes_only_times = stats["min"][~stats["min"].str.contains(":")].index
stats["min"].loc[minutes_only_times] += ":00"


minutes = [time[0] for time in stats["min"].str.split(":").values]
seconds = [time[1] for time in stats["min"].str.split(":").values]

# convert times like "27:0" to "27:00"
for i, second in enumerate(seconds):
    if len(second) == 1:
        seconds[i] = second + "0"

# convert times like "8:60" to "9:00"
for i, second in enumerate(seconds):        
    if second == "60":
        seconds[i] = "00"
        minutes[i] = str(int(minutes[i]) + 1)  # increment minutes by 1

times = [":".join(list(item)) for item in list(zip(minutes,seconds))]

stats["min"] = times

  stats["min"] = stats["min"].str.replace(".",":")
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [167]:
stats.columns = ['ast', 'blk', 'dreb', 'fg3_pct', 'fg3a', 'fg3m', 'fg_pct', 'fga', 'fgm',
       'ft_pct', 'fta', 'ftm', 'min', 'oreb', 'pf', 'pts', 'reb', 'stl',
       'turnover', 'game_id', 'game_date', 'home_team_id',
       'home_team_score', 'season', 'away_team_id',
       'away_team_score', 'player_id', 'player_team_id']

In [168]:
stats = stats[['ast', 'blk', 'dreb', 'fg3_pct', 'fg3a', 'fg3m', 'fg_pct', 'fga', 'fgm',
       'ft_pct', 'fta', 'ftm', 'min', 'oreb', 'pf', 'pts', 'reb', 'stl',
       'turnover', 'player_id', 'player_team_id', 'game_id', 'game_date', 'season', 'home_team_id',
       'home_team_score','away_team_id',
       'away_team_score']]

In [169]:
stats

Unnamed: 0_level_0,ast,blk,dreb,fg3_pct,fg3a,fg3m,fg_pct,fga,fgm,ft_pct,fta,ftm,min,oreb,pf,pts,reb,stl,turnover,player_id,player_team_id,game_id,game_date,season,home_team_id,home_team_score,away_team_id,away_team_score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
1069008,0.0,1.0,2.0,0.200,5.0,1.0,0.333,9.0,3.0,0.000,0.0,0.0,20:08,0.0,0.0,7.0,2.0,0.0,2.0,415.0,26,45237,2019-01-17T00:00:00.000Z,2018,4,114,26,95
1069009,4.0,0.0,5.0,0.000,2.0,0.0,0.200,5.0,1.0,0.000,0.0,0.0,19:22,1.0,1.0,2.0,6.0,0.0,0.0,49.0,26,45237,2019-01-17T00:00:00.000Z,2018,4,114,26,95
1069010,4.0,1.0,5.0,0.000,0.0,0.0,0.667,6.0,4.0,0.000,0.0,0.0,27:24,6.0,2.0,8.0,11.0,3.0,2.0,91.0,26,45237,2019-01-17T00:00:00.000Z,2018,4,114,26,95
1069011,1.0,0.0,1.0,0.545,11.0,6.0,0.500,18.0,9.0,0.000,0.0,0.0,32:06,0.0,2.0,24.0,1.0,2.0,0.0,210.0,26,45237,2019-01-17T00:00:00.000Z,2018,4,114,26,95
1069012,8.0,1.0,5.0,0.000,2.0,0.0,0.400,10.0,4.0,0.667,3.0,2.0,30:30,0.0,4.0,10.0,5.0,1.0,4.0,161.0,26,45237,2019-01-17T00:00:00.000Z,2018,4,114,26,95
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7286561,1.0,0.0,2.0,0.000,2.0,0.0,0.000,2.0,0.0,0.000,0.0,0.0,13:17,0.0,2.0,0.0,2.0,0.0,0.0,17554006.0,26,470197,2021-08-17T00:00:00.000Z,2021,2,67,26,100
7286573,0.0,1.0,2.0,20.000,5.0,1.0,16.700,6.0,1.0,100.000,2.0,2.0,11:04,0.0,0.0,5.0,2.0,0.0,0.0,17895875.0,2,470197,2021-08-17T00:00:00.000Z,2021,2,67,26,100
7286575,1.0,1.0,1.0,0.000,1.0,0.0,0.000,1.0,0.0,0.000,0.0,0.0,6:52,0.0,2.0,0.0,1.0,0.0,1.0,666716.0,2,470197,2021-08-17T00:00:00.000Z,2021,2,67,26,100
7286576,0.0,0.0,1.0,0.000,1.0,0.0,33.300,3.0,1.0,50.000,2.0,1.0,4:08,1.0,1.0,3.0,2.0,0.0,1.0,17895732.0,2,470197,2021-08-17T00:00:00.000Z,2021,2,67,26,100


In [170]:
stats.to_csv("data/all_stats_clean.csv")