# Title: NBA Games - Data Preparation Part 2
## Author: Andy Orie
## Date: 5th December 2022

### *Objectives:*
1. Read in the two files previously processed.
2. Clean the data.
3. Create a Team Roster dictionary and dataframe with the player names, positions and teams.
4. Create a Team name dictionary and dataframe with the team ID and name.

In [1]:
# Global Settings and Imported Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import os


%matplotlib inline
%config InlineBackend.figure_format='retina'

from matplotlib import rcParams
# Specify the figure size in inches, for both X, and Y axes
rcParams['figure.figsize'] = 12,5

from matplotlib import style
from IPython.display import display

In [2]:
# Import script 'chkdir.py' to check the working directory. "Y" if you are in the correct working directory, else "N".

import chkdir

--------------------------------------------------------------------------------
The Current Working Directory is: /Users/andyorie/Desktop/In_Use_Folders/Udacity/jupyter_notebooks/NBA_Games
Is this the correct directory (Y/ N)? y
Great!, Let us proceed


In [3]:
# Load the team/player ranking table and preview its first five rows.

tpr_df = pd.read_csv(filepath_or_buffer='team_players_ranking.csv')
display(tpr_df.iloc[:5])

Unnamed: 0.1,Unnamed: 0,TEAM_ID,CONFERENCE,TEAM,G,W,L,W_PCT,HOME_RECORD,ROAD_RECORD,SEASON,ABBREVIATION,NICKNAME,CITY,ARENACAPACITY,HEADCOACH,PLAYER_NAME,PLAYER_ID
0,0,1610612756,West,Phoenix,67,53,14,0.791,28-8,25-6,2021,,,,,,,
1,1,1610612756,West,Phoenix,67,53,14,0.791,28-8,25-6,2021,,,,,,,
2,2,1610612756,West,Phoenix,66,53,13,0.803,28-7,25-6,2021,,,,,,,
3,3,1610612756,West,Phoenix,66,53,13,0.803,28-7,25-6,2021,,,,,,,
4,4,1610612756,West,Phoenix,65,52,13,0.8,28-7,24-6,2021,,,,,,,


In [4]:
# Remove the 'Unnamed: 0' column (presumably a row index that an earlier
# to_csv call wrote into the file - confirm against Data Preparation Part 1).
# drop(..., errors='ignore') keeps the cell re-runnable: a second run is a
# no-op, whereas tpr_df.pop('Unnamed: 0') raises KeyError once the column is gone.

tpr_df = tpr_df.drop(columns=['Unnamed: 0'], errors='ignore')
display(tpr_df.head(5))

Unnamed: 0,TEAM_ID,CONFERENCE,TEAM,G,W,L,W_PCT,HOME_RECORD,ROAD_RECORD,SEASON,ABBREVIATION,NICKNAME,CITY,ARENACAPACITY,HEADCOACH,PLAYER_NAME,PLAYER_ID
0,1610612756,West,Phoenix,67,53,14,0.791,28-8,25-6,2021,,,,,,,
1,1610612756,West,Phoenix,67,53,14,0.791,28-8,25-6,2021,,,,,,,
2,1610612756,West,Phoenix,66,53,13,0.803,28-7,25-6,2021,,,,,,,
3,1610612756,West,Phoenix,66,53,13,0.803,28-7,25-6,2021,,,,,,,
4,1610612756,West,Phoenix,65,52,13,0.8,28-7,24-6,2021,,,,,,,


In [5]:
# Build the team roster frame: team standings columns plus the player name/ID.

roster_columns = [
    'TEAM_ID', 'SEASON', 'CONFERENCE', 'TEAM',
    'G', 'W', 'L', 'W_PCT',
    'HOME_RECORD', 'ROAD_RECORD',
    'PLAYER_NAME', 'PLAYER_ID',
]
team_roster = tpr_df.loc[:, roster_columns]
display(team_roster)

Unnamed: 0,TEAM_ID,SEASON,CONFERENCE,TEAM,G,W,L,W_PCT,HOME_RECORD,ROAD_RECORD,PLAYER_NAME,PLAYER_ID
0,1610612756,2021,West,Phoenix,67,53,14,0.791,28-8,25-6,,
1,1610612756,2021,West,Phoenix,67,53,14,0.791,28-8,25-6,,
2,1610612756,2021,West,Phoenix,66,53,13,0.803,28-7,25-6,,
3,1610612756,2021,West,Phoenix,66,53,13,0.803,28-7,25-6,,
4,1610612756,2021,West,Phoenix,65,52,13,0.800,28-7,24-6,,
...,...,...,...,...,...,...,...,...,...,...,...,...
2766339,1610612752,2014,East,New York,0,0,0,0.000,0-0,0-0,JR Smith,2747.0
2766340,1610612752,2014,East,New York,0,0,0,0.000,0-0,0-0,Iman Shumpert,202697.0
2766341,1610612752,2014,East,New York,0,0,0,0.000,0-0,0-0,Travis Outlaw,2566.0
2766342,1610612752,2014,East,New York,0,0,0,0.000,0-0,0-0,DJ Mbenga,2788.0


In [6]:
# Keep only the rows where both the player name and the player ID are present.

has_player = team_roster[['PLAYER_NAME', 'PLAYER_ID']].notna().all(axis=1)
mod_team_roster = team_roster[has_player]
mod_team_roster

Unnamed: 0,TEAM_ID,SEASON,CONFERENCE,TEAM,G,W,L,W_PCT,HOME_RECORD,ROAD_RECORD,PLAYER_NAME,PLAYER_ID
13739,1610612747,2019,West,L.A. Lakers,71,52,19,0.732,25-10,27-9,LeBron James,2544.0
13740,1610612747,2019,West,L.A. Lakers,71,52,19,0.732,25-10,27-9,Anthony Davis,203076.0
13741,1610612747,2019,West,L.A. Lakers,71,52,19,0.732,25-10,27-9,JaVale McGee,201580.0
13742,1610612747,2019,West,L.A. Lakers,71,52,19,0.732,25-10,27-9,Danny Green,201980.0
13743,1610612747,2019,West,L.A. Lakers,71,52,19,0.732,25-10,27-9,Kentavious Caldwell-Pope,203484.0
...,...,...,...,...,...,...,...,...,...,...,...,...
2766339,1610612752,2014,East,New York,0,0,0,0.000,0-0,0-0,JR Smith,2747.0
2766340,1610612752,2014,East,New York,0,0,0,0.000,0-0,0-0,Iman Shumpert,202697.0
2766341,1610612752,2014,East,New York,0,0,0,0.000,0-0,0-0,Travis Outlaw,2566.0
2766342,1610612752,2014,East,New York,0,0,0,0.000,0-0,0-0,DJ Mbenga,2788.0


In [7]:
# Summarise the cleaned roster: row count, column dtypes and memory usage.
mod_team_roster.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2687222 entries, 13739 to 2766343
Data columns (total 12 columns):
 #   Column       Dtype  
---  ------       -----  
 0   TEAM_ID      int64  
 1   SEASON       int64  
 2   CONFERENCE   object 
 3   TEAM         object 
 4   G            int64  
 5   W            int64  
 6   L            int64  
 7   W_PCT        float64
 8   HOME_RECORD  object 
 9   ROAD_RECORD  object 
 10  PLAYER_NAME  object 
 11  PLAYER_ID    float64
dtypes: float64(2), int64(5), object(5)
memory usage: 266.5+ MB


In [8]:
# Which seasons does the cleaned roster cover?
# `season_listing` keeps the array of distinct seasons (it is reused for input
# validation further down); the sorted list is only displayed.

season_listing = mod_team_roster['SEASON'].unique()
sorted(season_listing)

[2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019]

In [9]:
# List the distinct team names in the roster and report how many there are.
# NOTE: TEAM holds display names, so spelling variants of one franchise
# (e.g. 'LA Clippers' and 'L.A. Clippers') are counted as separate entries.

teams_considered = mod_team_roster['TEAM'].unique()
team_count = len(teams_considered)

display(teams_considered)
print('-------------------------')
print()
print(f'There are {team_count} teams in our dataframe')

array(['L.A. Lakers', 'LA Clippers', 'Denver', 'Houston', 'Oklahoma City',
       'Utah', 'Dallas', 'Portland', 'Memphis', 'Phoenix', 'San Antonio',
       'Sacramento', 'New Orleans', 'Minnesota', 'Golden State',
       'Milwaukee', 'Toronto', 'Boston', 'Indiana', 'Miami',
       'Philadelphia', 'Brooklyn', 'Orlando', 'Washington', 'Charlotte',
       'Chicago', 'New York', 'Detroit', 'Atlanta', 'Cleveland',
       'L.A. Clippers', 'New Jersey'], dtype=object)

-------------------------

There are 32 teams in our dataframe


In [10]:
# Largest games-played value (G) recorded for each team in each season.

group_keys = ['TEAM_ID', 'SEASON', 'CONFERENCE', 'TEAM']
gp_roster = mod_team_roster.groupby(by=group_keys, as_index=False)
gp_roster_max_games = gp_roster['G'].max()

display(gp_roster_max_games)

Unnamed: 0,TEAM_ID,SEASON,CONFERENCE,TEAM,G
0,1610612737,2009,East,Atlanta,82
1,1610612737,2010,East,Atlanta,82
2,1610612737,2011,East,Atlanta,66
3,1610612737,2012,East,Atlanta,82
4,1610612737,2013,East,Atlanta,82
...,...,...,...,...,...
325,1610612766,2015,East,Charlotte,82
326,1610612766,2016,East,Charlotte,82
327,1610612766,2017,East,Charlotte,82
328,1610612766,2018,East,Charlotte,82


In [29]:
# Look up the number of games played by a team in a specific season.
#
# Input handling:
#   * a non-numeric season is reported as invalid instead of crashing the cell
#     with ValueError from int();
#   * team names are matched case-insensitively against the real names in the
#     listing. The earlier str.title() normalisation turned 'LA Clippers' into
#     'La Clippers', which is never in the listing, so that team could not be
#     selected.

prompt1 = 'Which season would you like to see (2009 to 2019)? '
prompt2 = 'Which team would you like to see?: '


team_listing = gp_roster_max_games['TEAM'].unique()
# Lower-cased spelling -> canonical team name as stored in the dataframe.
team_lookup = {name.lower(): name for name in team_listing}

print('Here are the teams to choose from: ', team_listing)
print(100*'-')

while True:
    try:
        season_pass = int(input(prompt1))
    except ValueError:
        print('Invalid Season')
        continue
    if season_pass not in season_listing:
        print('Invalid Season')
        continue

    # Resolve the typed name to its canonical spelling (None when unknown).
    team_selected = team_lookup.get(input(prompt2).strip().lower())
    if team_selected is None:
        print('Invalid team choosen')
        continue

    print(100*'-')
    season_mask = gp_roster_max_games['SEASON'] == season_pass
    team_mask = gp_roster_max_games['TEAM'] == team_selected
    print(gp_roster_max_games.loc[season_mask & team_mask])
    break

Here are the teams to choose from:  ['Atlanta' 'Boston' 'Cleveland' 'New Orleans' 'Chicago' 'Dallas' 'Denver'
 'Golden State' 'Houston' 'L.A. Clippers' 'LA Clippers' 'L.A. Lakers'
 'Miami' 'Milwaukee' 'Minnesota' 'New Jersey' 'Brooklyn' 'New York'
 'Orlando' 'Indiana' 'Philadelphia' 'Phoenix' 'Portland' 'Sacramento'
 'San Antonio' 'Oklahoma City' 'Toronto' 'Utah' 'Memphis' 'Washington'
 'Detroit' 'Charlotte']
----------------------------------------------------------------------------------------------------
Which season would you like to see (2009 to 2019)? 2009
Which team would you like to see?: boston
----------------------------------------------------------------------------------------------------
       TEAM_ID  SEASON CONFERENCE    TEAM   G
11  1610612738    2009       East  Boston  82


In [None]:
# Save the cleaned team roster for later use.
# NOTE(review): to_csv writes the DataFrame index as an unnamed first column by
# default; reading this file back with pd.read_csv will surface it as
# 'Unnamed: 0' unless index_col=0 is used there (or index=False is passed here).

mod_team_roster.to_csv('team_roster_2009_2019.csv')

In [30]:
# Load the games file prepared in 'Data Preparation Part1' and preview five rows.

gms_df = pd.read_csv(filepath_or_buffer='games_mod.csv')
display(gms_df.iloc[:5])

Unnamed: 0,GAME_DATE,GM_YR,GM_MTH,GM_DAY,GAME_ID,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,PTS_home,FG_PCT_home,...,FG3_PCT_home,AST_home,REB_home,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS
0,2022-03-12,2022,3,12,22101005,1610612748,1610612750,2021,104.0,0.398,...,0.333,23.0,53.0,113.0,0.422,0.875,0.357,21.0,46.0,0
1,2022-03-12,2022,3,12,22101006,1610612741,1610612739,2021,101.0,0.443,...,0.429,20.0,46.0,91.0,0.419,0.824,0.208,19.0,40.0,1
2,2022-03-12,2022,3,12,22101007,1610612759,1610612754,2021,108.0,0.412,...,0.324,28.0,52.0,119.0,0.489,1.0,0.389,23.0,47.0,0
3,2022-03-12,2022,3,12,22101008,1610612744,1610612749,2021,122.0,0.484,...,0.4,33.0,55.0,109.0,0.413,0.696,0.386,27.0,39.0,1
4,2022-03-12,2022,3,12,22101009,1610612743,1610612761,2021,115.0,0.551,...,0.407,32.0,39.0,127.0,0.471,0.76,0.387,28.0,50.0,0


In [42]:
# How many non-null values does each gms_df column have?
# The row count is used later when comparing gms_df with tpr_df.

gms_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25697 entries, 0 to 25696
Data columns (total 21 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   GAME_DATE        25697 non-null  object 
 1   GM_YR            25697 non-null  int64  
 2   GM_MTH           25697 non-null  int64  
 3   GM_DAY           25697 non-null  int64  
 4   GAME_ID          25697 non-null  int64  
 5   HOME_TEAM_ID     25697 non-null  int64  
 6   VISITOR_TEAM_ID  25697 non-null  int64  
 7   SEASON           25697 non-null  int64  
 8   PTS_home         25697 non-null  float64
 9   FG_PCT_home      25697 non-null  float64
 10  FT_PCT_home      25697 non-null  float64
 11  FG3_PCT_home     25697 non-null  float64
 12  AST_home         25697 non-null  float64
 13  REB_home         25697 non-null  float64
 14  PTS_away         25697 non-null  float64
 15  FG_PCT_away      25697 non-null  float64
 16  FT_PCT_away      25697 non-null  float64
 17  FG3_PCT_away

In [31]:
# How many distinct teams appear in the HOME_TEAM_ID column of gms_df?

gms_df['HOME_TEAM_ID'].nunique()

30

In [32]:
# How many distinct teams appear in the TEAM_ID column of tpr_df?

tpr_df['TEAM_ID'].nunique()

30

In [41]:
# How many gms_df rows have a HOME_TEAM_ID that also exists as a TEAM_ID in tpr_df?
# np.isin returns one boolean per gms_df row, so len() of that mask is always
# len(gms_df) regardless of the data. Summing the mask counts the rows that
# actually matched; the result should equal the gms_df row count.

np.isin(gms_df['HOME_TEAM_ID'], tpr_df['TEAM_ID']).sum()

25697

In [43]:
# How many distinct team IDs do the two datasets have in common?
# np.intersect1d returns the sorted, unique values present in both inputs.

len(np.intersect1d(tpr_df['TEAM_ID'], gms_df['HOME_TEAM_ID']))

30

In [37]:
# Build a lookup holding, for every team, the row index of its first appearance
# in tpr_df, its TEAM_ID and its TEAM name.

# `unique_teams` is not defined in any earlier cell of this notebook, so derive
# it here. Series.unique() keeps the order of first appearance.
unique_teams = tpr_df['TEAM_ID'].unique()

# One row per TEAM_ID (its first occurrence). Reading the index and the name
# from the same rows avoids feeding index labels to positional .iloc.
first_rows = tpr_df.drop_duplicates(subset='TEAM_ID', keep='first')

index_list = first_rows.index.tolist()
team_id_list = first_rows['TEAM_ID'].tolist()
team_list = first_rows['TEAM'].tolist()

team_dict = {
    'Index': index_list,
    'Team_ID': team_id_list,
    'Team': team_list,
}

print(team_dict)

{'Index': [0, 161, 322, 483, 644, 805, 966, 1127, 1288, 1449, 1610, 1771, 1932, 2093, 2254, 2415, 2576, 2737, 2898, 3059, 3220, 3381, 3542, 3703, 3864, 4025, 4186, 4347, 4508, 4669], 'Team_ID': [1610612756, 1610612744, 1610612763, 1610612762, 1610612742, 1610612743, 1610612750, 1610612746, 1610612747, 1610612740, 1610612757, 1610612759, 1610612758, 1610612760, 1610612745, 1610612748, 1610612749, 1610612755, 1610612741, 1610612738, 1610612739, 1610612761, 1610612751, 1610612766, 1610612737, 1610612764, 1610612752, 1610612754, 1610612765, 1610612753], 'Team': ['Phoenix', 'Golden State', 'Memphis', 'Utah', 'Dallas', 'Denver', 'Minnesota', 'LA Clippers', 'L.A. Lakers', 'New Orleans', 'Portland', 'San Antonio', 'Sacramento', 'Oklahoma City', 'Houston', 'Miami', 'Milwaukee', 'Philadelphia', 'Chicago', 'Boston', 'Cleveland', 'Toronto', 'Brooklyn', 'Charlotte', 'Atlanta', 'Washington', 'New York', 'Indiana', 'Detroit', 'Orlando']}


In [38]:
# Show the dictionary's (key, value) pairs ordered alphabetically by key.
# This only produces a sorted list for display; team_dict itself is unchanged.

sorted(team_dict.items())

[('Index',
  [0,
   161,
   322,
   483,
   644,
   805,
   966,
   1127,
   1288,
   1449,
   1610,
   1771,
   1932,
   2093,
   2254,
   2415,
   2576,
   2737,
   2898,
   3059,
   3220,
   3381,
   3542,
   3703,
   3864,
   4025,
   4186,
   4347,
   4508,
   4669]),
 ('Team',
  ['Phoenix',
   'Golden State',
   'Memphis',
   'Utah',
   'Dallas',
   'Denver',
   'Minnesota',
   'LA Clippers',
   'L.A. Lakers',
   'New Orleans',
   'Portland',
   'San Antonio',
   'Sacramento',
   'Oklahoma City',
   'Houston',
   'Miami',
   'Milwaukee',
   'Philadelphia',
   'Chicago',
   'Boston',
   'Cleveland',
   'Toronto',
   'Brooklyn',
   'Charlotte',
   'Atlanta',
   'Washington',
   'New York',
   'Indiana',
   'Detroit',
   'Orlando']),
 ('Team_ID',
  [1610612756,
   1610612744,
   1610612763,
   1610612762,
   1610612742,
   1610612743,
   1610612750,
   1610612746,
   1610612747,
   1610612740,
   1610612757,
   1610612759,
   1610612758,
   1610612760,
   1610612745,
   1610612748,


In [39]:
# Print each key of the dictionary followed by its list of values.

for label, entries in team_dict.items():
    heading = f'For key \'{label}\' in the dictionary here are it\'s values:'
    print(heading)
    print('\n', entries)
    print()

For key 'Index' in the dictionary here are it's values:

 [0, 161, 322, 483, 644, 805, 966, 1127, 1288, 1449, 1610, 1771, 1932, 2093, 2254, 2415, 2576, 2737, 2898, 3059, 3220, 3381, 3542, 3703, 3864, 4025, 4186, 4347, 4508, 4669]

For key 'Team_ID' in the dictionary here are it's values:

 [1610612756, 1610612744, 1610612763, 1610612762, 1610612742, 1610612743, 1610612750, 1610612746, 1610612747, 1610612740, 1610612757, 1610612759, 1610612758, 1610612760, 1610612745, 1610612748, 1610612749, 1610612755, 1610612741, 1610612738, 1610612739, 1610612761, 1610612751, 1610612766, 1610612737, 1610612764, 1610612752, 1610612754, 1610612765, 1610612753]

For key 'Team' in the dictionary here are it's values:

 ['Phoenix', 'Golden State', 'Memphis', 'Utah', 'Dallas', 'Denver', 'Minnesota', 'LA Clippers', 'L.A. Lakers', 'New Orleans', 'Portland', 'San Antonio', 'Sacramento', 'Oklahoma City', 'Houston', 'Miami', 'Milwaukee', 'Philadelphia', 'Chicago', 'Boston', 'Cleveland', 'Toronto', 'Brooklyn', '

In [40]:
# Turn the dictionary into a dataframe and save it for future use.

team_dictionary = pd.DataFrame(data=team_dict)
team_dictionary.to_csv('team_dictionary.csv')

In [44]:
# Take another quick look at the first five rows of gms_df.

gms_df.head(5)

Unnamed: 0,GAME_DATE,GM_YR,GM_MTH,GM_DAY,GAME_ID,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,PTS_home,FG_PCT_home,...,FG3_PCT_home,AST_home,REB_home,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS
0,2022-03-12,2022,3,12,22101005,1610612748,1610612750,2021,104.0,0.398,...,0.333,23.0,53.0,113.0,0.422,0.875,0.357,21.0,46.0,0
1,2022-03-12,2022,3,12,22101006,1610612741,1610612739,2021,101.0,0.443,...,0.429,20.0,46.0,91.0,0.419,0.824,0.208,19.0,40.0,1
2,2022-03-12,2022,3,12,22101007,1610612759,1610612754,2021,108.0,0.412,...,0.324,28.0,52.0,119.0,0.489,1.0,0.389,23.0,47.0,0
3,2022-03-12,2022,3,12,22101008,1610612744,1610612749,2021,122.0,0.484,...,0.4,33.0,55.0,109.0,0.413,0.696,0.386,27.0,39.0,1
4,2022-03-12,2022,3,12,22101009,1610612743,1610612761,2021,115.0,0.551,...,0.407,32.0,39.0,127.0,0.471,0.76,0.387,28.0,50.0,0


In [45]:
#Let's create a function to find the team name based on the team ID. Here we use the dictionary created previously.
    
def team_name_add(game_team_id, team_ids=None, team_names=None):
    """Return the team name that corresponds to ``game_team_id``.

    Parameters
    ----------
    game_team_id
        Team ID to translate, e.g. a HOME_TEAM_ID or VISITOR_TEAM_ID value
        taken from gms_df.
    team_ids : sequence, optional
        IDs to search. Defaults to the notebook-level ``team_id_list``.
    team_names : sequence, optional
        Names aligned position-by-position with ``team_ids``. Defaults to the
        notebook-level ``team_list``.

    Returns
    -------
    The name stored at the position of the first ID equal to ``game_team_id``,
    or ``None`` when the ID is not present.
    """
    # Fall back to the lists built alongside team_dict when none are supplied,
    # so existing single-argument calls keep working.
    if team_ids is None:
        team_ids = team_id_list
    if team_names is None:
        team_names = team_list

    # Single pass over the aligned sequences; the first match wins.
    for known_id, name in zip(team_ids, team_names):
        if known_id == game_team_id:
            return name
    return None

In [46]:
# Insert the home team names into gms_df, directly after HOME_TEAM_ID.
# DataFrame.insert raises ValueError when the column already exists, so the
# guard keeps this cell safe to re-run. The position is taken from the ID
# column instead of a hard-coded number.
if 'HOME_TEAM_NAME' not in gms_df.columns:
    team_name_col = list(map(team_name_add, gms_df['HOME_TEAM_ID']))
    home_name_pos = gms_df.columns.get_loc('HOME_TEAM_ID') + 1
    gms_df.insert(home_name_pos, 'HOME_TEAM_NAME', team_name_col)
gms_df.head()

Unnamed: 0,GAME_DATE,GM_YR,GM_MTH,GM_DAY,GAME_ID,HOME_TEAM_ID,HOME_TEAM_NAME,VISITOR_TEAM_ID,SEASON,PTS_home,...,FG3_PCT_home,AST_home,REB_home,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS
0,2022-03-12,2022,3,12,22101005,1610612748,Miami,1610612750,2021,104.0,...,0.333,23.0,53.0,113.0,0.422,0.875,0.357,21.0,46.0,0
1,2022-03-12,2022,3,12,22101006,1610612741,Chicago,1610612739,2021,101.0,...,0.429,20.0,46.0,91.0,0.419,0.824,0.208,19.0,40.0,1
2,2022-03-12,2022,3,12,22101007,1610612759,San Antonio,1610612754,2021,108.0,...,0.324,28.0,52.0,119.0,0.489,1.0,0.389,23.0,47.0,0
3,2022-03-12,2022,3,12,22101008,1610612744,Golden State,1610612749,2021,122.0,...,0.4,33.0,55.0,109.0,0.413,0.696,0.386,27.0,39.0,1
4,2022-03-12,2022,3,12,22101009,1610612743,Denver,1610612761,2021,115.0,...,0.407,32.0,39.0,127.0,0.471,0.76,0.387,28.0,50.0,0


In [47]:
# Insert the visiting team names into gms_df, directly after VISITOR_TEAM_ID.
# DataFrame.insert raises ValueError when the column already exists, so the
# guard keeps this cell safe to re-run. The position is taken from the ID
# column, so it does not depend on which other columns were added before.

if 'VISITOR_TEAM_NAME' not in gms_df.columns:
    visitor_team_name_col = list(map(team_name_add, gms_df['VISITOR_TEAM_ID']))
    visitor_name_pos = gms_df.columns.get_loc('VISITOR_TEAM_ID') + 1
    gms_df.insert(visitor_name_pos, 'VISITOR_TEAM_NAME', visitor_team_name_col)
gms_df.head()

Unnamed: 0,GAME_DATE,GM_YR,GM_MTH,GM_DAY,GAME_ID,HOME_TEAM_ID,HOME_TEAM_NAME,VISITOR_TEAM_ID,VISITOR_TEAM_NAME,SEASON,...,FG3_PCT_home,AST_home,REB_home,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS
0,2022-03-12,2022,3,12,22101005,1610612748,Miami,1610612750,Minnesota,2021,...,0.333,23.0,53.0,113.0,0.422,0.875,0.357,21.0,46.0,0
1,2022-03-12,2022,3,12,22101006,1610612741,Chicago,1610612739,Cleveland,2021,...,0.429,20.0,46.0,91.0,0.419,0.824,0.208,19.0,40.0,1
2,2022-03-12,2022,3,12,22101007,1610612759,San Antonio,1610612754,Indiana,2021,...,0.324,28.0,52.0,119.0,0.489,1.0,0.389,23.0,47.0,0
3,2022-03-12,2022,3,12,22101008,1610612744,Golden State,1610612749,Milwaukee,2021,...,0.4,33.0,55.0,109.0,0.413,0.696,0.386,27.0,39.0,1
4,2022-03-12,2022,3,12,22101009,1610612743,Denver,1610612761,Toronto,2021,...,0.407,32.0,39.0,127.0,0.471,0.76,0.387,28.0,50.0,0


In [48]:
# Check that the two new name columns are fully populated: their non-null
# counts should equal the number of rows.

gms_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25697 entries, 0 to 25696
Data columns (total 23 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   GAME_DATE          25697 non-null  object 
 1   GM_YR              25697 non-null  int64  
 2   GM_MTH             25697 non-null  int64  
 3   GM_DAY             25697 non-null  int64  
 4   GAME_ID            25697 non-null  int64  
 5   HOME_TEAM_ID       25697 non-null  int64  
 6   HOME_TEAM_NAME     25697 non-null  object 
 7   VISITOR_TEAM_ID    25697 non-null  int64  
 8   VISITOR_TEAM_NAME  25697 non-null  object 
 9   SEASON             25697 non-null  int64  
 10  PTS_home           25697 non-null  float64
 11  FG_PCT_home        25697 non-null  float64
 12  FT_PCT_home        25697 non-null  float64
 13  FG3_PCT_home       25697 non-null  float64
 14  AST_home           25697 non-null  float64
 15  REB_home           25697 non-null  float64
 16  PTS_away           256

In [49]:
# Compare the number of distinct GAME_ID values with the number of rows;
# a smaller distinct count means some game IDs are repeated.

game_ids = gms_df['GAME_ID']
print(game_ids.nunique(), len(game_ids), sep='\n\n')

25668

25697


In [50]:
# Count the null values in the 'GAME_ID' column.

gms_df['GAME_ID'].isna().sum()

0

In [51]:
# Approach 1: there appear to be 29 duplicated GAME_ID values. Collect every
# game ID that occurs more than once.

from collections import Counter

arr = gms_df['GAME_ID']

# Occurrences of each game ID.
count_dict = Counter(arr)

duplicate_list = [game_id for game_id, occurrences in count_dict.items() if occurrences > 1]

In [52]:
# Print the duplicated game IDs in ascending order (duplicate_list itself is not modified).

print(sorted(duplicate_list))

[22000006, 22000007, 22000011, 22000013, 22000015, 22000018, 22000021, 22000022, 22000027, 22000028, 22000030, 22000032, 22000034, 22000037, 22000038, 22000042, 22000044, 22000046, 22000049, 22000051, 22000053, 22000055, 22000057, 22000065, 22000067, 22000070, 22000072, 22000076, 22000077]


In [53]:
# How many distinct game IDs are duplicated?

len(duplicate_list)

29

In [None]:
#print(count_dict)

In [54]:
# Inspect the rows of gms_df that share a given GAME_ID.
#
# Input handling: a non-numeric entry is reported as invalid instead of
# crashing the cell with ValueError from int(), and the 8-digit rule is
# checked on the numeric range so negative numbers are rejected too.

prompt = 'Enter 8-digit game ID: '

while True:
    try:
        gm_id_focus = int(input(prompt))
    except ValueError:
        print('Invalid ID')
        continue

    # Exactly eight digits <=> 10000000 <= id <= 99999999.
    if not 10_000_000 <= gm_id_focus <= 99_999_999:
        print('Invalid ID')
        continue

    display(gms_df[gms_df['GAME_ID'] == gm_id_focus])

    # Anything other than 'Y'/'y' ends the loop.
    exit_prompt = input('Would you like to try again (Y/ N)? ')
    if exit_prompt.upper() != 'Y':
        break

Enter 8-digit game ID: 22000006


Unnamed: 0,GAME_DATE,GM_YR,GM_MTH,GM_DAY,GAME_ID,HOME_TEAM_ID,HOME_TEAM_NAME,VISITOR_TEAM_ID,VISITOR_TEAM_NAME,SEASON,...,FG3_PCT_home,AST_home,REB_home,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS
1680,2020-12-25,2020,12,25,22000006,1610612749,Milwaukee,1610612744,Golden State,2020,...,0.556,25.0,60.0,99.0,0.343,0.84,0.222,14.0,43.0,1
2270,2020-12-25,2020,12,25,22000006,1610612749,Milwaukee,1610612744,Golden State,2020,...,0.556,25.0,60.0,99.0,0.343,0.84,0.222,14.0,43.0,1


Would you like to try again (Y/ N)? n


In [55]:
# Keep only the first row for every GAME_ID and drop the repeats.
# Reassigning instead of using inplace=True follows the pandas recommendation
# and keeps the step explicit; the assertion documents the invariant that the
# rest of the notebook relies on.

gms_df = gms_df.drop_duplicates(subset=['GAME_ID'], keep='first')
assert gms_df['GAME_ID'].is_unique

In [56]:
# Check the shape of the dataset after removing the duplicated games.

gms_df.shape

(25668, 23)

In [None]:
# Save the cleaned gms_df under a new file name.
# NOTE(review): to_csv writes the DataFrame index as an unnamed first column by
# default; reading this file back with pd.read_csv will surface it as
# 'Unnamed: 0' unless index_col=0 is used there (or index=False is passed here).

gms_df.to_csv('game_boxscores.csv')

End of Code