In [1]:
import pandas as pd
import numpy as nop
import sqlite3

In [2]:
nba = pd.read_csv('https://raw.githubusercontent.com/jkropko/contrans/main/examples/ASA%20All%20NBA%20Raw%20Data.csv')

In [3]:
nba.columns

Index(['game_id', 'game_date', 'OT', 'H_A', 'Team_Abbrev', 'Team_Score',
       'Team_pace', 'Team_efg_pct', 'Team_tov_pct', 'Team_orb_pct',
       'Team_ft_rate', 'Team_off_rtg', 'Inactives', 'Opponent_Abbrev',
       'Opponent_Score', 'Opponent_pace', 'Opponent_efg_pct',
       'Opponent_tov_pct', 'Opponent_orb_pct', 'Opponent_ft_rate',
       'Opponent_off_rtg', 'player', 'player_id', 'starter', 'mp', 'fg', 'fga',
       'fg_pct', 'fg3', 'fg3a', 'fg3_pct', 'ft', 'fta', 'ft_pct', 'orb', 'drb',
       'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts', 'plus_minus',
       'did_not_play', 'is_inactive', 'ts_pct', 'efg_pct', 'fg3a_per_fga_pct',
       'fta_per_fga_pct', 'orb_pct', 'drb_pct', 'trb_pct', 'ast_pct',
       'stl_pct', 'blk_pct', 'tov_pct', 'usg_pct', 'off_rtg', 'def_rtg', 'bpm',
       'season', 'minutes', 'double_double', 'triple_double', 'DKP', 'FDP',
       'SDP', 'DKP_per_minute', 'FDP_per_minute', 'SDP_per_minute',
       'pf_per_minute', 'ts', 'last_60_minutes_per_game_s

In [4]:
pd.set_option('display.max_rows', 90)
nba.head(3).T

Unnamed: 0,0,1,2
game_id,202204100BRK,202204100BRK,202204100BRK
game_date,2022-04-10,2022-04-10,2022-04-10
OT,0,0,0
H_A,A,A,A
Team_Abbrev,IND,IND,IND
Team_Score,126,126,126
Team_pace,103.9,103.9,103.9
Team_efg_pct,0.543,0.543,0.543
Team_tov_pct,5.9,5.9,5.9
Team_orb_pct,20.8,20.8,20.8


## Narrow Data to Columns We Care About

In [5]:
nba = nba[['game_id', 'game_date', 'OT', 'H_A', 'Team_Abbrev', 'Team_Score',
       'Team_pace', 'Team_efg_pct', 'Team_tov_pct', 'Team_orb_pct',
       'Team_ft_rate', 'Team_off_rtg', 'Inactives', 'Opponent_Abbrev',
       'player', 'player_id', 'starter', 'mp', 'fg', 'fga',
       'fg_pct', 'fg3', 'fg3a', 'fg3_pct', 'ft', 'fta', 'ft_pct', 'orb', 'drb',
       'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts', 'plus_minus',
       'did_not_play', 'is_inactive', 'off_rtg', 'def_rtg', 'bpm']]

In [6]:
pd.set_option('display.max_rows', 81)
nba.head(3).T

Unnamed: 0,0,1,2
game_id,202204100BRK,202204100BRK,202204100BRK
game_date,2022-04-10,2022-04-10,2022-04-10
OT,0,0,0
H_A,A,A,A
Team_Abbrev,IND,IND,IND
Team_Score,126,126,126
Team_pace,103.9,103.9,103.9
Team_efg_pct,0.543,0.543,0.543
Team_tov_pct,5.9,5.9,5.9
Team_orb_pct,20.8,20.8,20.8


## First Normal Form
- superkey is player_id and game_id : have a primary key
- Non-atomic data? Yes- the inactives list
    - We have this inactives list in the inactive column (a 1 or 0 entry) so we can just drop it
- No repeating groups? No because we dealt with the non-atomic data already

Let's fix the atomic data problem:

In [7]:
nba= nba.drop(['Inactives'], axis =1)
#Drops inactives column

In [8]:
nba.head(3).T

Unnamed: 0,0,1,2
game_id,202204100BRK,202204100BRK,202204100BRK
game_date,2022-04-10,2022-04-10,2022-04-10
OT,0,0,0
H_A,A,A,A
Team_Abbrev,IND,IND,IND
Team_Score,126,126,126
Team_pace,103.9,103.9,103.9
Team_efg_pct,0.543,0.543,0.543
Team_tov_pct,5.9,5.9,5.9
Team_orb_pct,20.8,20.8,20.8


## Second Normal Form
- Every non-prime column must depend on the entire primary key (player_id and game_id combo) and not just PART of the primary key (ex. just game_id OR just player_id)
    - False because we have stuff for just game and not player- ex. did game go into overtime? Just depends on game and not the player
    - Can only depend on part of primary key if your primary key is more than one column
    - Deal with this in a few different ways 
        - create one single column that is player and game id together (replace lots of columns with one column)
            - Now we have one primary key and that is the game_player_id
            - Therefore game_id and player_id are both non-prime 

In [9]:
nba['game_player_id'] = nba['game_id'] + '_' + nba['player_id']

In [10]:
nba.head(3).T

Unnamed: 0,0,1,2
game_id,202204100BRK,202204100BRK,202204100BRK
game_date,2022-04-10,2022-04-10,2022-04-10
OT,0,0,0
H_A,A,A,A
Team_Abbrev,IND,IND,IND
Team_Score,126,126,126
Team_pace,103.9,103.9,103.9
Team_efg_pct,0.543,0.543,0.543
Team_tov_pct,5.9,5.9,5.9
Team_orb_pct,20.8,20.8,20.8


## Third Normal Form 
### Part 1
- Calculated columns:
    - Should remove them because we can recalculate them if we need them 
    - Ex. 
        - fg_pct
        - fg3_pct
        - ft_pct
        - trb

In [12]:
nba = nba.drop(['fg_pct', 'fg3_pct', 'ft_pct', 'trb'], axis=1)

### Part 2
- No transitive dependencies. List of them:
    - Some columns depend on player
        - plyaer name
    - Some columns depend on game
        - game_date
    - Some columns depend on team 
        - Team_Abbrev
    - Some columns depend on team + game
        - Team_Score
    - Each of these will get a separate table
    - What will be left is the player + game table

In [13]:
nba.head(3).T
#example of transitive dependency- game date depends on game id - one game has one date

Unnamed: 0,0,1,2
game_id,202204100BRK,202204100BRK,202204100BRK
game_date,2022-04-10,2022-04-10,2022-04-10
OT,0,0,0
H_A,A,A,A
Team_Abbrev,IND,IND,IND
Team_Score,126,126,126
Team_pace,103.9,103.9,103.9
Team_efg_pct,0.543,0.543,0.543
Team_tov_pct,5.9,5.9,5.9
Team_orb_pct,20.8,20.8,20.8


In [20]:
nba_teamgame = nba[["Team_Abbrev", "game_id", "H_A", "Team_Score", "Team_pace", "Team_efg_pct", "Team_tov_pct", "Team_ft_rate", "Team_off_rtg","Opponent_Abbrev"]]
nba_teamgame

Unnamed: 0,Team_Abbrev,game_id,H_A,Team_Score,Team_pace,Team_efg_pct,Team_tov_pct,Team_ft_rate,Team_off_rtg,Opponent_Abbrev
0,IND,202204100BRK,A,126,103.9,0.543,5.9,0.125,121.3,BRK
1,IND,202204100BRK,A,126,103.9,0.543,5.9,0.125,121.3,BRK
2,IND,202204100BRK,A,126,103.9,0.543,5.9,0.125,121.3,BRK
3,IND,202204100BRK,A,126,103.9,0.543,5.9,0.125,121.3,BRK
4,IND,202204100BRK,A,126,103.9,0.543,5.9,0.125,121.3,BRK
...,...,...,...,...,...,...,...,...,...,...
31603,LAC,202201130NOP,A,89,97.0,0.444,14.3,0.210,91.7,NOP
31604,LAC,202201150SAS,A,94,92.7,0.440,11.9,0.060,101.4,SAS
31605,LAC,202112220SAC,A,105,95.5,0.555,11.9,0.171,110.0,SAC
31606,LAC,202112260LAC,H,100,99.8,0.512,13.0,0.140,100.2,DEN


In [22]:
#Have duplicated columns
nba_teamgame = nba_teamgame.drop_duplicates()

In [23]:
nba_teamgame

Unnamed: 0,Team_Abbrev,game_id,H_A,Team_Score,Team_pace,Team_efg_pct,Team_tov_pct,Team_ft_rate,Team_off_rtg,Opponent_Abbrev
0,IND,202204100BRK,A,126,103.9,0.543,5.9,0.125,121.3,BRK
12,BRK,202204100BRK,H,134,103.9,0.691,17.9,0.272,129.0,IND
25,WAS,202204100CHO,A,108,97.7,0.489,8.7,0.170,110.5,CHO
37,CHO,202204100CHO,H,124,97.7,0.640,15.2,0.112,126.9,WAS
52,MIL,202204100CLE,A,115,101.9,0.511,10.5,0.284,112.9,CLE
...,...,...,...,...,...,...,...,...,...,...
21491,MIN,202112190MIN,H,111,91.8,0.565,9.0,0.312,120.9,DAL
21492,MIN,202112210DAL,A,102,93.9,0.538,16.2,0.215,108.6,DAL
21629,MIN,202112280MIN,H,88,93.1,0.441,9.6,0.153,94.5,NYK
21708,MIN,202112230UTA,A,116,102.1,0.530,11.6,0.089,113.7,UTA


In [25]:
#columns that depend on game
nba_game = nba[['game_id', 'OT', 'game_date']].drop_duplicates()


In [26]:
nba_players = nba[['player_id', 'player']].drop_duplicates()

In [32]:
nba_playergame = nba.drop([ 'player','OT', 'game_date','Team_orb_pct', "H_A",'Team_Abbrev', "Team_Score", "Team_pace", "Team_efg_pct", "Team_tov_pct", "Team_ft_rate", "Team_off_rtg", "Opponent_Abbrev" ], axis = 1)

In [33]:
nba_playergame

Unnamed: 0,game_id,player_id,starter,mp,fg,fga,fg3,fg3a,ft,fta,...,tov,pf,pts,plus_minus,did_not_play,is_inactive,off_rtg,def_rtg,bpm,game_player_id
0,202204100BRK,halibty01,1,39:28,7,14,2,5,1,1,...,1,0,17,-9,0,0,137,132,1.7,202204100BRK_halibty01
1,202204100BRK,hieldbu01,1,35:53,8,23,5,14,0,0,...,2,3,21,0,0,0,94,128,-2.3,202204100BRK_hieldbu01
2,202204100BRK,brissos01,1,35:47,10,20,5,10,3,4,...,0,5,28,-9,0,0,137,133,4.4,202204100BRK_brissos01
3,202204100BRK,jacksis01,1,32:01,3,4,0,0,1,2,...,2,5,7,3,0,0,89,128,-9.2,202204100BRK_jacksis01
4,202204100BRK,mccontj01,1,30:52,5,15,3,7,1,2,...,0,3,14,7,0,0,104,126,-1.7,202204100BRK_mccontj01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31603,202201130NOP,gabriwe01,0,4:26,1,1,1,1,0,0,...,2,2,3,-4,0,0,62,110,-6.4,202201130NOP_gabriwe01
31604,202201150SAS,gabriwe01,0,0:00,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0.0,202201150SAS_gabriwe01
31605,202112220SAC,wrighmo01,0,1:28,0,0,0,0,0,0,...,0,0,0,1,0,0,217,103,24.4,202112220SAC_wrighmo01
31606,202112260LAC,wrighmo01,0,0:00,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0.0,202112260LAC_wrighmo01


In [34]:
nba_game

Unnamed: 0,game_id,OT,game_date
0,202204100BRK,0,2022-04-10
25,202204100CHO,0,2022-04-10
52,202204100CLE,0,2022-04-10
77,202204100DAL,0,2022-04-10
103,202204100DEN,1,2022-04-10
...,...,...,...
19708,202110300MIN,0,2021-10-30
19726,202112150DEN,0,2021-12-15
19748,202202010MIN,0,2022-02-01
20615,202203270BOS,0,2022-03-27


In [36]:
nba_players

Unnamed: 0,player_id,player
0,halibty01,Tyrese Haliburton
1,hieldbu01,Buddy Hield
2,brissos01,Oshae Brissett
3,jacksis01,Isaiah Jackson
4,mccontj01,T.J. McConnell
...,...,...
31515,garrema01,Marcus Garrett
31535,chalmma01,Mario Chalmers
31538,holmaar01,Aric Holman
31540,scrubja01,Jay Scrubb


In [37]:
nba_teamgame

Unnamed: 0,Team_Abbrev,game_id,H_A,Team_Score,Team_pace,Team_efg_pct,Team_tov_pct,Team_ft_rate,Team_off_rtg,Opponent_Abbrev
0,IND,202204100BRK,A,126,103.9,0.543,5.9,0.125,121.3,BRK
12,BRK,202204100BRK,H,134,103.9,0.691,17.9,0.272,129.0,IND
25,WAS,202204100CHO,A,108,97.7,0.489,8.7,0.170,110.5,CHO
37,CHO,202204100CHO,H,124,97.7,0.640,15.2,0.112,126.9,WAS
52,MIL,202204100CLE,A,115,101.9,0.511,10.5,0.284,112.9,CLE
...,...,...,...,...,...,...,...,...,...,...
21491,MIN,202112190MIN,H,111,91.8,0.565,9.0,0.312,120.9,DAL
21492,MIN,202112210DAL,A,102,93.9,0.538,16.2,0.215,108.6,DAL
21629,MIN,202112280MIN,H,88,93.1,0.441,9.6,0.153,94.5,NYK
21708,MIN,202112230UTA,A,116,102.1,0.530,11.6,0.089,113.7,UTA


In [38]:
nba_playergame

Unnamed: 0,game_id,player_id,starter,mp,fg,fga,fg3,fg3a,ft,fta,...,tov,pf,pts,plus_minus,did_not_play,is_inactive,off_rtg,def_rtg,bpm,game_player_id
0,202204100BRK,halibty01,1,39:28,7,14,2,5,1,1,...,1,0,17,-9,0,0,137,132,1.7,202204100BRK_halibty01
1,202204100BRK,hieldbu01,1,35:53,8,23,5,14,0,0,...,2,3,21,0,0,0,94,128,-2.3,202204100BRK_hieldbu01
2,202204100BRK,brissos01,1,35:47,10,20,5,10,3,4,...,0,5,28,-9,0,0,137,133,4.4,202204100BRK_brissos01
3,202204100BRK,jacksis01,1,32:01,3,4,0,0,1,2,...,2,5,7,3,0,0,89,128,-9.2,202204100BRK_jacksis01
4,202204100BRK,mccontj01,1,30:52,5,15,3,7,1,2,...,0,3,14,7,0,0,104,126,-1.7,202204100BRK_mccontj01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31603,202201130NOP,gabriwe01,0,4:26,1,1,1,1,0,0,...,2,2,3,-4,0,0,62,110,-6.4,202201130NOP_gabriwe01
31604,202201150SAS,gabriwe01,0,0:00,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0.0,202201150SAS_gabriwe01
31605,202112220SAC,wrighmo01,0,1:28,0,0,0,0,0,0,...,0,0,0,1,0,0,217,103,24.4,202112220SAC_wrighmo01
31606,202112260LAC,wrighmo01,0,0:00,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0.0,202112260LAC_wrighmo01


### Buidling Database

In [39]:
nba_db = sqlite3.connect("nba.db")
#creates empty DB file in your folder

In [41]:
nba_game.to_sql('games', nba_db, index=False, chunksize = 1000, if_exists='replace')
#chunksize is an operation python needs to do to give 1000 rows at a time, 
#the bigger the chunksize the faster the operation
nba_playergame.to_sql('playergame', nba_db, index=False, chunksize = 1000, if_exists='replace')
nba_teamgame.to_sql('teamgame', nba_db, index=False, chunksize = 1000, if_exists='replace')
nba_players.to_sql('players', nba_db, index=False, chunksize = 1000, if_exists='replace')

621

In [42]:
#621 is number of rows in the last file 

#### queries


In [44]:
myquery = '''
SELECT * 
FROM teamgame
WHERE Team_Abbrev='CLE'
'''
#give all columns from team game table where abbrev is cleveland

pd.read_sql(myquery, nba_db)

Unnamed: 0,Team_Abbrev,game_id,H_A,Team_Score,Team_pace,Team_efg_pct,Team_tov_pct,Team_ft_rate,Team_off_rtg,Opponent_Abbrev
0,CLE,202204100CLE,H,133,101.9,0.644,9.0,0.128,130.5,MIL
1,CLE,202204080BRK,A,107,89.6,0.537,8.0,0.232,119.4,BRK
2,CLE,202204050ORL,A,115,96.8,0.581,10.3,0.174,118.8,ORL
3,CLE,202204030CLE,H,108,95.0,0.545,8.9,0.295,113.7,PHI
4,CLE,202204020NYK,A,119,92.1,0.636,9.1,0.198,129.2,NYK
...,...,...,...,...,...,...,...,...,...,...
77,CLE,202112300WAS,A,93,94.2,0.470,11.4,0.181,98.7,WAS
78,CLE,202112310CLE,H,118,93.2,0.619,8.7,0.167,126.7,ATL
79,CLE,202201020CLE,H,108,92.8,0.500,7.4,0.227,116.4,IND
80,CLE,202201310CLE,H,93,88.5,0.482,9.1,0.143,105.0,NOP


## Build ER Diagram

In [51]:
nba_playergame.columns

Index(['game_id', 'player_id', 'starter', 'mp', 'fg', 'fga', 'fg3', 'fg3a',
       'ft', 'fta', 'orb', 'drb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts',
       'plus_minus', 'did_not_play', 'is_inactive', 'off_rtg', 'def_rtg',
       'bpm', 'game_player_id'],
      dtype='object')