<h1>Clean CSV Data and Add to SQL Database</h1>

<h4>Import Dependencies</h4>

In [1]:
import csv
import os
import sqlite3
from urllib.parse import quote

import pandas as pd
from sqlalchemy import create_engine, inspect

from config import password

<h4>Create paths to CSV files</h4>

In [2]:
# Every source CSV lives in the same directory, addressed relative to the
# current working directory.
_csvDir = os.path.join("..", "..", "data", "csv")

# One path per CSV file that is loaded further down.
pathTeams = os.path.join(_csvDir, "Teams.csv")
pathBatting = os.path.join(_csvDir, "Batting.csv")
pathPitching = os.path.join(_csvDir, "Pitching.csv")
pathPlayers = os.path.join(_csvDir, "People.csv")
pathFranchises = os.path.join(_csvDir, "TeamsFranchises.csv")
pathSalaries = os.path.join(_csvDir, "Salaries.csv")

<h4>Load CSV files into DataFrames and clean</h4>

In [3]:
# Columns of Teams.csv to keep; every other column is discarded.
teamsCols = [
    "yearID", "teamID", "franchID", "G", "W", "R", "H", "HR", "BB",
    "SO", "SB", "RA", "ERA", "HA", "HRA", "BBA", "SOA",
]
# Load the whole CSV, then select the wanted columns in the order listed above.
teamsDF = pd.read_csv(pathTeams).loc[:, teamsCols]
# Preview the first rows.
teamsDF.head()

Unnamed: 0,yearID,teamID,franchID,G,W,R,H,HR,BB,SO,SB,RA,ERA,HA,HRA,BBA,SOA
0,1871,BS1,BNA,31,20,401,426,3,60.0,19.0,73.0,303,3.55,367,2,42,23
1,1871,CH1,CNA,28,19,302,323,10,60.0,22.0,69.0,241,2.76,308,6,28,22
2,1871,CL1,CFC,29,10,249,328,7,26.0,25.0,18.0,341,4.11,346,13,53,34
3,1871,FW1,KEK,19,7,137,178,2,33.0,9.0,16.0,243,5.17,261,5,21,17
4,1871,NY2,NNA,33,16,302,403,1,33.0,15.0,46.0,313,3.72,373,7,42,22


In [4]:
# Columns of Batting.csv to keep; every other column is discarded.
battingCols = [
    "playerID", "yearID", "teamID", "stint",
    "G", "R", "H", "HR", "BB", "IBB", "SO", "SB",
]
# Load the whole CSV, then select the wanted columns in the order listed above.
battingDF = pd.read_csv(pathBatting).loc[:, battingCols]
# Preview the first rows.
battingDF.head()

Unnamed: 0,playerID,yearID,teamID,stint,G,R,H,HR,BB,IBB,SO,SB
0,abercda01,1871,TRO,1,1,0,0,0,0,,0.0,0.0
1,addybo01,1871,RC1,1,25,30,32,0,4,,0.0,8.0
2,allisar01,1871,CL1,1,29,28,40,0,2,,5.0,3.0
3,allisdo01,1871,WS3,1,27,28,44,2,0,,2.0,1.0
4,ansonca01,1871,RC1,1,25,29,39,0,2,,1.0,6.0


In [5]:
# Columns of Pitching.csv to keep; every other column is discarded.
pitchingCols = [
    "playerID", "yearID", "teamID", "stint",
    "G", "H", "HR", "BB", "SO", "ERA", "R",
]
# Load the whole CSV, then select the wanted columns in the order listed above.
pitchingDF = pd.read_csv(pathPitching).loc[:, pitchingCols]
# Preview the first rows.
pitchingDF.head()

Unnamed: 0,playerID,yearID,teamID,stint,G,H,HR,BB,SO,ERA,R
0,bechtge01,1871,PH1,1,3,43,0,11,1,7.96,42
1,brainas01,1871,WS3,1,30,361,4,37,13,4.5,292
2,fergubo01,1871,NY2,1,1,8,0,0,0,27.0,9
3,fishech01,1871,RC1,1,24,295,3,31,15,4.35,257
4,fleetfr01,1871,NY2,1,1,20,0,3,0,10.0,21


In [6]:
# Columns of People.csv to keep; every other column is discarded.
playersCols = [
    "playerID", "birthYear", "nameFirst", "nameLast", "debut", "finalGame",
]
# Load the whole CSV, then select the wanted columns in the order listed above.
playersDF = pd.read_csv(pathPlayers).loc[:, playersCols]
# Preview the first rows.
playersDF.head()

Unnamed: 0,playerID,birthYear,nameFirst,nameLast,debut,finalGame
0,aardsda01,1981.0,David,Aardsma,2004-04-06,2015-08-23
1,aaronha01,1934.0,Hank,Aaron,1954-04-13,1976-10-03
2,aaronto01,1939.0,Tommie,Aaron,1962-04-10,1971-09-26
3,aasedo01,1954.0,Don,Aase,1977-07-26,1990-10-03
4,abadan01,1972.0,Andy,Abad,2001-09-10,2006-04-13


In [7]:
# Columns of TeamsFranchises.csv to keep; every other column is discarded.
franchisesCols = ["franchID", "franchName"]
# Load the whole CSV, then select the wanted columns in the order listed above.
franchisesDF = pd.read_csv(pathFranchises).loc[:, franchisesCols]
# Preview the first rows.
franchisesDF.head()

Unnamed: 0,franchID,franchName
0,ALT,Altoona Mountain City
1,ANA,Los Angeles Angels of Anaheim
2,ARI,Arizona Diamondbacks
3,ATH,Philadelphia Athletics
4,ATL,Atlanta Braves


In [8]:
# Columns of Salaries.csv to keep; every other column is discarded.
salariesCols = ["yearID", "teamID", "playerID", "salary"]
# Load the whole CSV, then select the wanted columns in the order listed above.
salariesDF = pd.read_csv(pathSalaries).loc[:, salariesCols]
# Preview the first rows.
salariesDF.head()

Unnamed: 0,yearID,teamID,playerID,salary
0,1985,ATL,barkele01,870000
1,1985,ATL,bedrost01,550000
2,1985,ATL,benedbr01,545000
3,1985,ATL,campri01,633333
4,1985,ATL,ceronri01,625000


<h1>===========================================================</h1>

<h1>Set up SQL DataBase</h1>

Before running any of the cells below:
    <b>in pgAdmin, create a database named 'baseballDB'.</b>

<h4>Create a connection to SQL database</h4>

In [12]:
# Connection settings; the password is read from the local config module so it
# is not hard-coded in the notebook.
pg_user = 'postgres'
pg_password = password
db_name = 'baseballDB'

# Percent-encode the password before embedding it in the URL. Without this, a
# password containing characters such as '@', ':', '/' or '%' would be read as
# URL delimiters/escapes and the connection would use the wrong credentials.
encoded_password = quote(str(pg_password), safe="")

# Build the PostgreSQL URL (localhost, port 5432) and create the engine that
# the cells below use for every read and write.
connection_string = f"{pg_user}:{encoded_password}@localhost:5432/{db_name}"
engine = create_engine(f'postgresql://{connection_string}')

In [20]:
# List the tables currently present in the connected database.
# Engine.table_names() is deprecated since SQLAlchemy 1.4 and removed in 2.0;
# the inspector API returns the same list of table names.
inspect(engine).get_table_names()

['teams', 'batting', 'pitching', 'players', 'franchises', 'salaries']

<h4>Add dataframes to SQL database</h4>

In [14]:
# Insert the teams rows into the "teams" table without writing the DataFrame index.
# NOTE: 'append' adds to whatever is already in the table, so re-running this
# cell inserts the same rows again.
teamsDF.to_sql(
    "teams",
    engine,
    index=False,
    if_exists='append',
)

In [15]:
# Insert the batting rows into the "batting" table without writing the DataFrame index.
# NOTE: 'append' adds to whatever is already in the table, so re-running this
# cell inserts the same rows again.
battingDF.to_sql(
    "batting",
    engine,
    index=False,
    if_exists='append',
)

In [16]:
# Insert the pitching rows into the "pitching" table without writing the DataFrame index.
# NOTE: 'append' adds to whatever is already in the table, so re-running this
# cell inserts the same rows again.
pitchingDF.to_sql(
    "pitching",
    engine,
    index=False,
    if_exists='append',
)

In [17]:
# Insert the players rows into the "players" table without writing the DataFrame index.
# NOTE: 'append' adds to whatever is already in the table, so re-running this
# cell inserts the same rows again.
playersDF.to_sql(
    "players",
    engine,
    index=False,
    if_exists='append',
)

In [18]:
# Insert the franchises rows into the "franchises" table without writing the DataFrame index.
# NOTE: 'append' adds to whatever is already in the table, so re-running this
# cell inserts the same rows again.
franchisesDF.to_sql(
    "franchises",
    engine,
    index=False,
    if_exists='append',
)

In [19]:
# Insert the salaries rows into the "salaries" table without writing the DataFrame index.
# NOTE: 'append' adds to whatever is already in the table, so re-running this
# cell inserts the same rows again.
salariesDF.to_sql(
    "salaries",
    engine,
    index=False,
    if_exists='append',
)

In [21]:
# Confirm data has been loaded into the SQL table.
# The LIMIT makes the database return only the preview rows instead of
# transferring the whole table just to display its first five rows.
pd.read_sql_query('select * from franchises limit 5', con=engine)

Unnamed: 0,franchID,franchName
0,ALT,Altoona Mountain City
1,ANA,Los Angeles Angels of Anaheim
2,ARI,Arizona Diamondbacks
3,ATH,Philadelphia Athletics
4,ATL,Atlanta Braves


In [22]:
# Confirm data has been loaded into the SQL table.
# The LIMIT makes the database return only the preview rows instead of
# transferring the whole table just to display its first five rows.
pd.read_sql_query('select * from salaries limit 5', con=engine)

Unnamed: 0,yearID,teamID,playerID,salary
0,1985,ATL,barkele01,870000
1,1985,ATL,bedrost01,550000
2,1985,ATL,benedbr01,545000
3,1985,ATL,campri01,633333
4,1985,ATL,ceronri01,625000


In [23]:
# Confirm data has been loaded into the SQL table.
# The LIMIT makes the database return only the preview rows instead of
# transferring the whole table just to display its first five rows.
pd.read_sql_query('select * from players limit 5', con=engine)

Unnamed: 0,playerID,birthYear,nameFirst,nameLast,debut,finalGame
0,aardsda01,1981.0,David,Aardsma,2004-04-06,2015-08-23
1,aaronha01,1934.0,Hank,Aaron,1954-04-13,1976-10-03
2,aaronto01,1939.0,Tommie,Aaron,1962-04-10,1971-09-26
3,aasedo01,1954.0,Don,Aase,1977-07-26,1990-10-03
4,abadan01,1972.0,Andy,Abad,2001-09-10,2006-04-13


In [24]:
# Confirm data has been loaded into the SQL table.
# The LIMIT makes the database return only the preview rows instead of
# transferring the whole table just to display its first five rows.
pd.read_sql_query('select * from pitching limit 5', con=engine)

Unnamed: 0,playerID,yearID,teamID,stint,G,H,HR,BB,SO,ERA,R
0,bechtge01,1871,PH1,1,3,43,0,11,1,7.96,42
1,brainas01,1871,WS3,1,30,361,4,37,13,4.5,292
2,fergubo01,1871,NY2,1,1,8,0,0,0,27.0,9
3,fishech01,1871,RC1,1,24,295,3,31,15,4.35,257
4,fleetfr01,1871,NY2,1,1,20,0,3,0,10.0,21


In [25]:
# Confirm data has been loaded into the SQL table.
# The LIMIT makes the database return only the preview rows instead of
# transferring the whole table just to display its first five rows.
pd.read_sql_query('select * from batting limit 5', con=engine)

Unnamed: 0,playerID,yearID,teamID,stint,G,R,H,HR,BB,IBB,SO,SB
0,abercda01,1871,TRO,1,1,0,0,0,0,,0.0,0.0
1,addybo01,1871,RC1,1,25,30,32,0,4,,0.0,8.0
2,allisar01,1871,CL1,1,29,28,40,0,2,,5.0,3.0
3,allisdo01,1871,WS3,1,27,28,44,2,0,,2.0,1.0
4,ansonca01,1871,RC1,1,25,29,39,0,2,,1.0,6.0


In [27]:
# Confirm data has been loaded into the SQL table.
# The LIMIT makes the database return only the preview rows instead of
# transferring the whole table just to display its first five rows.
pd.read_sql_query('select * from teams limit 5', con=engine)

Unnamed: 0,yearID,teamID,franchID,G,W,R,H,HR,BB,SO,SB,RA,ERA,HA,HRA,BBA,SOA
0,1871,BS1,BNA,31,20,401,426,3,60.0,19.0,73.0,303,3.55,367,2,42,23
1,1871,CH1,CNA,28,19,302,323,10,60.0,22.0,69.0,241,2.76,308,6,28,22
2,1871,CL1,CFC,29,10,249,328,7,26.0,25.0,18.0,341,4.11,346,13,53,34
3,1871,FW1,KEK,19,7,137,178,2,33.0,9.0,16.0,243,5.17,261,5,21,17
4,1871,NY2,NNA,33,16,302,403,1,33.0,15.0,46.0,313,3.72,373,7,42,22
