## Data Cleaning

### License:
This notebook uses the CSV files `goals.csv` and `tournament_standings.csv` from the World Cup database by Joshua C. Fjelstul, Ph.D. Attribution: the name of the author ("Joshua C. Fjelstul, Ph.D."), a notice that the database is copyrighted ("© 2022 Joshua C. Fjelstul, Ph.D."), a link to the CC-BY-SA 4.0 license (https://creativecommons.org/licenses/by-sa/4.0/legalcode), and a link to the source repository (https://www.github.com/jfjelstul/worldcup).


In [1]:
import os
from getpass import getpass
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy import inspect
from sqlalchemy import text

### Store CSV into DataFrame

In [2]:
# Read the raw goals CSV (path is relative to the notebook's working directory)
# into a DataFrame and preview its first five rows.
goals = "csv/goals.csv"
goals_df = pd.read_csv(filepath_or_buffer=goals)
goals_df.head(n=5)

Unnamed: 0,key_id,goal_id,tournament_id,tournament_name,match_id,match_name,match_date,stage_name,group_name,team_id,...,shirt_number,player_team_id,player_team_name,player_team_code,minute_label,minute_regulation,minute_stoppage,match_period,own_goal,penalty
0,1,G-0001,WC-1930,1930 FIFA World Cup,M-1930-01,France v Mexico,1930-07-13,group stage,Group 1,T-28,...,0,T-28,France,FRA,19',19,0,first half,0,0
1,2,G-0002,WC-1930,1930 FIFA World Cup,M-1930-01,France v Mexico,1930-07-13,group stage,Group 1,T-28,...,0,T-28,France,FRA,40',40,0,first half,0,0
2,3,G-0003,WC-1930,1930 FIFA World Cup,M-1930-01,France v Mexico,1930-07-13,group stage,Group 1,T-28,...,0,T-28,France,FRA,43',43,0,first half,0,0
3,4,G-0004,WC-1930,1930 FIFA World Cup,M-1930-01,France v Mexico,1930-07-13,group stage,Group 1,T-28,...,0,T-28,France,FRA,87',87,0,second half,0,0
4,5,G-0005,WC-1930,1930 FIFA World Cup,M-1930-01,France v Mexico,1930-07-13,group stage,Group 1,T-44,...,0,T-44,Mexico,MEX,70',70,0,second half,0,0


In [3]:
# Read the raw tournament standings CSV (path is relative to the notebook's
# working directory) into a DataFrame and preview its first five rows.
tournament_standings = "csv/tournament_standings.csv"
tournament_standings_df = pd.read_csv(filepath_or_buffer=tournament_standings)
tournament_standings_df.head(n=5)

Unnamed: 0,key_id,tournament_id,tournament_name,position,team_id,team_name,team_code
0,1,WC-1930,1930 FIFA World Cup,1,T-80,Uruguay,URY
1,2,WC-1930,1930 FIFA World Cup,2,T-03,Argentina,ARG
2,3,WC-1930,1930 FIFA World Cup,3,T-79,United States,USA
3,4,WC-1930,1930 FIFA World Cup,4,T-83,Yugoslavia,YUG
4,5,WC-1934,1934 FIFA World Cup,1,T-39,Italy,ITA


### Create a new DataFrame with selected columns

In [4]:
# Build a slimmer copy of goals_df without the columns listed below.
# `drop` returns a new DataFrame, so goals_df itself is left unchanged.
columns_to_drop = [
    'match_id',
    'stage_name',
    'group_name',
    'shirt_number',
    'player_team_id',
    'player_team_code',
    'minute_label',
    'minute_stoppage',
    'own_goal',
    'penalty',
]
new_goals_df = goals_df.drop(columns=columns_to_drop)
new_goals_df.head()

Unnamed: 0,key_id,goal_id,tournament_id,tournament_name,match_name,match_date,team_id,team_name,team_code,home_team,away_team,player_id,family_name,given_name,player_team_name,minute_regulation,match_period
0,1,G-0001,WC-1930,1930 FIFA World Cup,France v Mexico,1930-07-13,T-28,France,FRA,1,0,P-09831,Laurent,Lucien,France,19,first half
1,2,G-0002,WC-1930,1930 FIFA World Cup,France v Mexico,1930-07-13,T-28,France,FRA,1,0,P-05670,Langiller,Marcel,France,40,first half
2,3,G-0003,WC-1930,1930 FIFA World Cup,France v Mexico,1930-07-13,T-28,France,FRA,1,0,P-07295,Maschinot,André,France,43,first half
3,4,G-0004,WC-1930,1930 FIFA World Cup,France v Mexico,1930-07-13,T-28,France,FRA,1,0,P-07295,Maschinot,André,France,87,second half
4,5,G-0005,WC-1930,1930 FIFA World Cup,France v Mexico,1930-07-13,T-44,Mexico,MEX,0,1,P-03952,Carreño,Juan,Mexico,70,second half


### Store new DataFrame into CSV file

In [5]:
# Save the trimmed goals DataFrame as a CSV file; index=False keeps the
# DataFrame index out of the file.
new_goals_path = "csv/new_goals.csv"
new_goals_df.to_csv(new_goals_path, index=False)

### Connect to local database

In [6]:
# Connection settings are read from the PG* environment variables so that no
# credentials are stored in the notebook. The previous non-secret values are
# kept as defaults, so an unconfigured local setup only needs the password.
protocol = 'postgresql'
username = os.environ.get('PGUSER', 'postgres')
# Prompt only when PGPASSWORD is not set, so automated runs stay non-interactive.
password = os.environ.get('PGPASSWORD') or getpass('PostgreSQL password: ')
host = os.environ.get('PGHOST', 'localhost')
port = int(os.environ.get('PGPORT', '5432'))
database_name = os.environ.get('PGDATABASE', 'project_3')
# URL-escape the credentials: characters such as '@', ':' or '/' in a user name
# or password would otherwise corrupt the connection URL.
# NOTE: this string contains the password; do not display or log it.
rds_connection_string = (
    f'{protocol}://{quote_plus(username)}:{quote_plus(password)}'
    f'@{host}:{port}/{database_name}'
)
engine = create_engine(rds_connection_string)
# Inspector used by the following cell to list the tables in the database.
insp = inspect(engine)

### Check for tables

In [7]:
# List the tables currently present in the connected database.
existing_tables = insp.get_table_names()
existing_tables

['goals', 'tournament_standings']

### Use pandas to load the CSV-derived DataFrames into the database

In [8]:
# Load the trimmed goals data only when the `goals` table is missing or empty.
# `if_exists='append'` inserts the rows on every call, so without this guard a
# re-run of the cell (e.g. Restart & Run All) would insert every row again.
# A fresh inspector is used so the table list reflects the current database state.
goals_already_loaded = (
    'goals' in inspect(engine).get_table_names()
    and pd.read_sql_query('select count(*) as n from goals', con=engine).iloc[0, 0] > 0
)
if not goals_already_loaded:
    new_goals_df.to_sql(name='goals', con=engine, if_exists='append', index=False)

In [9]:
# Load the standings only when the `tournament_standings` table is missing or
# empty. `if_exists='append'` inserts the rows on every call, so without this
# guard a re-run of the cell (e.g. Restart & Run All) would insert every row again.
# A fresh inspector is used so the table list reflects the current database state.
tournament_standings_already_loaded = (
    'tournament_standings' in inspect(engine).get_table_names()
    and pd.read_sql_query(
        'select count(*) as n from tournament_standings', con=engine
    ).iloc[0, 0] > 0
)
if not tournament_standings_already_loaded:
    tournament_standings_df.to_sql(
        name='tournament_standings', con=engine, if_exists='append', index=False
    )

### Confirm the data has been added by querying each table

In [10]:
# Preview the first five goals straight from the database. The LIMIT keeps the
# query from pulling the whole table into memory just to show five rows, and
# ORDER BY key_id makes the preview deterministic.
pd.read_sql_query('select * from goals order by key_id limit 5', con=engine)

Unnamed: 0,key_id,goal_id,tournament_id,tournament_name,match_name,match_date,team_id,team_name,team_code,home_team,away_team,player_id,family_name,given_name,player_team_name,minute_regulation,match_period
0,1,G-0001,WC-1930,1930 FIFA World Cup,France v Mexico,1930-07-13,T-28,France,FRA,1,0,P-09831,Laurent,Lucien,France,19,first half
1,2,G-0002,WC-1930,1930 FIFA World Cup,France v Mexico,1930-07-13,T-28,France,FRA,1,0,P-05670,Langiller,Marcel,France,40,first half
2,3,G-0003,WC-1930,1930 FIFA World Cup,France v Mexico,1930-07-13,T-28,France,FRA,1,0,P-07295,Maschinot,André,France,43,first half
3,4,G-0004,WC-1930,1930 FIFA World Cup,France v Mexico,1930-07-13,T-28,France,FRA,1,0,P-07295,Maschinot,André,France,87,second half
4,5,G-0005,WC-1930,1930 FIFA World Cup,France v Mexico,1930-07-13,T-44,Mexico,MEX,0,1,P-03952,Carreño,Juan,Mexico,70,second half


In [11]:
# Preview the first five standings rows straight from the database. The LIMIT
# keeps the query from pulling the whole table into memory just to show five
# rows, and ORDER BY key_id makes the preview deterministic.
pd.read_sql_query('select * from tournament_standings order by key_id limit 5', con=engine)

Unnamed: 0,key_id,tournament_id,tournament_name,position,team_id,team_name,team_code
0,1,WC-1930,1930 FIFA World Cup,1,T-80,Uruguay,URY
1,2,WC-1930,1930 FIFA World Cup,2,T-03,Argentina,ARG
2,3,WC-1930,1930 FIFA World Cup,3,T-79,United States,USA
3,4,WC-1930,1930 FIFA World Cup,4,T-83,Yugoslavia,YUG
4,5,WC-1934,1934 FIFA World Cup,1,T-39,Italy,ITA
