In [1]:
import pandas as pd
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func, inspect,Integer

# Open the bundled SQLite database through SQLAlchemy.
engine = create_engine("sqlite:///./Resources/sqlite/Formula1_4tables.sqlite")

# The inspector exposes schema metadata: the table names printed here and,
# further down, the column descriptions of each table.
inspector = inspect(engine)
print(
    "Working with the following tables:  ",
    inspector.get_table_names(),
    "",  # trailing blank line, as a separator before the next output
    sep="\n",
)

# Create a function that will help with creating data frame for each table
def create_df(data, table_name):
    """Build a DataFrame from query rows, labelled with a table's column names.

    Parameters
    ----------
    data : iterable of row sequences
        The rows to load; handed to ``pd.DataFrame`` unchanged.
    table_name : str
        Table whose column names are looked up through the module-level
        ``inspector`` and used, in the order reported, as column labels.

    Returns
    -------
    pandas.DataFrame
        One row per element of ``data``.
    """
    # The inspector describes every column as a dict; only "name" is needed.
    labels = [column["name"] for column in inspector.get_columns(table_name)]
    return pd.DataFrame(data, columns=labels)
    

# Query the info for circuits, drivers, races and results.
# `Engine.execute()` with a raw SQL string was deprecated in SQLAlchemy 1.4 and
# removed in 2.0, so the statements are wrapped in `text()` and run on an
# explicit connection. Rows are fetched inside the `with` block because the
# connection is released as soon as the block exits.
with engine.connect() as conn:
    circuit_data = conn.execute(sqlalchemy.text("SELECT * FROM circuits")).fetchall()
    drivers_data = conn.execute(sqlalchemy.text("SELECT * FROM drivers")).fetchall()
    races_data = conn.execute(sqlalchemy.text("SELECT * FROM races")).fetchall()
    results_data = conn.execute(sqlalchemy.text("SELECT * FROM results")).fetchall()

# Turn the rows of each table into a DataFrame labelled with that table's columns
circuit_df = create_df(circuit_data, "circuits")
drivers_df = create_df(drivers_data, "drivers")
races_df = create_df(races_data, "races")
results_df = create_df(results_data, "results")
results_df.head()

Working with the following tables:  
['circuits', 'drivers', 'races', 'results']



Unnamed: 0,resultId,raceId,driverId,constructorId,number,grid,position,positionText,positionOrder,points,laps,time,milliseconds,fastestLap,rank,fastestLapTime,fastestLapSpeed,statusId
0,1,18,1,1,22,1,1,1,1,10.0,58,34:50.6,5690616,39,2,01:27.5,218.3,1
1,2,18,2,2,3,5,2,2,2,8.0,58,5.478,5696094,41,3,01:27.7,217.586,1
2,3,18,3,3,7,7,3,3,3,6.0,58,8.163,5698779,41,5,01:28.1,216.719,1
3,4,18,4,4,5,11,4,4,4,5.0,58,17.181,5707797,58,7,01:28.6,215.464,1
4,5,18,5,1,23,3,5,5,5,4.0,58,18.014,5708630,43,1,01:27.4,218.385,1


In [10]:
# Project goal: find the top 3 drivers of every season from 2014 to 2017.
# Only the columns needed for that goal are carried forward from each raw table.
CIRCUIT_COLUMNS = ["circuitId", "name", "country"]
DRIVER_COLUMNS = ["driverId", "forename", "surname", "nationality"]
RACE_COLUMNS = ["raceId", "year", "circuitId", "name"]
RESULT_COLUMNS = ["raceId", "driverId", "constructorId", "points", "positionOrder"]

# Trimmed circuits table
new_circ_df = circuit_df[CIRCUIT_COLUMNS]

# Trimmed drivers table
new_drivers_df = drivers_df[DRIVER_COLUMNS]

# Trimmed races table
new_race_df = races_df[RACE_COLUMNS]

# Trimmed results table
new_results_df = results_df[RESULT_COLUMNS]


In [11]:
# Merge the results and driver data (inner join on driverId).
# `validate="one_to_many"` makes pandas raise a MergeError if a driverId appears
# more than once in the drivers table, instead of silently duplicating results.
driver_result_df = pd.merge(
    new_drivers_df, new_results_df, on="driverId", validate="one_to_many"
)

# Merge the circuits and races data (inner join on circuitId).
# Both inputs carry a `name` column, so pandas suffixes them: `name_x` comes from
# the circuits frame and `name_y` from the races frame.
circ_race_df = pd.merge(
    new_circ_df, new_race_df, on="circuitId", validate="one_to_many"
)


In [12]:
# Preview the first five rows of the merged circuit/race table.
circ_race_df.head(5)

Unnamed: 0,circuitId,name_x,country,raceId,year,name_y
0,1,Albert Park Grand Prix Circuit,Australia,1,2009,Australian Grand Prix
1,1,Albert Park Grand Prix Circuit,Australia,18,2008,Australian Grand Prix
2,1,Albert Park Grand Prix Circuit,Australia,36,2007,Australian Grand Prix
3,1,Albert Park Grand Prix Circuit,Australia,55,2006,Australian Grand Prix
4,1,Albert Park Grand Prix Circuit,Australia,71,2005,Australian Grand Prix


In [13]:
# The merge produced two name columns: `name_x` is the circuit name and `name_y`
# is the Grand Prix (race) name. Only the circuit name is kept, under the plain
# label `name`.
# `errors="ignore"` (and rename's default of skipping missing labels) makes this
# cell safe to re-run: on a second run both steps are simply no-ops instead of
# raising KeyError for the already-dropped `name_y`.
circ_race_df = (
    circ_race_df
    .drop(columns=["name_y"], errors="ignore")
    .rename(columns={"name_x": "name"})
)
circ_race_df

Unnamed: 0,circuitId,name,country,raceId,year
0,1,Albert Park Grand Prix Circuit,Australia,1,2009
1,1,Albert Park Grand Prix Circuit,Australia,18,2008
2,1,Albert Park Grand Prix Circuit,Australia,36,2007
3,1,Albert Park Grand Prix Circuit,Australia,55,2006
4,1,Albert Park Grand Prix Circuit,Australia,71,2005
...,...,...,...,...,...
992,71,Sochi Autodrom,Russia,972,2017
993,71,Sochi Autodrom,Russia,1004,2018
994,73,Baku City Circuit,Azerbaijan,955,2016
995,73,Baku City Circuit,Azerbaijan,976,2017


In [46]:
# Merge all data together: every row is one driver's result in one race,
# enriched with the race's year and circuit.
formula1_df = pd.merge(driver_result_df, circ_race_df, on="raceId")

# We are only interested in the 2014-2017 seasons
formula1_2014 = formula1_df[formula1_df["year"] == 2014]
formula1_2015 = formula1_df[formula1_df["year"] == 2015]
formula1_2016 = formula1_df[formula1_df["year"] == 2016]
formula1_2017 = formula1_df[formula1_df["year"] == 2017]


def top_drivers(season_df, n=3):
    """Return the ``n`` highest-scoring drivers of one season.

    Points are summed per (year, driverId, constructorId, forename, surname,
    nationality) group, sorted in descending order, and the first ``n`` groups
    are returned as a one-column (``points``) DataFrame indexed by those keys.

    NOTE: because ``constructorId`` is part of the grouping key, a driver who
    raced for two constructors in the same season appears as two separate
    groups, each with only part of that driver's points.
    """
    keys = ["year", "driverId", "constructorId", "forename", "surname", "nationality"]
    return (
        season_df.groupby(keys)["points"]  # aggregate only the column we need
        .sum()
        .sort_values(ascending=False)
        .head(n)  # explicit positional "first n", same rows as [0:n]
        .to_frame()
    )


# Top 3 drivers of each season
points_place_2014 = top_drivers(formula1_2014)
points_place_2015 = top_drivers(formula1_2015)
points_place_2016 = top_drivers(formula1_2016)
points_place_2017 = top_drivers(formula1_2017)

# Put the final results in one table.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` stacks the frames
# in the same order and gives the same result.
f1_df = pd.concat(
    [points_place_2014, points_place_2015, points_place_2016, points_place_2017]
)
f1_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,points
year,driverId,constructorId,forename,surname,nationality,Unnamed: 6_level_1
2014,1,131,Lewis,Hamilton,British,384.0
2014,3,131,Nico,Rosberg,German,317.0
2014,817,9,Daniel,Ricciardo,Australian,238.0
2015,1,131,Lewis,Hamilton,British,381.0
2015,3,131,Nico,Rosberg,German,322.0
2015,20,6,Sebastian,Vettel,German,278.0
2016,3,131,Nico,Rosberg,German,385.0
2016,1,131,Lewis,Hamilton,British,380.0
2016,817,9,Daniel,Ricciardo,Australian,256.0
2017,1,131,Lewis,Hamilton,British,363.0


Import csv data

In [55]:
# Locations of the three CSV exports used for the constructor analysis
constr_1_csv = "./Resources/csv/constructor_results.csv"
constr_2_csv = "./Resources/csv/constructors.csv"
constr_3_csv = "./Resources/csv/races.csv"

# Load each file into its own DataFrame
constr_1_df, constr_2_df, constr_3_df = (
    pd.read_csv(csv_path)
    for csv_path in (constr_1_csv, constr_2_csv, constr_3_csv)
)

# Join constructor results to the constructors, then to the races
pre_contr_df = constr_1_df.merge(constr_2_df, on="constructorId")
final_contr_df = pre_contr_df.merge(constr_3_df, on="raceId")

# Keep only the needed columns. After the merges the constructor name lives in
# `name_x` (pandas adds the suffix on a column-name clash); give it a clear label.
final_c1_df = final_contr_df[
    ["constructorId", "points", "name_x", "nationality", "year"]
].rename(columns={"name_x": "constr_name"})

# The goal here is the top 3 constructors for each year from 2014 to 2017,
# so split the data into one frame per season.
season = final_c1_df["year"]
c1_2014 = final_c1_df[season == 2014]
c1_2015 = final_c1_df[season == 2015]
c1_2016 = final_c1_df[season == 2016]
c1_2017 = final_c1_df[season == 2017]
c1_2017

Unnamed: 0,constructorId,points,constr_name,nationality,year
9818,1,0.0,McLaren,British,2017
9819,3,8.0,Williams,British,2017
9820,4,0.0,Renault,French,2017
9821,5,6.0,Toro Rosso,Italian,2017
9822,6,37.0,Ferrari,Italian,2017
...,...,...,...,...,...
10013,9,10.0,Red Bull,Austrian,2017
10014,10,10.0,Force India,Indian,2017
10015,15,0.0,Sauber,Swiss,2017
10016,131,43.0,Mercedes,German,2017


In [59]:
def top_constructors(season_df, n=3):
    """Return the ``n`` highest-scoring constructors of one season.

    Points are summed per (year, constructorId, constr_name, nationality)
    group, sorted in descending order, and the first ``n`` groups are returned
    as a one-column (``points``) DataFrame indexed by those keys.
    """
    keys = ["year", "constructorId", "constr_name", "nationality"]
    return (
        season_df.groupby(keys)["points"]  # aggregate only the column we need
        .sum()
        .sort_values(ascending=False)
        .head(n)  # explicit positional "first n", same rows as [0:n]
        .to_frame()
    )


# Top 3 constructors of each season
pnts_place_2014 = top_constructors(c1_2014)
pnts_place_2015 = top_constructors(c1_2015)
pnts_place_2016 = top_constructors(c1_2016)
pnts_place_2017 = top_constructors(c1_2017)

# Put the final results in one table.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` stacks the frames
# in the same order and gives the same result.
c1_df = pd.concat(
    [pnts_place_2014, pnts_place_2015, pnts_place_2016, pnts_place_2017]
)
c1_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,points
year,constructorId,constr_name,nationality,Unnamed: 4_level_1
2014,131,Mercedes,German,701.0
2014,9,Red Bull,Austrian,405.0
2014,3,Williams,British,320.0
2015,131,Mercedes,German,703.0
2015,6,Ferrari,Italian,428.0
2015,3,Williams,British,257.0
2016,131,Mercedes,German,765.0
2016,9,Red Bull,Austrian,468.0
2016,6,Ferrari,Italian,398.0
2017,131,Mercedes,German,668.0
