# Test processing entities in the csv file

## Data preprocessing

In [1]:
import pandas as pd

In [43]:
df = pd.read_csv('../procesed_data/entities.csv', sep=';')
df = df.dropna()
df['POS'].replace('-', -1, inplace=True)
df['POS'] = df['POS'].astype(int)
df['LAPS'] = df['LAPS'].astype(int)
df['GRID'] = df['GRID'].astype(int)

In [44]:
df.head()

Unnamed: 0,YEAR,GP NAME,POS,DRIVER NAME,TEAM NAME,LAP TIME,GRID,LAPS,POINTS
0,1973,italian-grand-prix,1,#2 Ronnie Peterson,Team Lotus,1:29:17.0,1,55,9.0
1,1973,italian-grand-prix,2,#1 Emerson Fittipaldi,Team Lotus,+0.8,4,55,6.0
2,1973,italian-grand-prix,3,#8 Peter Revson,McLaren,+28.8,2,55,4.0
3,1973,italian-grand-prix,4,#5 Jackie Stewart,Tyrrell,+33.2,6,55,3.0
4,1973,italian-grand-prix,5,#6 François Cevert,Tyrrell,+46.2,11,55,2.0


In [45]:
df.to_csv('../procesed_data/df_entities.csv', sep=';', index=False)

In [46]:
df = pd.read_csv('../procesed_data/df_entities.csv', sep=';')

## Data queries
Ideas what to look for:
- in what races have to drivers met
    - find all rows with specified drivers and return those with matching year and gp name
- look for a specific grand prix and return driver with the most wins at that grand prix
- find a driver with the most ammount of __second__ places when starting from __first__
- find if and when were two pilots collegues

In [67]:
pilot1 = 'leclerc'
pilot2 = 'Lewis Hamilton'
year = 2019

def find_pairs(df, p1, p2):
    filtered_p1 = df[df['DRIVER NAME'].str.contains(p1, case=False)]
    filtered_p2 = df[df['DRIVER NAME'].str.contains(p2, case=False)]
    
    filtered_p1 = filtered_p1[filtered_p1['YEAR'] == year]
    filtered_p2 = filtered_p2[filtered_p2['YEAR'] == year]

    pairs = filtered_p1.merge(filtered_p2, on=['YEAR', 'GP NAME'])
    return pairs[['YEAR', 'GP NAME']].head()


In [77]:
print(f'{pilot1} and {pilot2} raced together in:')
print(find_pairs(df, pilot1, pilot2))

leclerc and Lewis Hamilton raced together in:
   YEAR                   GP NAME
0  2019        russian-grand-prix
1  2019  united-states-grand-prix
2  2019         monaco-grand-prix
3  2019      hungarian-grand-prix
4  2019         french-grand-prix
None


In [92]:
gp = 'Japan'

def find_most_wins(df, gp_name, group_by_tag='DRIVER NAME'):
    filtered = None
    try:
        filtered = df[(df['GP NAME'].str.contains(gp_name, case=False)) & (df['POS'] == 1)]
        filtered = filtered.groupby([group_by_tag]).size().reset_index(name='counts')
        filtered = filtered.sort_values(by=['counts'], ascending=False)
        return filtered.head()
    except KeyError:
        print('Wrong group_by_tag provided')
        return 'No drivers found'
    

In [93]:
print(find_most_wins(df, gp, 'GUGU'))

Wrong group_by_tag provided
No drivers found


In [103]:
pilot1 = 'Bottas'
pilot2 = 'Lewis Hamilton'

def find_collegues(df, p1, p2):
    filtered_p1 = df[df['DRIVER NAME'].str.contains(p1, case=False)]
    filtered_p2 = df[df['DRIVER NAME'].str.contains(p2, case=False)]

    pairs = filtered_p1.merge(filtered_p2, on=['YEAR', 'TEAM NAME'])
    return pairs.drop_duplicates(subset=['YEAR', 'TEAM NAME'])[['YEAR', 'TEAM NAME']].sort_values(by=['YEAR'])

In [104]:
print(find_collegues(df, pilot1, pilot2))

     YEAR TEAM NAME
0    2017  Mercedes
701  2018  Mercedes
289  2019  Mercedes
545  2020  Mercedes
897  2021  Mercedes
