In [137]:
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz

link = '/Users/anirudhkrishna/GitHub/FormulaData/csv-data/cleaned_race_data.csv'

In [138]:
f1_data = pd.read_csv(link)

In [139]:
unique_locations = f1_data['location'].unique()
print(unique_locations)
f1_data['location'].replace('70th Anniversary', 'Great Britain', inplace=True)
f1_data['location'].replace('Emilia Romagna', 'San Marino', inplace=True)
f1_data['location'].replace('Styria', 'Austria', inplace=True)

['Great Britain' 'Monaco' 'Indianapolis 500' 'Switzerland' 'Belgium'
 'France' 'Italy' 'Germany' 'Spain' 'Netherlands' 'Argentina' 'Pescara'
 'Portugal' 'Morocco' 'United States' 'South Africa' 'Mexico' 'Austria'
 'Canada' 'Brazil' 'Sweden' 'USA West' 'USA East' 'Japan' 'San Marino'
 'Las Vegas' 'Detroit' 'Europe' 'Dallas' 'Australia' 'Hungary' 'Pacific'
 'Luxembourg' 'Malaysia' 'Bahrain' 'China' 'Turkey' 'Singapore'
 'Abu Dhabi' 'South Korea' 'India' 'Russia' 'Azerbaijan' 'Styria'
 '70th Anniversary' 'Tuscany' 'Eifel' 'Emilia Romagna' 'Sakhir' 'Qatar'
 'Saudi Arabia' 'Miami']


In [140]:
one_hot_encoded = pd.get_dummies(f1_data['location'])
formatted_columns = ['location_' + column.lower().replace(' ', '_') for column in one_hot_encoded.columns]
one_hot_encoded.columns = formatted_columns
f1_data = pd.concat([f1_data, one_hot_encoded], axis=1)
f1_data = f1_data.drop('location', axis=1)

In [141]:
f1_data['weather'] = f1_data['weather'].fillna(f1_data['weather'].mode()[0])
one_hot_encoded = pd.get_dummies(f1_data['weather'])
formatted_columns = ['weather_' + column.lower().replace(' ', '_') for column in one_hot_encoded.columns]
one_hot_encoded.columns = formatted_columns
f1_data = pd.concat([f1_data, one_hot_encoded], axis=1)

In [142]:
replacement_map = {'dry': 0, 'cloudy': 1, 'wet': 2}
f1_data['weather'] = f1_data['weather'].replace(replacement_map)

In [143]:
print(f1_data['race_finishing_position'].unique())
f1_data['race_finishing_position'].replace('NC', '20', inplace=True)
f1_data['race_finishing_position'].replace('DQ', '20', inplace=True)
f1_data['race_finishing_position'].replace('EX', '20', inplace=True)
f1_data['race_finishing_position'] = f1_data['race_finishing_position'].astype(int)

['1' '2' '3' '4' '5' '6' '7' '8' '9' '10' '11' 'NC' '31' '12' '13' '14'
 '15' '16' '17' '18' '19' '20' '21' '22' '23' '24' '25' '28' '26' '27'
 '29' '30' '32' '33' 'DQ' 'EX']


In [144]:
f1_data['net_positions_gained'] = f1_data['net_positions_gained'].fillna(0)
f1_data['net_positions_gained'] = f1_data['net_positions_gained'].astype(int)

In [145]:
f1_data['grid_position'].replace('NC', '20', inplace=True)
f1_data['grid_position'].replace('DQ', '20', inplace=True)
f1_data['grid_position'].replace('EX', '20', inplace=True)
f1_data['grid_position'] = f1_data['grid_position'].fillna(11)
f1_data['grid_position'] = f1_data['grid_position'].astype(int)

In [146]:
f1_data['fp1_position'].replace('NC', '20', inplace=True)
f1_data['fp1_position'].replace('DQ', '20', inplace=True)
f1_data['fp1_position'].replace('EX', '20', inplace=True)
f1_data['fp1_position'] = f1_data['fp1_position'].fillna(11)
f1_data['fp1_position'] = f1_data['fp1_position'].astype(int)

In [147]:
f1_data['fp2_position'].replace('NC', '20', inplace=True)
f1_data['fp2_position'].replace('DQ', '20', inplace=True)
f1_data['fp2_position'].replace('EX', '20', inplace=True)
f1_data['fp2_position'] = f1_data['fp2_position'].fillna(11)
f1_data['fp2_position'] = f1_data['fp2_position'].astype(int)

In [148]:
f1_data['fp3_position'].replace('NC', '20', inplace=True)
f1_data['fp3_position'].replace('DQ', '20', inplace=True)
f1_data['fp3_position'].replace('EX', '20', inplace=True)
f1_data['fp3_position'] = f1_data['fp3_position'].fillna(11)
f1_data['fp3_position'] = f1_data['fp3_position'].astype(int)

In [149]:
f1_data['fastest_lap_position'].replace('NC', '20', inplace=True)
f1_data['fastest_lap_position'].replace('DQ', '20', inplace=True)
f1_data['fastest_lap_position'].replace('EX', '20', inplace=True)
f1_data['fastest_lap_position'] = f1_data['fastest_lap_position'].fillna(11)
f1_data['fastest_lap_position'] = f1_data['fastest_lap_position'].astype(int)

In [150]:
def parse_circuit_length(length):
    try:
      return float(length[0:4])
    except:
       return float(5)

f1_data['circuit_length'] = f1_data['circuit_length'].apply(parse_circuit_length)

In [151]:
f1_data['latitude'] = f1_data['latitude'] .fillna(f1_data['latitude'] .mean())
f1_data['longitude'] = f1_data['longitude'] .fillna(f1_data['longitude'] .mean())

In [152]:
f1_data = f1_data.drop(['net_positions_gained','date', 'car_number', 'race_time', 'qualifying_time', 'fastest_lap_number', 'fastest_lap_time', 'circuit_full_name', 'wikipedia_link'], axis = 1)

In [153]:
print(f1_data.dtypes[0:20])

season                       int64
round                        int64
weather                      int64
race_finishing_position      int64
driver_name                 object
constructor_name            object
has_fastest_lap              int64
race_laps_completed          int64
points                       int64
grid_position                int64
fp1_position                 int64
fp2_position                 int64
fp3_position                 int64
fastest_lap_position         int64
circuit_length             float64
latitude                   float64
longitude                  float64
location_abu_dhabi            bool
location_argentina            bool
location_australia            bool
dtype: object


In [154]:
# print(len(f1_data['constructor_name'].unique()))
f1_data = f1_data.iloc[8578:]
print(f1_data.shape)

(15824, 69)


In [155]:
team_names = f1_data['constructor_name'].unique().tolist()

# Create an empty dictionary to store the mappings
team_name_mapping = {}

# Iterate over each team name
for i, name in enumerate(team_names):
    # Check if the team name is not already mapped
    if name not in team_name_mapping.values():
        # Iterate over the remaining team names
        for j in range(i+1, len(team_names)):
            # Calculate the similarity score between the current and remaining team names
            similarity_score = fuzz.ratio(name, team_names[j])
            # If the similarity score is above a certain threshold, consider them as the same team
            if similarity_score > 70:
                # Map the remaining team name to the original team name
                team_name_mapping[team_names[j]] = name

# Print the team name mappings
# for k, v in team_name_mapping.items():
#     print(f"{k} --> {v}")

print(team_name_mapping)


{'March Ford': 'Minardi Ford', 'McLaren Honda': 'McLaren Ford', 'McLaren Mercedes': 'Sauber Mercedes', 'Williams Honda': 'Williams Ford', 'Williams Judd': 'Williams Honda', 'Williams BMW': 'Williams Judd', 'Williams Cosworth': 'Williams Ford', 'Williams Toyota': 'Williams Cosworth', 'Williams Mercedes': 'Williams Supertec', 'Lola Ferrari': 'Dallara Ferrari', 'RBR Ferrari': 'Ferrari', 'STR Ferrari': 'RBR Ferrari', 'Haas Ferrari': 'Marussia Ferrari', 'Larrousse Ford': 'Lotus Ford', 'Arrows': 'Arrows BMW', 'RBR Renault': 'Red Bull Renault', 'STR Renault': 'RBR Renault', 'Lola Hart': 'Toleman Hart', 'Tyrrell Honda': 'Tyrrell Renault', 'Tyrrell Ilmor': 'Tyrrell Ford', 'Lola Ford': 'Lotus Ford', 'Lotus Honda': 'Lotus Ford', 'AGS Ford': 'RAM Ford', 'Rial Ford': 'RAM Ford', 'Ligier Judd': 'Ligier Ford', 'Osella Alfa Romeo': 'Alfa Romeo', 'Alfa Romeo Ferrari': 'Alfa Romeo Racing Ferrari', 'Osella': 'Osella Ford', 'Spirit Hart': 'Spirit Honda', 'Williams Renault': 'Williams Honda', 'Alpine Renau

In [156]:
team_name_mapping_improved = {'Alfa Romeo Racing Ferrari':'Alfa Romeo','March Ford': 'Ford', 'McLaren Honda': 'McLaren', 'McLaren Mercedes': 'McLaren', 'Williams Honda': 'Williams', 'Williams Judd': 'Williams', 'Williams BMW': 'Williams', 'Williams Cosworth': 'Williams', 'Williams Toyota': 'Williams', 'Williams': 'Williams', 'Lola Ferrari': 'Ferrari', 'RBR Ferrari': 'Red Bull Racing', 'STR Ferrari': 'Toro Rosso', 'Haas Ferrari': 'Haas', 'Larrousse Ford': 'Ford', 'Arrows': 'Arrows', 'RBR Renault': 'Red Bull Racing', 'STR Renault': 'Toro Rosso', 'Lola Hart': 'Hart', 'Tyrrell Honda': 'Tyrrell', 'Tyrrell Ilmor': 'Tyrrell', 'Lola Ford': 'Ford', 'Lotus Honda': 'Lotus', 'AGS Ford': 'Ford', 'Rial Ford': 'Ford', 'Ligier Judd': 'Ligier', 'Osella Alfa Romeo': 'Alfa Romeo', 'Alfa Romeo Ferrari': 'Alfa Romeo', 'Osella': 'Osella', 'Spirit Hart': 'Spirit', 'Williams Renault': 'Williams', 'Alpine Renault': 'Renault', 'Minardi Ferrari': 'Minardi', 'Minardi Hart': 'Minardi', 'Minardi Fondmetal': 'Minardi', 'Minardi Cosworth': 'Minardi', 'AGS Motori Moderni': 'Motori Moderni', 'Red Bull Renault': 'Red Bull Racing', 'Benetton Ford': 'Benetton', 'Lotus Mugen Honda': 'Lotus', 'March Ilmor': 'March', 'McLaren Renault': 'McLaren', 'Euro Brun Judd': 'Euro Brun', 'Lotus Lamborghini': 'Lamborghini', 'Lambo Lamborghini': 'Lamborghini', 'Ligier Lamborghini': 'Lamborghini', 'Minardi Lamborghini': 'Lamborghini', 'Larrousse Lamborghini': 'Lamborghini', 'Venturi Lamborghini': 'Lamborghini', 'Marussia Ferrari': 'Marussia',  'Jordan Honda': 'Jordan Honda', 'Prost Mugen Honda': 'Mugen Honda', 'Jordan Mugen Honda': 'Mugen Honda', 'Sauber Ford': 'Sauber', 'Sauber BMW': 'Sauber', 'Ligier Mugen Honda': 'Mugen Honda', 'Sauber Ferrari': 'Alfa Romeo', 'Honda': 'Honda', 'Jordan Toyota': 'Jordan', 'MF1 Toyota': 'Toyota', 'Red Bull Racing Renault': 'Red Bull Racing', 'Force India Mercedes': 'Aston Martin', 'Mercedes': 'Mercedes', 'Lotus Mercedes': 'Lotus', 'Red Bull Racing TAG Heuer': 'Red Bull Racing', 'Red Bull Racing Honda': 'Red Bull Racing', 'Red Bull Racing RBPT': 'Red Bull Racing', 'Toro Rosso': 'Toro Rosso', 'Red Bull Racing Honda RBPT': 'Red Bull Racing', 'AlphaTauri RBPT': 'Toro Rosso', 'AlphaTauri Honda RBPT': 'Toro Rosso', 'Aston Martin Aramco Mercedes': 'Aston Martin', 'Force India Ferrari' : 'Aston Martin'}

In [157]:
f1_data['constructor_name'] = f1_data['constructor_name'].replace(team_name_mapping_improved)

In [158]:
one_hot_encoded = pd.get_dummies(f1_data['constructor_name'])
formatted_columns = ['constructor_name_' + column.lower().replace(' ', '_') for column in one_hot_encoded.columns]
one_hot_encoded.columns = formatted_columns
f1_data = pd.concat([f1_data, one_hot_encoded], axis=1)
f1_data = f1_data.drop('constructor_name', axis=1)

In [159]:
one_hot_encoded = pd.get_dummies(f1_data['driver_name'])
formatted_columns = ['driver_name_' + column.lower().replace(' ', '_') for column in one_hot_encoded.columns]
one_hot_encoded.columns = formatted_columns
f1_data = pd.concat([f1_data, one_hot_encoded], axis=1)
f1_data = f1_data.drop('driver_name', axis=1)

In [160]:
boolean_columns = f1_data.select_dtypes(include=bool).columns
f1_data[boolean_columns] = f1_data[boolean_columns].astype(int)

In [161]:
f1_data.to_csv('/Users/anirudhkrishna/GitHub/FormulaData/data-modelling/f1_data.csv', index=False)