# Scraping Data from https://en.wikipedia.org/wiki

In [54]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from selenium import webdriver
import re
from datetime import datetime

# Directory holding the CSV files this notebook reads and writes. File names
# are appended by plain string concatenation, so the trailing slash is needed.
create_data_here = '/Users/anirudhkrishna/GitHub/FormulaData/csv-data/'

In [55]:
# Load the list of races; later cells read its 'season' and 'location' columns.
race_list = pd.read_csv(f'{create_data_here}race_list.csv')

In [56]:
# Maps a race_list 'location' value to the Wikipedia article-title suffix of
# that race. The link-building cell prefixes the suffix with '<season>_' and
# falls back to '<location>_Grand_Prix' for locations that are not listed here.
# Several keys deliberately share a suffix (e.g. 'Great Britain',
# 'United Kingdom' and '70th Anniversary' all map to 'British_Grand_Prix').
country_to_GP = {
    'Abu Dhabi': 'Abu_Dhabi_Grand_Prix',
    'Argentina': 'Argentine_Grand_Prix',
    'Australia': 'Australian_Grand_Prix',
    'Austria': 'Austrian_Grand_Prix',
    'Azerbaijan': 'Azerbaijan_Grand_Prix',
    'Bahrain': 'Bahrain_Grand_Prix',
    'Belgium': 'Belgian_Grand_Prix',
    'Brazil': 'Brazilian_Grand_Prix',
    'Canada': 'Canadian_Grand_Prix',
    'China': 'Chinese_Grand_Prix',
    'Emilia Romagna': 'Emilia_Romagna_Grand_Prix',
    'France': 'French_Grand_Prix',
    'Germany': 'German_Grand_Prix',
    'Great Britain': 'British_Grand_Prix',
    'Hungary': 'Hungarian_Grand_Prix',
    'Imola': 'Emilia_Romagna_Grand_Prix',
    'India': 'Indian_Grand_Prix',
    'Indianapolis 500' : 'Indianapolis_500',
    'Italy': 'Italian_Grand_Prix',
    'Japan': 'Japanese_Grand_Prix',
    'Korea': 'Korean_Grand_Prix',
    'Malaysia': 'Malaysian_Grand_Prix',
    'Mexico': 'Mexican_Grand_Prix',
    'Miami': 'Miami_Grand_Prix',
    'Monaco': 'Monaco_Grand_Prix',
    'Morocco': 'Moroccan_Grand_Prix',
    'Netherlands': 'Dutch_Grand_Prix',
    'Portugal': 'Portuguese_Grand_Prix',
    'Qatar': 'Qatar_Grand_Prix',
    'Russia': 'Russian_Grand_Prix',
    'San Marino': 'San_Marino_Grand_Prix',
    'Saudi Arabia': 'Saudi_Arabian_Grand_Prix',
    'Singapore': 'Singapore_Grand_Prix',
    'South Africa': 'South_African_Grand_Prix',
    'Styria': 'Styrian_Grand_Prix',
    'Spain': 'Spanish_Grand_Prix',
    'Sweden': 'Swedish_Grand_Prix',
    'Switzerland': 'Swiss_Grand_Prix',
    'Turkey': 'Turkish_Grand_Prix',
    'United Arab Emirates': 'Abu_Dhabi_Grand_Prix',
    'United Kingdom': 'British_Grand_Prix',
    'United States': 'United_States_Grand_Prix',
    '70th Anniversary': 'British_Grand_Prix'
}

In [57]:
# Keyword lists used by parse_weather_info to bucket a free-text weather
# description. A description is matched word by word against these lists.
wet = [
    'rain', 'wet', 'showers', 'damp', 'thunderstorms', 'rainy', 'moist',
    'dewy', 'drizzly', 'muggy', 'clammy', 'soggy', 'drenched',
]
cloudy = [
    'cloudy', 'overcast', 'gray', 'gloomy', 'dull', 'hazy', 'humid', 'misty',
    'foggy', 'smoky', 'nebulous', 'obscured', 'shadowy', 'cool',
]
dry = [
    'sunny', 'dry', 'clear', 'bright', 'radiant', 'sunlit', 'sunshine',
    'shining', 'warm', 'hot', 'blazing', 'scorching',
]


def parse_weather_info(weather_string):
    """Classify a free-text weather description as 'wet', 'cloudy' or 'dry'.

    The text is split on every non-ASCII-letter character and the resulting
    words are compared case-insensitively against the keyword lists. Wet
    keywords take precedence over cloudy ones; a description that matches
    neither (including one with no recognised keyword at all) is 'dry'.

    Args:
        weather_string: The description to classify; must be a ``str``
            (``re.split`` raises ``TypeError`` otherwise).

    Returns:
        One of the strings 'wet', 'cloudy' or 'dry'.
    """
    tokens = {token.lower() for token in re.split(r'[^a-zA-Z]', weather_string)}
    # Checked in priority order; 'dry' needs no scan because it is the default.
    for label, keywords in (('wet', wet), ('cloudy', cloudy)):
        if any(keyword in tokens for keyword in keywords):
            return label
    return 'dry'
        

def parse_date(date_string):
    """Normalise a race date to the form ``'<day> <month>, <year>'``.

    Accepts the day and month in either order (``'25 August 1991'`` or
    ``'August 25, 1991'``) with any punctuation or whitespace between the
    parts. Bracketed citation markers such as ``[1]`` or ``[a]`` are removed
    first, so a footnoted date is still recognised instead of being rejected
    for having an extra word.

    Args:
        date_string: The raw date text; must be a ``str`` (``re.sub`` raises
            ``TypeError`` otherwise).

    Returns:
        The normalised date string, or ``None`` when the text does not
        consist of exactly three alphanumeric words.
    """
    # Remove footnote markers before tokenising; otherwise "1991[1]" would
    # yield a fourth word and the whole date would be discarded.
    without_footnotes = re.sub(r'\[[^\[\]]*\]', ' ', date_string)
    date_words = re.sub(r'[^a-zA-Z0-9]', ' ', without_footnotes).split()

    if len(date_words) != 3:
        return None

    first, second, year = date_words
    # "Month day" order: put the numeric day first.
    if second.isdigit() and not first.isdigit():
        first, second = second, first
    return f'{first} {second}, {year}'


# def parse_date(date_string):
#     try:
#         date = datetime.strptime(date_string, '%B %d, %Y')
#         return date.strftime('%B %d, %Y')
#     except ValueError:
#         return None
    

In [58]:
# Build the Wikipedia article URL of every race:
#   https://en.wikipedia.org/wiki/<season>_<article-title suffix>
wiki_links = []

for season, location in zip(race_list['season'], race_list['location']):
    GP_string = country_to_GP.get(location)
    if GP_string is None:
        # No hand-written mapping: fall back to "<location>_Grand_Prix".
        # Wikipedia article titles use underscores in place of spaces, so a
        # multi-word location must not leave a raw space in the URL.
        GP_string = location.replace(' ', '_') + '_Grand_Prix'

    wiki_links.append('https://en.wikipedia.org/wiki/' + str(season) + '_' + GP_string)

race_list['wiki_link'] = wiki_links
    

In [59]:
# Sanity check: (number of races, number of columns). The lists scraped in the
# following cell are assigned as columns, so each must end up with one entry
# per row.
print(race_list.shape)

(1089, 4)


In [60]:
# One entry per link in wiki_links, in the same order; None marks a value that
# could not be scraped.
weather_info = []
dates = []
course_lengths = []
circuit_names = []


def _infobox_value(table, labels, label):
    """Return the second-column cell of the row labelled *label*, else None."""
    try:
        return table.iloc[labels.index(label), 1]
    except (ValueError, IndexError):
        # ValueError: no such row label; IndexError: table has no second column.
        return None


def _scrape_race_infobox(link, max_tables=4):
    """Scrape weather, date, course length and circuit name from a race page.

    The page is downloaded and parsed once. The first of its leading
    *max_tables* tables whose first column contains a 'Date' row is used.

    Args:
        link: URL of the race's Wikipedia article.
        max_tables: How many leading tables of the page to inspect.

    Returns:
        A tuple ``(weather, date, course_length, circuit_name)``. Every
        element is None when the page cannot be read or no suitable table is
        found, and an individual element is None when its row is missing, so
        the caller always receives exactly one value per field.
    """
    missing = (None, None, None, None)
    try:
        tables = pd.read_html(link)
    except Exception:
        # Best effort: an unreachable page or a page without tables is
        # recorded as missing data. Unlike a bare ``except:``, this still lets
        # KeyboardInterrupt stop a long scraping run.
        return missing

    for table in tables[:max_tables]:
        if table.shape[1] == 0:
            continue
        labels = table.iloc[:, 0].tolist()
        if 'Date' not in labels:
            continue

        weather = _infobox_value(table, labels, 'Weather')
        date = _infobox_value(table, labels, 'Date')
        return (
            # The parsers only accept text; an empty (NaN) cell stays None.
            parse_weather_info(weather) if isinstance(weather, str) else None,
            parse_date(date) if isinstance(date, str) else None,
            _infobox_value(table, labels, 'Course length'),
            _infobox_value(table, labels, 'Location'),
        )

    # None of the inspected tables had a 'Date' row: still report placeholders
    # so the result lists stay aligned with wiki_links.
    return missing


for link in wiki_links:
    weather, date, course_length, circuit_name = _scrape_race_infobox(link)
    weather_info.append(weather)
    dates.append(date)
    course_lengths.append(course_length)
    circuit_names.append(circuit_name)



In [105]:
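# Manual patch for row 510 (the 1991 Belgian Grand Prix), kept for reference.
# Each value is inserted into the list whose name matches it.
# circuit_names.insert(510, "Circuit de Spa-Francorchamps, Francorchamps, Wallonia, Belgium")
# course_lengths.insert(510, "6.940 km (4.312 miles)")
# dates.insert(510, "25 August, 1991")
# weather_info.insert(510, "dry")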

# print(circuit_names[510])
# print(len(weather_info))
# print(race_list.iloc[510])


Circuit de Spa-Francorchamps, Francorchamps, Wallonia, Belgium
1089
season                                                    1991
round                                                       11
location                                               Belgium
wiki_link    https://en.wikipedia.org/wiki/1991_Belgian_Gra...
Name: 510, dtype: object


In [106]:
# Attach the scraped values to race_list, one new column per scraped list.
# Each list must hold exactly one entry per row of race_list.
for column_name, scraped_values in (
    ('circuit_length', course_lengths),
    ('date', dates),
    ('weather', weather_info),
    ('circuit_full_name', circuit_names),
):
    race_list[column_name] = scraped_values

In [107]:
# Eyeball the enriched table before it is written to disk.
print(race_list)

      season  round          location  \
0       1950      1     Great Britain   
1       1950      2            Monaco   
2       1950      3  Indianapolis 500   
3       1950      4       Switzerland   
4       1950      5           Belgium   
...      ...    ...               ...   
1084    2023      6            Monaco   
1085    2023      7             Spain   
1086    2023      8            Canada   
1087    2023      9           Austria   
1088    2023     10     Great Britain   

                                              wiki_link  \
0     https://en.wikipedia.org/wiki/1950_British_Gra...   
1     https://en.wikipedia.org/wiki/1950_Monaco_Gran...   
2     https://en.wikipedia.org/wiki/1950_Indianapoli...   
3     https://en.wikipedia.org/wiki/1950_Swiss_Grand...   
4     https://en.wikipedia.org/wiki/1950_Belgian_Gra...   
...                                                 ...   
1084  https://en.wikipedia.org/wiki/2023_Monaco_Gran...   
1085  https://en.wikipedia.org/wiki

In [108]:
# Write the enriched race list next to the input CSV, without the index column.
race_list.to_csv(f'{create_data_here}final_race_list.csv', index=False)