# Scraping Data from https://en.wikipedia.org/wiki

In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from selenium import webdriver
import re
from datetime import datetime

In [2]:
race_list = pd.read_csv('race_list.csv')

In [3]:
country_to_GP = {
    'Abu Dhabi': 'Abu_Dhabi_Grand_Prix',
    'Argentina': 'Argentine_Grand_Prix',
    'Australia': 'Australian_Grand_Prix',
    'Austria': 'Austrian_Grand_Prix',
    'Azerbaijan': 'Azerbaijan_Grand_Prix',
    'Bahrain': 'Bahrain_Grand_Prix',
    'Belgium': 'Belgian_Grand_Prix',
    'Brazil': 'Brazilian_Grand_Prix',
    'Canada': 'Canadian_Grand_Prix',
    'China': 'Chinese_Grand_Prix',
    'Emilia Romagna': 'Imola_Grand_Prix',
    'France': 'French_Grand_Prix',
    'Germany': 'German_Grand_Prix',
    'Great Britain': 'British_Grand_Prix',
    'Hungary': 'Hungarian_Grand_Prix',
    'India': 'Indian_Grand_Prix',
    'Italy': 'Italian_Grand_Prix',
    'Japan': 'Japanese_Grand_Prix',
    'Korea': 'Korean_Grand_Prix',
    'Malaysia': 'Malaysian_Grand_Prix',
    'Mexico': 'Mexican_Grand_Prix',
    'Miami': 'Miami_Grand_Prix',
    'Monaco': 'Monaco_Grand_Prix',
    'Morocco': 'Moroccan_Grand_Prix',
    'Netherlands': 'Dutch_Grand_Prix',
    'Portugal': 'Portuguese_Grand_Prix',
    'Qatar': 'Qatar_Grand_Prix',
    'Russia': 'Russian_Grand_Prix',
    'San Marino': 'San_Marino_Grand_Prix',
    'Saudi Arabia': 'Saudi_Arabian_Grand_Prix',
    'Singapore': 'Singapore_Grand_Prix',
    'South Africa': 'South_African_Grand_Prix',
    'Spain': 'Spanish_Grand_Prix',
    'Sweden': 'Swedish_Grand_Prix',
    'Switzerland': 'Swiss_Grand_Prix',
    'Turkey': 'Turkish_Grand_Prix',
    'United Arab Emirates': 'Abu_Dhabi_Grand_Prix',
    'United Kingdom': 'British_Grand_Prix',
    'United States': 'United_States_Grand_Prix'
}

In [4]:
wet = ['rain', 'wet', 'showers', 'damp', 'thunderstorms', 'rainy', 'moist', 'dewy', 'drizzly', 'muggy', 'clammy', 'soggy', 'drenched']
cloudy = ['cloudy', 'overcast', 'gray', 'gloomy', 'dull', 'hazy', 'humid', 'misty', 'foggy', 'smoky', 'nebulous', 'obscured', 'shadowy', 'cool']
sunny = ['sunny', 'dry', 'clear', 'bright', 'radiant', 'sunlit', 'sunshine', 'shining', 'warm', 'hot', 'blazing', 'scorching']



def parse_weather_info(weather_string):
    weather_words = re.split(r'[^a-zA-Z]', weather_string)
    for i, word in enumerate(weather_words):
        weather_words[i] = word.lower()
    for word in wet:
        if word in weather_words:
            return 'wet'
    for word in cloudy:
        if word in weather_words:
            return 'cloudy'
    for word in sunny:
        if word in weather_words:
            return 'sunny'
    return 'sunny'
        

def parse_date(date_string):
    modified_string = re.sub(r'[^a-zA-Z0-9]', ' ', date_string)
    modified_string = re.sub(r'\s+', ' ', modified_string)
    date_words = modified_string.split()

    if len(date_words)!=3:
        return None

    if date_words[1].isdigit() and not date_words[0].isdigit():
        temp = date_words[0]
        date_words[0] = date_words[1]
        date_words[1] = temp
    date = ' '.join(date_words[:2])+', '+date_words[2]
    return date


# def parse_date(date_string):
#     try:
#         date = datetime.strptime(date_string, '%B %d, %Y')
#         return date.strftime('%B %d, %Y')
#     except ValueError:
#         return None
    

In [5]:
wiki_links = []

for index, row in race_list.iterrows():
    wiki_link = 'https://en.wikipedia.org/wiki/' + str(row['season']) + '_' + country_to_GP[row['circuit']]
    wiki_links.append(wiki_link)

race_list['wiki_link'] = wiki_links
    

In [7]:
print(race_list.shape)

(32, 4)


In [20]:
weather_info = []
dates = []
course_lengths = []

for link in wiki_links:
    try:
        for table_index in range(4):
            df = pd.read_html(link)[table_index]
            if 'Date' in df.iloc[:, 0].values:
                print(df.iloc[6,1])
                try:
                    row_index_weather = df.iloc[:, 0].values.tolist().index('Weather')
                    weather = parse_weather_info(df.iloc[row_index_weather, 1])
                    weather_info.append(weather)
                except:
                    weather_info.append(None)
                try:
                    row_index_dates = df.iloc[:, 0].values.tolist().index('Date')
                    date = parse_date(df.iloc[row_index_dates, 1])
                    dates.append(date)
                except:
                    dates.append(None)
                try:
                    row_index_course_lengths = df.iloc[:, 0].values.tolist().index('Course length')
                    course_lengths.append(df.iloc[row_index_course_lengths, 1])
                except:
                    course_lengths.append(None)
                break
    except:
        weather_info.append(None)
        dates.append(None)
        course_lengths.append(None)



Bahrain International Circuit Sakhir, Bahrain
Jeddah Corniche Circuit, Jeddah, Saudi Arabia
Albert Park Circuit, Melbourne, Australia
Monza Circuit Monza, Italy
Miami International Autodrome, Miami Gardens, Florida
Circuit de Barcelona-Catalunya Montmeló, Catalonia, Spain
Circuit de Monaco La Condamine and Monte Carlo, Monaco
Baku City Circuit Baku, Azerbaijan
Circuit Gilles Villeneuve Montreal, Quebec, Canada
Silverstone Circuit Silverstone, United Kingdom
Red Bull Ring Spielberg, Styria, Austria
Circuit Paul Ricard Le Castellet, Provence-Alpes-Côte d'Azur, France
Hungaroring Mogyoród, Hungary
Circuit de Spa-Francorchamps Stavelot, Belgium
Circuit Zandvoort Zandvoort, Netherlands
Monza Circuit Monza, Italy
Marina Bay Street Circuit Marina Bay, Singapore
Suzuka International Racing Course Suzuka, Mie Prefecture, Japan
Circuit of the Americas Austin, Texas, United States
Autódromo Hermanos Rodríguez Mexico City, Mexico
Autódromo José Carlos Pace São Paulo, Brazil
Yas Marina Circuit Abu 

In [16]:
race_list['circuit_length'] = course_lengths
race_list['date'] = dates
race_list['weather'] = weather_info

In [17]:
print(dates)

['20 March, 2022', '27 March, 2022', '10 April, 2022', '11 September, 2022', '8 May, 2022', '22 May, 2022', '29 May, 2022', '12 June, 2022', '19 June, 2022', '3 July, 2022', '10 July, 2022', '24 July, 2022', '31 July, 2022', '28 August, 2022', '4 September, 2022', '11 September, 2022', '2 October, 2022', '9 October, 2022', '23 October, 2022', '30 October, 2022', '13 November, 2022', '20 November, 2022', '5 March, 2023', '19 March, 2023', '2 April, 2023', '30 April, 2023', '7 May, 2023', '28 May, 2023', '4 June, 2023', '18 June, 2023', '2 July, 2023', '9 July, 2023']


In [18]:
race_list.to_csv('/Users/anirudhkrishna/GitHub/FormulaData/final_race_list.csv', index=False)