# Scraping Data from https://en.wikipedia.org/wiki

In [18]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from selenium import webdriver
import re
from datetime import datetime

In [19]:
# Load the list of races to enrich; later cells read its 'season' and
# 'location' columns to build one Wikipedia URL per row.
race_list = pd.read_csv('race_list.csv')

In [20]:
# Maps a race `location` value to the Wikipedia page-name suffix of its
# Grand Prix. The article URL is later built as
# 'https://en.wikipedia.org/wiki/' + <season> + '_' + <suffix>.
# Several keys are deliberate-looking aliases that resolve to the same suffix
# (e.g. 'Abu Dhabi' / 'United Arab Emirates', 'Great Britain' / 'United Kingdom').
# A location missing from this mapping raises KeyError when the links are built.
country_to_GP = {
    'Abu Dhabi': 'Abu_Dhabi_Grand_Prix',
    'Argentina': 'Argentine_Grand_Prix',
    'Australia': 'Australian_Grand_Prix',
    'Austria': 'Austrian_Grand_Prix',
    'Azerbaijan': 'Azerbaijan_Grand_Prix',
    'Bahrain': 'Bahrain_Grand_Prix',
    'Belgium': 'Belgian_Grand_Prix',
    'Brazil': 'Brazilian_Grand_Prix',
    'Canada': 'Canadian_Grand_Prix',
    'China': 'Chinese_Grand_Prix',
    # NOTE(review): same suffix as 'Imola' below — confirm that
    # '<season>_Imola_Grand_Prix' resolves to the intended article.
    'Emilia Romagna': 'Imola_Grand_Prix',
    'France': 'French_Grand_Prix',
    'Germany': 'German_Grand_Prix',
    'Great Britain': 'British_Grand_Prix',
    'Hungary': 'Hungarian_Grand_Prix',
    'Imola': 'Imola_Grand_Prix',
    'India': 'Indian_Grand_Prix',
    'Italy': 'Italian_Grand_Prix',
    'Japan': 'Japanese_Grand_Prix',
    'Korea': 'Korean_Grand_Prix',
    'Malaysia': 'Malaysian_Grand_Prix',
    'Mexico': 'Mexican_Grand_Prix',
    'Miami': 'Miami_Grand_Prix',
    'Monaco': 'Monaco_Grand_Prix',
    'Morocco': 'Moroccan_Grand_Prix',
    'Netherlands': 'Dutch_Grand_Prix',
    'Portugal': 'Portuguese_Grand_Prix',
    'Qatar': 'Qatar_Grand_Prix',
    'Russia': 'Russian_Grand_Prix',
    'San Marino': 'San_Marino_Grand_Prix',
    'Saudi Arabia': 'Saudi_Arabian_Grand_Prix',
    'Singapore': 'Singapore_Grand_Prix',
    'South Africa': 'South_African_Grand_Prix',
    'Spain': 'Spanish_Grand_Prix',
    'Sweden': 'Swedish_Grand_Prix',
    'Switzerland': 'Swiss_Grand_Prix',
    'Turkey': 'Turkish_Grand_Prix',
    'United Arab Emirates': 'Abu_Dhabi_Grand_Prix',
    'United Kingdom': 'British_Grand_Prix',
    'United States': 'United_States_Grand_Prix',
    # NOTE(review): this yields the same URL as 'Great Britain' for the same
    # season, so both rows would be scraped from one page — confirm this is
    # intended rather than pointing at a dedicated article.
    '70th Anniversary': 'British_Grand_Prix'
}

In [21]:
# Keyword vocabularies used to bucket a free-text weather description.
# Matching is on whole, lower-cased words only (e.g. 'rain' does not match
# 'raining').
wet = [
    'rain', 'wet', 'showers', 'damp', 'thunderstorms', 'rainy', 'moist',
    'dewy', 'drizzly', 'muggy', 'clammy', 'soggy', 'drenched',
]
cloudy = [
    'cloudy', 'overcast', 'gray', 'gloomy', 'dull', 'hazy', 'humid',
    'misty', 'foggy', 'smoky', 'nebulous', 'obscured', 'shadowy', 'cool',
]
sunny = [
    'sunny', 'dry', 'clear', 'bright', 'radiant', 'sunlit', 'sunshine',
    'shining', 'warm', 'hot', 'blazing', 'scorching',
]


def parse_weather_info(weather_string):
    """Classify a weather description as 'wet', 'cloudy' or 'sunny'.

    The text is split on every non-letter character and lower-cased; the
    resulting words are checked against the keyword lists in priority order
    wet > cloudy > sunny. A description that matches no keyword at all is
    reported as 'sunny'.

    Raises TypeError if ``weather_string`` is not a string (e.g. NaN).
    """
    tokens = {token.lower() for token in re.split(r'[^a-zA-Z]', weather_string)}

    # Order matters: the first category with any matching keyword wins.
    categories = (('wet', wet), ('cloudy', cloudy), ('sunny', sunny))
    for label, keywords in categories:
        if any(keyword in tokens for keyword in keywords):
            return label

    # No keyword recognised: fall back to 'sunny'.
    return 'sunny'
        

def parse_date(date_string):
    """Normalise a three-part date to the form '<day> <month>, <year>'.

    The input is broken into runs of ASCII letters/digits; anything else
    (spaces, commas, dashes, ...) acts as a separator. Exactly three parts
    are required, otherwise None is returned. If the second part is numeric
    and the first is not (e.g. 'March 20, 2022'), the two are swapped so the
    number comes first.

    Raises TypeError if ``date_string`` is not a string.
    """
    parts = re.findall(r'[a-zA-Z0-9]+', date_string)
    if len(parts) != 3:
        return None

    first, second, last = parts
    # Month-first input: put the numeric day in front.
    if second.isdigit() and not first.isdigit():
        first, second = second, first

    return f'{first} {second}, {last}'


# def parse_date(date_string):
#     try:
#         date = datetime.strptime(date_string, '%B %d, %Y')
#         return date.strftime('%B %d, %Y')
#     except ValueError:
#         return None
    

In [22]:
# One Wikipedia article URL per race, in row order:
# <base URL><season>_<Grand Prix page suffix for the row's location>.
# A location without an entry in country_to_GP raises KeyError.
wiki_links = [
    f'https://en.wikipedia.org/wiki/{season}_{country_to_GP[location]}'
    for season, location in zip(race_list['season'], race_list['location'])
]

race_list['wiki_link'] = wiki_links
    

In [23]:
# Sanity check: (rows, columns) of the table after adding 'wiki_link'.
print(race_list.shape)

(32, 4)


In [24]:
# Per-race values scraped from each article. Every list receives exactly one
# entry per link in wiki_links (None when a value could not be obtained), so
# the lists always stay aligned with the rows of race_list.
weather_info = []
dates = []
course_lengths = []
circuit_names = []


def _find_race_table(tables, max_tables=4):
    """Return the first of the leading ``max_tables`` tables whose first
    column contains the label 'Date', or None if there is no such table.

    This is presumably the article's summary/infobox table.
    """
    for table in tables[:max_tables]:
        if 'Date' in table.iloc[:, 0].values:
            return table
    return None


def _table_value(table, labels, label, parser=None):
    """Return the second-column value of the row labelled ``label``.

    ``labels`` is the table's first column as a list. If ``parser`` is given
    it is applied to the raw value. Any failure (label missing, no second
    column, parser error such as a NaN cell) yields None.
    """
    try:
        value = table.iloc[labels.index(label), 1]
        return value if parser is None else parser(value)
    except Exception:
        return None


for link in wiki_links:
    try:
        # Download and parse the page once per link (not once per candidate
        # table), then pick the table that carries the race details.
        race_table = _find_race_table(pd.read_html(link))
    except Exception:
        # Network error, no tables on the page, malformed table, ...
        race_table = None

    # With no usable table the label list is empty, so every lookup below
    # falls through to None and the four lists still grow by one entry.
    labels = [] if race_table is None else race_table.iloc[:, 0].values.tolist()

    weather_info.append(_table_value(race_table, labels, 'Weather', parse_weather_info))
    circuit_names.append(_table_value(race_table, labels, 'Location'))
    dates.append(_table_value(race_table, labels, 'Date', parse_date))
    course_lengths.append(_table_value(race_table, labels, 'Course length'))



In [25]:
# Attach the scraped values to the race table as new columns
# (each list must have one entry per row of race_list).
scraped_columns = {
    'circuit_length': course_lengths,
    'date': dates,
    'weather': weather_info,
    'circuit_full_name': circuit_names,
}
for column_name, column_values in scraped_columns.items():
    race_list[column_name] = column_values

In [26]:
# Visual check of the enriched table (original columns plus scraped ones).
print(race_list)

    season  round        location  \
0     2022      1         Bahrain   
1     2022      2    Saudi Arabia   
2     2022      3       Australia   
3     2022      4  Emilia Romagna   
4     2022      5           Miami   
5     2022      6           Spain   
6     2022      7          Monaco   
7     2022      8      Azerbaijan   
8     2022      9          Canada   
9     2022     10   Great Britain   
10    2022     11         Austria   
11    2022     12          France   
12    2022     13         Hungary   
13    2022     14         Belgium   
14    2022     15     Netherlands   
15    2022     16           Italy   
16    2022     17       Singapore   
17    2022     18           Japan   
18    2022     19   United States   
19    2022     20          Mexico   
20    2022     21          Brazil   
21    2022     22       Abu Dhabi   
22    2023      1         Bahrain   
23    2023      2    Saudi Arabia   
24    2023      3       Australia   
25    2023      4      Azerbaijan   
2

In [27]:
# Write the enriched table to CSV without the DataFrame index column.
# NOTE(review): the output path is absolute and user-specific, whereas the
# input is read from a relative path — consider a relative or configurable
# path so the notebook runs on other machines.
race_list.to_csv('/Users/anirudhkrishna/GitHub/FormulaData/final_race_list.csv', index=False)