# Part 0: Setting up Selenium and importing libraries

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
import numpy as np
import pandas as pd
import datetime
from tqdm import tqdm

The driver setup below is specific to my system and to the versions of the modules I am using. I suggest taking the time to set up Selenium and a web driver on your own machine before running the data collection code: once the driver works, the rest of the notebook does not depend on the system and will keep working as long as the website does not change.

## For Brave Browser

In [2]:
# driver_loc = "/usr/local/bin/chromedriver"
# binary_loc = "/usr/bin/brave-browser"

# service = ChromeService(driver_loc)
# opts = webdriver.ChromeOptions()
# opts.binary_location = binary_loc
# #opts.add_argument("--headless")

# driver = webdriver.Chrome(options = opts, service=ChromeService(ChromeDriverManager(version="112.0.5615.49").install()))

## For Firefox

In [23]:
from selenium.webdriver.firefox.options import Options as FirefoxOptions

# Start Firefox with the "--headless" argument and keep the session in the
# module-level `driver`, which the scraping functions below use as a global.
options = FirefoxOptions()
options.add_argument("--headless")
driver = webdriver.Firefox(options=options)

# Part 1: Collecting the necessary URLs for scraping

In [4]:
def generate_link(start_date, end_date):
    """Build the outbreak-map URL for the window from start_date to end_date.

    Both arguments must provide ``strftime`` (for example ``datetime.datetime``).
    Each date is rendered as ``DD Mon YYYY`` -- the format the website uses --
    with its spaces written as ``%20`` so the value can sit in a query string.

    Returns the complete URL as a string.
    """
    start_date, end_date = (
        day.strftime("%d %b %Y").replace(" ", "%20")
        for day in (start_date, end_date)
    )
    return f"https://outbreaks.globalincidentmap.com/map?start_date={start_date}&end_date={end_date}"

In [5]:
# Build one URL per 90-day window covering Jan 1 1990 through Jun 1 2023.

start_date = datetime.datetime(1990, 1, 1) # first day to scrape
end_date = datetime.datetime(2023, 6, 1) # last day to scrape
interval = 90 # window length in days

# Single source of truth for the window size: the loop previously advanced by a
# hard-coded 90 days, so changing `interval` alone produced gaps or overlaps.
step = datetime.timedelta(days=interval)

links = [] # one link per window, in chronological order

window_start = start_date # separate cursor, so start_date keeps its configured value
while window_start < end_date: # until the windows reach the end date
    window_end = min(window_start + step, end_date) # the last window stops at end_date instead of overshooting it
    links.append(generate_link(window_start, window_end))
    window_start += step

# `links` now holds every URL needed to scrape the metadata from the website

# Part 2: Collection of metadata

## Creating functions to collect the metadata

### Initializing the collection process of Metadata

In [6]:
def initialize():
    """Size and allocate the metadata grid for the page currently loaded in `driver`.

    Counts the table containers under the page content, then adds up the body
    rows of every one of them (the first container included).

    Returns a tuple ``(metadata, num_of_tables, num_of_cols)`` where
    ``metadata`` is a list of ``total_rows + 1`` rows (one extra for the header),
    each a list of ``num_of_cols`` empty strings.
    """
    tables_xpath = '//*[@id="page_content"]/div[3]/div'
    num_of_tables = len(driver.find_elements(By.XPATH, tables_xpath))
    num_of_cols = 5 # the tables on the website have 5 columns

    # total number of body rows across all tables
    total_rows = sum(
        len(driver.find_elements(By.XPATH, f'{tables_xpath}[{table_idx}]/div[2]/table/tbody/tr'))
        for table_idx in range(1, num_of_tables + 1)
    )

    # blank 2D grid: header row plus one row per counted table row
    metadata = [[""] * num_of_cols for _ in range(total_rows + 1)]

    return metadata, num_of_tables, num_of_cols

In [7]:
def get_header(metadata, num_of_cols):
    """Write the column titles into ``metadata[0]`` in place.

    The titles are read from the ``<th>`` cells of the second table container
    on the page currently loaded in `driver`. Nothing is returned.
    """
    header_xpath = '//*[@id="page_content"]/div[3]/div[2]/div[2]/table/thead/tr/th'

    for col in range(num_of_cols):
        # XPath positions are 1-based, list indices 0-based
        metadata[0][col] = driver.find_element(By.XPATH, f'{header_xpath}[{col + 1}]').text

    # the last column is not explicitly titled on the website, so name it by hand
    metadata[0][4] = "DESCRIPTION"

In [8]:
def collect_metadata():
    """Scrape the event tables of the page currently loaded in `driver`.

    Walks every table container from the second one onwards, copies the text of
    each non-empty row into the grid prepared by ``initialize()``, and replaces
    the second cell's text with the ``href`` of the link it contains.

    Returns a DataFrame whose columns are the header row filled in by
    ``get_header()``; rows with an empty ``COUNTRY`` value are dropped.
    """
    metadata, num_of_tables, num_of_cols = initialize() # blank grid + page dimensions
    get_header(metadata, num_of_cols) # row 0 becomes the header

    next_row = 0 # index of the last grid row that was filled

    for table_idx in range(2, num_of_tables + 1): # the first container is skipped
        rows_xpath = f'//*[@id="page_content"]/div[3]/div[{table_idx}]/div[2]/table/tbody/tr'
        row_count = len(driver.find_elements(By.XPATH, rows_xpath))

        for row_idx in range(1, row_count + 1):
            cells_xpath = f'{rows_xpath}[{row_idx}]/td'
            cell_count = len(driver.find_elements(By.XPATH, cells_xpath))
            if cell_count == 0: # row without cells: nothing to store
                continue

            next_row += 1
            for cell_idx in range(1, cell_count + 1):
                metadata[next_row][cell_idx - 1] = driver.find_element(By.XPATH, f'{cells_xpath}[{cell_idx}]').text
                if cell_idx == 2: # the second column holds a link; keep its target instead of its text
                    metadata[next_row][1] = driver.find_element(By.XPATH, f'{cells_xpath}[2]/a').get_attribute('href')

    grid = np.array(metadata) # 2D array: header row followed by data rows
    meta_df = pd.DataFrame(data=grid[1:, 0:], columns=grid[0, 0:])

    # grid rows that were allocated but never filled have an empty COUNTRY; drop them
    return meta_df[meta_df['COUNTRY'] != '']

In [9]:
# Scrape the metadata of every window and combine it into one dataframe.
page_frames = [] # one metadata dataframe per scraped page

for input_weblink in tqdm(links): # for each link
    driver.get(input_weblink) # open the link
    page_frames.append(collect_metadata()) # collect the metadata of this page

# Concatenate once at the end: calling pd.concat inside the loop re-copies all
# previously collected rows on every iteration. An empty `links` list still
# yields an empty dataframe, as before.
Meta_df = pd.concat(page_frames, ignore_index=True) if page_frames else pd.DataFrame()

100%|██████████████████████████████████████| 136/136 [4:41:24<00:00, 124.15s/it]


In [10]:
# preview the combined metadata collected from all pages
display(Meta_df)

Unnamed: 0,DATE/TIME,DETAIL,COUNTRY,CITY,DESCRIPTION
0,2009-06-11 16:41:32,https://outbreaks.globalincidentmap.com/event_...,United Kingdom,Birmingham,UNITED KINGDOM :: Further swine flu cases conf...
1,2009-06-08 19:33:59,https://outbreaks.globalincidentmap.com/event_...,United States,"Oswego County, NY",NEW YORK :: Oswego County Child Has H1N1 Influ...
2,2009-06-17 03:27:33,https://outbreaks.globalincidentmap.com/event_...,Brazil,Sao Paulo,BRAZIL - Brazil Finds New Strain Of H1N1 Virus
3,2009-06-19 01:09:47,https://outbreaks.globalincidentmap.com/event_...,Saudi Arabia,Jeddah,SAUDI ARABIA :: Swine Flu Hits 2 Bruneians In ...
4,2009-06-19 01:06:22,https://outbreaks.globalincidentmap.com/event_...,Papua New Guinea,Papua New Guinea,PAPUA NEW GUINEA :: One PNG case of swine flu ...
...,...,...,...,...,...
39778,2023-04-10 14:04:30,https://outbreaks.globalincidentmap.com/event_...,Peru,Lima,PERU - Covid-19 Cases On The Rise In Peru - De...
39779,2023-04-10 13:58:09,https://outbreaks.globalincidentmap.com/event_...,South Korea,"Seoul, South Korea",SOUTH KOREA - Daily Covid-19 Cases Above 4000 ...
39780,2023-04-09 15:12:20,https://outbreaks.globalincidentmap.com/event_...,India,"Delhi, India",INDIA - Delhi Adds 699 Covid Cases - Positivit...
39781,2023-04-09 06:45:40,https://outbreaks.globalincidentmap.com/event_...,Russia,Moscow,RUSSIA - Russias COVID-19 case tally grows by ...


In [11]:
# persist the metadata (without the index column) so the scrape does not have to be repeated
Meta_df.to_csv('filtered_meta_data.csv', index=False)

# Part 3: Accessing data from the links in Metadata

Studying the pages opened by the links, we see that every page presents the same 11 attributes, each with a single value (this held for all 74 links that were inspected).

In [17]:
def collect_data(meta_df, num_fields=11, checkpoint_every=500, checkpoint_path='temp_data.csv'):
    """Visit every event page linked from `meta_df` and collect its attributes.

    Each page shows a grid of alternating cells: odd positions hold the field
    names, even positions hold the values. The field names are read once, from
    the first page, and used as column names for all pages.

    Parameters
    ----------
    meta_df : DataFrame whose second column (position 1) holds the page URLs.
    num_fields : number of name/value pairs read from each page.
    checkpoint_every : the rows collected so far are saved every this many
        pages; must be a positive integer.
    checkpoint_path : CSV file that receives the saved progress.

    Returns
    -------
    DataFrame with one row per page, one column per field, plus a ``Year``
    column holding the first four characters of ``Date Time``. An empty
    `meta_df` gives an empty DataFrame without touching the browser.

    Raises
    ------
    Any exception raised while scraping propagates unchanged, but the rows that
    were fully collected are written to `checkpoint_path` first. ``KeyError``
    is raised if no field named ``Date Time`` was found.
    """
    total = len(meta_df)
    if total == 0: # nothing to visit; previously this failed with an IndexError
        return pd.DataFrame()

    grid_xpath = '//*[@id="page_content"]/div[1]/div'

    # creating header from the first page
    header = ['' for _ in range(num_fields)]
    driver.get(meta_df.iloc[0, 1])
    num_of_keys = len(driver.find_elements(By.XPATH, grid_xpath))

    # field names sit at the odd grid positions; never read more than there are
    # header slots, so an unexpectedly long grid cannot overrun the list
    for i in range(1, min(num_of_keys, 2 * num_fields) + 1, 2):
        header[i // 2] = driver.find_element(By.XPATH, f'{grid_xpath}[{i}]').text

    rows = [] # holds only fully scraped pages, so it is always safe to save

    try:
        for i in range(total):
            driver.get(meta_df.iloc[i, 1])
            # values sit at the even grid positions, under the matching header
            rows.append([
                driver.find_element(By.XPATH, f'{grid_xpath}[{2 * j}]').text
                for j in range(1, num_fields + 1)
            ])

            # failsafe data recovery: periodic checkpoint
            if i % checkpoint_every == 0:
                pd.DataFrame(rows, columns=header).to_csv(checkpoint_path, index=False)

            # dynamic progress bar
            print(f"\rProgress: {i+1}/{total}", end="")
    except BaseException:
        # A failed page load or a kernel interrupt would otherwise discard
        # everything gathered since the last checkpoint; save, then re-raise.
        if rows:
            pd.DataFrame(rows, columns=header).to_csv(checkpoint_path, index=False)
        raise

    df = pd.DataFrame(rows, columns=header)

    # extracting only the year from the date and putting it in a new column at the end
    df['Year'] = df['Date Time'].str[:4]

    return df

In [None]:
# Opens every link listed in Meta_df, one page at a time, so this cell can run
# for a long time; collect_data saves its progress to temp_data.csv as it goes.
df = collect_data(Meta_df)
display(df)

In [54]:
# preview the event table
display(df)

Unnamed: 0,Event Title,Event Type,Date Time,Country,City,Infrastructure Affected,Severity,Latitude,Longitude,Url,Description,Year
0,UNITED KINGDOM :: Further swine flu cases conf...,Foot And Mouth Disease,2009-06-11 16:41:32,UK (United Kingdom),Birmingham,Unknown,Unknown,52.483,-1.894,http://news.bbc.co.uk/2/hi/uk_news/england/wes...,[BBC.co.uk] UNITED KINGDOM :: Further swine f...,2009
1,NEW YORK :: Oswego County Child Has H1N1 Influ...,Foot And Mouth Disease,2009-06-08 19:33:59,US (United States),"Oswego County, NY",Unknown,Unknown,43.483,-76.178,http://www.wwnytv.com/news/local/47219312.html,[WWNYTV.com] NEW YORK :: Oswego County Child ...,2009
2,BRAZIL - Brazil Finds New Strain Of H1N1 Virus,Notable H1N1 News And Announcements,2009-06-17 03:27:33,BR (Brazil),Sao Paulo,Unknown,Unknown,-23.549,-46.639,http://www.breitbart.com/article.php?id=CNG.59...,[Breitbart] BRAZIL - Brazil finds new strain o...,2009
3,SAUDI ARABIA :: Swine Flu Hits 2 Bruneians In ...,Swine Flu Confirmed Cases,2009-06-19 01:09:47,SA (Saudi Arabia),Jeddah,Unknown,Unknown,21.543,39.173,http://www.brudirect.com/index.php/20090618108...,[BRUDirect.com] SAUDI ARABIA :: Swine Flu Hit...,2009
4,PAPUA NEW GUINEA :: One PNG case of swine flu ...,Swine Flu Confirmed Cases,2009-06-19 01:06:22,PG (Papua New Guinea),Papua New Guinea,Unknown,Unknown,-5.805,144.785,http://au.news.yahoo.com/a/-/world/5664464,[AU.News.Yahoo.com] PAPUA NEW GUINEA :: One P...,2009
...,...,...,...,...,...,...,...,...,...,...,...,...
39778,PERU - Covid-19 Cases On The Rise In Peru - De...,Coronavirus,2023-04-10 14:04:30,PE (Peru),Lima,Unknown,Severe,-12.04318,-77.02824,https://en.mercopress.com/2023/04/10/covid-19-...,[Xinhua] PERU - Covid-19 cases on the rise in ...,2023.0
39779,SOUTH KOREA - Daily Covid-19 Cases Above 4000 ...,Coronavirus,2023-04-10 13:58:09,KR (South Korea),"Seoul, South Korea",Unknown,Severe,37.551891,126.991794,http://world.kbs.co.kr/service/news_view.htm?S...,[KBS] SOUTH KOREA - Daily COVID-19 Cases above...,2023.0
39780,INDIA - Delhi Adds 699 Covid Cases - Positivit...,Coronavirus,2023-04-09 15:12:20,IN (India),"Delhi, India",Unknown,Severe,28.704059,77.10249,https://www.indiatoday.in/coronavirus-outbreak...,[Press Trust of India] INDIA - Delhi adds 699 ...,2023.0
39781,RUSSIA - Russias COVID-19 case tally grows by ...,Coronavirus,2023-04-09 06:45:40,RU (Russia),Moscow,Unknown,Severe,55.75222,37.61556,https://tass.com/society/1601745,[TASS] RUSSIA - Russia’s COVID-19 case tally g...,2023.0


In [55]:
# save the event dataframe to a csv file (without the index column)
df.to_csv('filtered_data.csv', index=False)

In [56]:
# end the browser session now that scraping is finished
driver.quit()