In [6]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
import numpy as np
import pandas as pd

In [348]:
# Filesystem locations of the Brave browser binary and the local chromedriver.
binary_loc = "/usr/bin/brave-browser"
driver_loc = "/usr/local/bin/chromedriver"

# Chrome options pointing Selenium at the browser binary configured above.
opts = webdriver.ChromeOptions()
opts.binary_location = binary_loc

# Service wrapping the local chromedriver executable.
service = ChromeService(driver_loc)

In [349]:
# Landing page of the site to scrape; this is the first URL the driver loads.
input_weblink = "https://outbreaks.globalincidentmap.com/"

In [350]:
# Start the browser. The chromedriver binary is resolved by webdriver_manager,
# pinned to a fixed release.
# NOTE(review): newer webdriver_manager releases appear to have renamed the
# `version` keyword to `driver_version` -- confirm against the installed version.
driver = webdriver.Chrome(
    options=opts,
    service=ChromeService(ChromeDriverManager(version="112.0.5615.49").install()),
)
driver.get(input_weblink)

# Constructing a WebDriverWait does not wait by itself; `.until(...)` does.
# Block (for at most 10 s) until at least one table container is present, so
# the cells below do not count tables on a half-rendered page.
wait = WebDriverWait(driver, 10)
wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="page_content"]/div[3]/div')))

# Part 1: Collection of metadata

In the page's HTML layout, the content is divided into sections, which XPath addresses as div[1], div[2], etc. In our case, the tables are located in div[3].

Each table in div[3] is stored in its own child div, addressed as div[3]/div[1], div[3]/div[2], etc.

In [351]:
# Every direct child <div> of the third section holds one table.
table_containers = driver.find_elements(By.XPATH, '//*[@id="page_content"]/div[3]/div')

num_of_cols = 5  # column count as observed on the website
num_of_tables = len(table_containers)
print(num_of_tables)  # total number of tables

50


In [352]:
# Total number of <tbody> rows over all table containers (XPath indices are 1-based).
total_rows = sum(
    len(
        driver.find_elements(
            By.XPATH,
            f'//*[@id="page_content"]/div[3]/div[{table_idx}]/div[2]/table/tbody/tr',
        )
    )
    for table_idx in range(1, num_of_tables + 1)
)

In [353]:

# Inspection only: count the <td> cells in the first body row of each table.
# `num_of_col` ends up holding the count for the last table.
for table_idx in range(1, num_of_tables + 1):
    first_row_cells_xpath = (
        f'//*[@id="page_content"]/div[3]/div[{table_idx}]/div[2]/table/tbody/tr[1]/td'
    )
    num_of_col = len(driver.find_elements(By.XPATH, first_row_cells_xpath))

In [354]:
# Pre-allocate the metadata grid: one header row plus `total_rows` data rows,
# each made of `num_of_cols` empty strings (every row is its own list).
metadata = [[""] * num_of_cols for _ in range(total_rows + 1)]

In [355]:
# Column titles are read from the <thead> of the third table container.
header_cell_xpath = '//*[@id="page_content"]/div[3]/div[3]/div[2]/table/thead/tr/th[{}]'
for col_idx in range(num_of_cols):
    metadata[0][col_idx] = driver.find_element(By.XPATH, header_cell_xpath.format(col_idx + 1)).text

# The fifth column's title is set by hand, as it is not in the website.
metadata[0][4] = "DESCRIPTION"

print(metadata[0])

['DATE/TIME', 'DETAIL', 'COUNTRY', 'CITY', 'DESCRIPTION']


In [356]:
# Fill `metadata` (rows 1..n) with the body cells of every table.
#
# Changes versus the earlier version of this cell:
#   * the per-row cell count is no longer stored in `num_of_cols`, so the
#     notebook-level column count stays intact when cells are re-run;
#   * rows and cells are fetched once and iterated as elements instead of
#     re-resolving a full XPath for every single cell;
#   * column 2 is read only once (its link target), not as text first.
table_row = 0  # index of the last filled row in `metadata` (row 0 is the header)

# Iteration starts at container 3, as before.
# NOTE(review): presumably the first two containers hold no data rows -- confirm.
for table_idx in range(3, num_of_tables + 1):
    body_rows = driver.find_elements(
        By.XPATH,
        f'//*[@id="page_content"]/div[3]/div[{table_idx}]/div[2]/table/tbody/tr',
    )
    for body_row in body_rows:
        cells = body_row.find_elements(By.XPATH, './td')
        if not cells:
            # Rows without <td> children carry no data and consume no output row.
            continue
        table_row += 1
        for col_idx, cell in enumerate(cells):
            if col_idx == 1:
                # Second column: keep the link target rather than the visible text.
                metadata[table_row][col_idx] = cell.find_element(By.XPATH, './a').get_attribute('href')
            else:
                metadata[table_row][col_idx] = cell.text
            

In [357]:
# Turn the nested list into a 2-D array so header and body can be sliced apart.
metadata = np.array(metadata)
meta_columns = metadata[0, :]
meta_body = metadata[1:, :]
meta_df = pd.DataFrame(data=meta_body, columns=meta_columns)

# Keep only rows whose COUNTRY cell is non-empty; this also removes
# pre-allocated rows that were left blank.
has_country = meta_df['COUNTRY'] != ''
meta_df = meta_df[has_country]
display(meta_df)

Unnamed: 0,DATE/TIME,DETAIL,COUNTRY,CITY,DESCRIPTION
0,2023-05-12 22:23:12,https://outbreaks.globalincidentmap.com/event_...,United States,"Escanaba, MI, USA",MICHIGAN - Michigan Paper Mill With Fungal Inf...
1,2023-05-12 18:51:49,https://outbreaks.globalincidentmap.com/event_...,Canada,"RattleSnake Point Golf Club, Regional Road 25,...",CANADA - Health Officials Confirm Case Of Hepa...
2,2023-05-11 15:17:00,https://outbreaks.globalincidentmap.com/event_...,Switzerland,"Geneva, Switzerland",WITZERLAND - WHO Declares End To Monkeypox Pub...
3,2023-05-09 09:22:50,https://outbreaks.globalincidentmap.com/event_...,United States,"Sacramento, CA, USA",CALIFORNIA - Confirmed Hospitalizations And Il...
4,2023-05-09 09:21:44,https://outbreaks.globalincidentmap.com/event_...,United States,"Lansing, MI, USA",MICHIGAN - Rare Fungal Outbreak In Michigan Gr...
...,...,...,...,...,...
72,2023-05-04 06:31:28,https://outbreaks.globalincidentmap.com/event_...,Russia,"Moscow, Russia",RUSSIA - Russia Records 4273 Daily Covid Cases...
73,2023-05-04 06:28:13,https://outbreaks.globalincidentmap.com/event_...,India,Itanagar,INDIA - Arunachal Logs 7 New Covid Cases
74,2023-05-04 06:27:34,https://outbreaks.globalincidentmap.com/event_...,India,New Delhi,INDIA - India Reports 3962 Fresh Covid-19 Case...
75,2023-05-04 06:25:54,https://outbreaks.globalincidentmap.com/event_...,United Kingdom,"Glasgow, UK",UNITED KINGDOM - Deaths Involving Covid Contin...


# Part 2: Accessing data from the links in Metadata

Studying the pages opened by these links, we see that each page lists 11 attributes, each with one value, and that this holds for all 74 links.

In [358]:
# print(len(meta_df)) # total number of links
# print(len(driver.find_elements(By.XPATH, '//*[@id="page_content"]/div[1]/div'))) # no. of keys and values per link

In [359]:
# One row of 11 empty strings per metadata link; filled in by the detail-page loop.
temp = [[""] * 11 for _ in range(len(meta_df))]

In [360]:
# Build the header for the detail table from the first linked page.
header = [''] * 11
driver.get(meta_df.iloc[0, 1])  # column 1 of meta_df holds the detail-page URL

# Constructing a WebDriverWait does not wait by itself; `.until(...)` does.
# Wait (at most 4 s) until the key/value <div>s exist and fetch them in one call.
wait = WebDriverWait(driver, 4)
key_value_divs = wait.until(
    EC.presence_of_all_elements_located((By.XPATH, '//*[@id="page_content"]/div[1]/div'))
)
num_of_keys = len(key_value_divs)

# Odd 1-based positions (1, 3, 5, ...) hold the attribute names; the even
# positions hold the values, which a later cell reads.
for i in range(1, num_of_keys + 1, 2):
    header[i // 2] = key_value_divs[i - 1].text

In [361]:
# Show the attribute names collected from the first detail page.
print(header)

['Event Title', 'Event Type', 'Date Time', 'Country', 'City', 'Infrastructure Affected', 'Severity', 'Latitude', 'Longitude', 'Url', 'Description']


In [362]:
# Visit every detail page and copy its attribute values into `temp`.
# Values sit at the even 1-based positions (2, 4, ...) of the key/value <div>s.
for i in range(len(meta_df)):
    driver.get(meta_df.iloc[i, 1])  # column 1 of meta_df holds the detail-page URL

    # Row width of `temp` decides how many values are read (no repeated literal).
    num_values = len(temp[i])

    # Assigning a WebDriverWait to an attribute waits for nothing. Instead,
    # block (at most 2 s) until the last value <div> is present.
    WebDriverWait(driver, 2).until(
        EC.presence_of_element_located(
            (By.XPATH, f'//*[@id="page_content"]/div[1]/div[{2 * num_values}]')
        )
    )

    for j in range(1, num_values + 1):
        temp[i][j - 1] = driver.find_element(
            By.XPATH, f'//*[@id="page_content"]/div[1]/div[{2 * j}]'
        ).text


In [363]:
# Header row first, followed by one row of values per detail page.
data = [header, *temp]

In [364]:
# Turn the nested list into a 2-D array so header and body can be sliced apart.
data = np.array(data)
detail_columns = data[0, :]
detail_body = data[1:, :]
df = pd.DataFrame(data=detail_body, columns=detail_columns)

# No row filtering is applied to this table.
display(df)

Unnamed: 0,Event Title,Event Type,Date Time,Country,City,Infrastructure Affected,Severity,Latitude,Longitude,Url,Description
0,MICHIGAN - Michigan Paper Mill With Fungal Inf...,General News,2023-05-12 22:23:12,US (United States),"Escanaba, MI, USA",Unknown,Severe,45.7452470,-87.0645800,https://www.insurancejournal.com/news/midwest/...,[AP] MICHIGAN - Michigan Paper Mill With Funga...
1,CANADA - Health Officials Confirm Case Of Hepa...,General News,2023-05-12 18:51:49,CA (Canada),"RattleSnake Point Golf Club, Regional Road 25,...",Unknown,Unknown,43.4834330,-79.8135600,https://www.chch.com/health-officials-confirm-...,[chch.com] CANADA - Health officials confirm c...
2,WITZERLAND - WHO Declares End To Monkeypox Pub...,General News,2023-05-11 15:17:00,CH (Switzerland),"Geneva, Switzerland",Unknown,Severe,46.2043910,6.1431580,https://www.deccanherald.com/international/wor...,[Reuters] SWITZERLAND - WHO declares end to mo...
3,CALIFORNIA - Confirmed Hospitalizations And Il...,General News,2023-05-09 09:22:50,US (United States),"Sacramento, CA, USA",Unknown,Unknown,38.5815720,-121.4944000,https://www.foodpoisoningnews.com/confirmed-ho...,[foodpoisoningnews.com] CALIFORNIA - Confirmed...
4,MICHIGAN - Rare Fungal Outbreak In Michigan Gr...,General News,2023-05-09 09:21:44,US (United States),"Lansing, MI, USA",Unknown,Severe,42.7325350,-84.5555350,https://www.beckershospitalreview.com/public-h...,[beckershospitalreview.com] MICHIGAN - Rare fu...
...,...,...,...,...,...,...,...,...,...,...,...
72,RUSSIA - Russia Records 4273 Daily Covid Cases...,Coronavirus,2023-05-04 06:31:28,RU (Russia),"Moscow, Russia",Unknown,Severe,55.7558260,37.6173000,https://www.azernews.az/region/209446.html,"[azernews] RUSSIA - Russia records 4,273 daily..."
73,INDIA - Arunachal Logs 7 New Covid Cases,Coronavirus,2023-05-04 06:28:13,IN (India),Itanagar,Unknown,Severe,27.0843680,93.6053160,https://www.outlookindia.com/national/arunacha...,[PTI] INDIA - Arunachal Logs 7 New Covid Cases...
74,INDIA - India Reports 3962 Fresh Covid-19 Case...,Coronavirus,2023-05-04 06:27:34,IN (India),New Delhi,Unknown,Severe,28.6357600,77.2244500,https://www.newsbytesapp.com/news/india/india-...,"[news bytes] INDIA - India reports 3,962 fresh..."
75,UNITED KINGDOM - Deaths Involving Covid Contin...,Coronavirus,2023-05-04 06:25:54,UK (United Kingdom),"Glasgow, UK",Unknown,Severe,55.8642370,-4.2518060,https://theorkneynews.scot/2023/05/04/deaths-i...,[theorkneynews.scot] UNITED KINGDOM - Deaths I...


In [365]:
# Write both tables as CSV files (relative paths), omitting the DataFrame index.
df.to_csv('homepage_data.csv', index=False)
meta_df.to_csv('homepage_meta_data.csv', index=False)

In [368]:
# Quit the WebDriver session now that scraping and export are finished.
driver.quit()