# Scraping Recalls from Government of Canada Food Recalls and Safety Alerts
### Part II - Extracting Recall Details

In [1]:
from selenium import webdriver # Initialize web browser
from selenium.webdriver.common.by import By # Search HTML elements using specific parameters
from selenium.webdriver.common.keys import Keys # Provide keys in the keyboard like RETURN, F1, ALT etc.
from selenium.webdriver.support.ui import WebDriverWait # Wait for a page to load
from selenium.webdriver.support import expected_conditions as EC # Specify what you are looking for on a specific page in order to determine that the webpage has loaded.
from selenium.webdriver.support.ui import Select # Select dropdown box values
from bs4 import BeautifulSoup
import pandas as pd
import time

# Never truncate cell contents when a dataframe is displayed.
pd.set_option('display.max_colwidth', None)

# Start Chrome through the local chromedriver binary and open the search page.
DRIVER_PATH = 'C:\\webdrivers\\chromedriver.exe'
driver = webdriver.Chrome(executable_path=DRIVER_PATH)
driver.get('https://recalls-rappels.canada.ca/en/search/site?search_api_fulltext=&archived=1&f%5B0%5D=category%3A144&page=0')

# Read the table of recall links.
recalls = pd.read_csv(r'recalls-2022-links.csv')

# Discard the first column of the file.
first_column = recalls.columns[0]
recalls = recalls.drop(columns=first_column)

# Rows from position 4591 onward hold advisories rather than recalls.
advisory_rows = recalls.index[4591:]
recalls = recalls.drop(index=advisory_rows)

# A 'date' value containing letters is not a date: those rows are advisories too.
has_letters = recalls['date'].str.contains('[a-z]')
recalls = recalls.drop(index=recalls[has_letters].index)

# Parse the dates and trim surrounding whitespace from the titles.
recalls = recalls.assign(
    date=pd.to_datetime(recalls['date'], format='%Y-%m-%d'),
    title=recalls['title'].str.strip(),
)
recalls.head()

Unnamed: 0,link,title,recall_info,date
0,https://recalls-rappels.canada.ca/en/alert-recall/mrakovic-fine-foods-brand-natural-chicken-burger-recalled-due-undeclared-egg,Mrakovic Fine Foods brand Natural Chicken Burger recalled due to undeclared egg,Mrakovic Fine Foods brand Natural Chicken Burger recalled due to undeclared egg. The recalled product has been sold in Ontario.,2022-08-24
1,https://recalls-rappels.canada.ca/en/alert-recall/life120-brand-ricotta-di-bufala-campana-dop-recalled-due-generic-e-coli,"Life120 brand ""Ricotta Di Bufala Campana Dop"" recalled due to generic E. coli","Life120 brand ""Ricotta Di Bufala Campana Dop"" recalled due to generic E. coli. The recalled product has been sold in Quebec.",2022-08-24
2,https://recalls-rappels.canada.ca/en/alert-recall/food-recall-warning-allergen-mastro-san-daniele-brand-charcuterie-trio-prosciutto,"Food Recall Warning (Allergen) – Mastro San Daniele brand Charcuterie Trio – Prosciutto Cotto, Capocollo, Mortadella recalled due to undeclared milk","The affected product is being recalled from the marketplace because it contains milk which is not declared on the label. The recalled product has been sold in British Columbia, Alberta, Ontario, New Brunswick, Nova Scotia, and may have been…",2022-08-19
3,https://recalls-rappels.canada.ca/en/alert-recall/food-recall-warning-allergen-cache-cuisine-brand-pro-licious-protein-pancake-waffle,Food Recall Warning (Allergen) – Cache Cuisine brand Pro-licious Protein Pancake & Waffle Mix recalled due to improperly undeclared milk,The affected product is being recalled from the marketplace because it contains milk which is improperly declared on the label. The recalled product has been sold nationally.,2022-08-19
4,https://recalls-rappels.canada.ca/en/alert-recall/delizia-brand-vegetarian-ham-recalled-due-undeclared-gluten-and-wheat-0,Delizia brand Vegetarian Ham recalled due to undeclared gluten and wheat,"Delizia brand Vegetarian Ham recalled due to undeclared gluten and wheat. The recalled product has been sold in Alberta, British Columbia, Manitoba and Ontario.",2022-08-19


### Stage 1: Extracting recalls from 2022-08-24 to 2021-10-21

In [None]:
# Visit the detail page of every recall dated after 2021-10-20, pull the
# individual fields out of the HTML, merge them with the link table on
# 'title' and save the result as 'recalls-p-2022-2021.csv'.
recalls_p_2022_2021 = recalls[recalls['date'] > '2021-10-20'].reset_index()

titles = []
issues = []
audiences = []
companies = []
categories = []
audiences_2 = []
recall_classes = []

# A field that is absent from a page shows up either as a result list that is
# too short (IndexError) or as find() returning None (AttributeError). Only
# those two are caught, so Ctrl-C and real failures are no longer swallowed.
for link in recalls_p_2022_2021['link']:
    driver.get(link)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    # Two lookups are tried: <dd class="paddingNone"> elements first, then
    # <div class="field--items"> containers as a fallback for some fields.
    details = soup.find_all('dd', class_='paddingNone')
    details_alt = soup.find_all('div', class_='field--items')

    try:
        titles.append(soup.find('h1', class_='gc-thickline').text.strip().replace('\n', ''))
    except (IndexError, AttributeError):
        titles.append(None)

    try:
        issues.append(details[3].text.strip())
    except (IndexError, AttributeError):
        try:
            issues.append(details_alt[1].find('div', class_='field--item').text)
        except (IndexError, AttributeError):
            issues.append(None)

    try:
        audiences.append(details[8].text.strip())
    except (IndexError, AttributeError):
        try:
            audiences.append(details_alt[4].find('div', class_='field--item').text)
        except (IndexError, AttributeError):
            audiences.append(None)

    # NOTE(review): this reads details[3], the same element used for `issues`
    # above - confirm which index actually holds the category.
    try:
        categories.append(details[3].text.strip())
    except (IndexError, AttributeError):
        # The placeholder must go to `categories`; appending it to another
        # list leaves the columns with different lengths.
        categories.append(None)

    try:
        companies.append(details[6].text.strip())
    except (IndexError, AttributeError):
        companies.append(None)

    try:
        audiences_2.append(details[7].text.strip())
    except (IndexError, AttributeError):
        audiences_2.append(None)

    try:
        recall_classes.append(details[4].text.strip())
    except (IndexError, AttributeError):
        try:
            recall_classes.append(details_alt[5].find('div', class_='field--item').text)
        except (IndexError, AttributeError):
            recall_classes.append(None)

recalls_ca = pd.DataFrame({'title': titles, 'issue': issues, 'audience': audiences,
                           'company': companies, 'category': categories,
                           'audience_2': audiences_2, 'recall_class': recall_classes})

# Merge with the frame that supplied the links, keep one row per title and
# write the file (a cell is not a function, so there is nothing to `return`).
merged_01 = pd.merge(recalls_p_2022_2021, recalls_ca, on='title', how='inner')
merged_01 = merged_01.drop_duplicates('title')
merged_01.to_csv('recalls-p-2022-2021.csv')

In [None]:
# Visit the detail page of every recall dated after 2021-10-20, read the
# first six 'field--items' containers, merge the result with the link table
# on 'title' and save it as 'recalls-2022-2021.csv'.
recalls_2021_10_20 = recalls[recalls['date'] > '2021-10-20']

# Output column for the 'field--items' container at each position (0-5).
FIELD_COLUMNS = ['company', 'issue', 'audiences', 'category', 'audience_2', 'recall_class']

records = []
for link in recalls_2021_10_20['link']:
    driver.get(link)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    record = {}

    # A missing element shows up as find() returning None (AttributeError) or
    # as a result list that is too short (IndexError). Only those are caught,
    # so Ctrl-C and real failures are not silently turned into None values.
    try:
        record['title'] = soup.find('h1', class_='gc-thickline').text.strip().replace('\n', '')
    except (IndexError, AttributeError):
        record['title'] = None

    details = soup.find_all('div', class_='field--items')
    for position, column in enumerate(FIELD_COLUMNS):
        try:
            record[column] = details[position].find('div', class_='field--item').text
        except (IndexError, AttributeError):
            record[column] = None

    records.append(record)

# Passing `columns` keeps the expected columns even if no page was visited.
recalls_ca = pd.DataFrame(records, columns=['title'] + FIELD_COLUMNS)

merged = pd.merge(recalls_2021_10_20, recalls_ca, on='title', how='inner')
merged = merged.drop_duplicates('title')
merged.to_csv('recalls-2022-2021.csv')

In [None]:
# Display the (rows, columns) dimensions of `merged`.
merged.shape

### Stage 2: Extracting recalls from 2021-10-15 to 2011-01-04
##### Accessing all 4239 links in a single run crashes the Selenium driver. To prevent this, the data is split into one dataframe per year, and a function that takes two parameters, the year (str or int) and the dataframe for that year, extracts the recalls one year at a time.

In [3]:
# Keep every recall dated before 2021-10-20 and show the size of that subset.
before_cutoff = recalls['date'] < '2021-10-20'
recalls_2011_01_04 = recalls[before_cutoff].reset_index()
recalls_2011_01_04.shape

(4239, 5)

In [3]:
def _recalls_between(start, end):
    """Return the rows of `recalls` with start <= date < end, index reset.

    The lower bound is inclusive and the upper bound exclusive, so consecutive
    windows neither overlap nor leave a day uncovered.
    """
    in_window = (recalls['date'] >= start) & (recalls['date'] < end)
    return recalls[in_window].reset_index()


# One dataframe per calendar year. 2021 ends at the 2021-10-20 cutoff used
# for this stage; its lower bound is inclusive like every other year, so a
# recall dated 2021-01-01 is no longer dropped from both 2020 and 2021.
recalls_2021 = _recalls_between('2021-01-01', '2021-10-20')
recalls_2020 = _recalls_between('2020-01-01', '2021-01-01')
recalls_2019 = _recalls_between('2019-01-01', '2020-01-01')
recalls_2018 = _recalls_between('2018-01-01', '2019-01-01')
recalls_2017 = _recalls_between('2017-01-01', '2018-01-01')
recalls_2016 = _recalls_between('2016-01-01', '2017-01-01')
recalls_2015 = _recalls_between('2015-01-01', '2016-01-01')
recalls_2014 = _recalls_between('2014-01-01', '2015-01-01')
recalls_2013 = _recalls_between('2013-01-01', '2014-01-01')
recalls_2012 = _recalls_between('2012-01-01', '2013-01-01')
recalls_2011 = _recalls_between('2011-01-01', '2012-01-01')

# Report the number of recalls per year, newest first.
for year, yearly_recalls in [
    ('2021', recalls_2021), ('2020', recalls_2020), ('2019', recalls_2019),
    ('2018', recalls_2018), ('2017', recalls_2017), ('2016', recalls_2016),
    ('2015', recalls_2015), ('2014', recalls_2014), ('2013', recalls_2013),
    ('2012', recalls_2012), ('2011', recalls_2011),
]:
    print(f'{year} recalls: ' + str(yearly_recalls.shape[0]))

2021 recalls: 186
2020 recalls: 230
2019 recalls: 357
2018 recalls: 302
2017 recalls: 348
2016 recalls: 413
2015 recalls: 385
2014 recalls: 619
2013 recalls: 444
2012 recalls: 499
2011 recalls: 456


In [4]:
# A page element that is missing shows up either as a result list that is too
# short (IndexError) or as find() returning None (AttributeError).
_ELEMENT_MISSING = (IndexError, AttributeError)


def _dd_text(details, position):
    """Return the stripped text of details[position], or None if it is absent."""
    try:
        return details[position].text.strip()
    except _ELEMENT_MISSING:
        return None


def _field_item_text(containers, position):
    """Return the text of the first 'field--item' div in containers[position], or None."""
    try:
        return containers[position].find('div', class_='field--item').text
    except _ELEMENT_MISSING:
        return None


def extract_recalls(year, dataframe):
    """Scrape the detail page of every recall in `dataframe` and save the result.

    Each link is opened with the module-level Selenium `driver`. Fields are
    read from the <dd class="paddingNone"> elements; issue, audience and
    recall class fall back to the <div class="field--items"> containers when
    the first lookup finds nothing. A field found by neither lookup is None.

    Parameters
    ----------
    year : str or int
        Only used to name the output file, ``recalls-{year}.csv``.
    dataframe : pandas.DataFrame
        Must have a 'link' column (pages to visit) and a 'title' column
        (merge key).

    Returns
    -------
    pandas.DataFrame
        `dataframe` inner-merged on 'title' with the scraped columns 'issue',
        'audience', 'company', 'distribution' and 'recall_class', one row per
        title. The same table is written to ``recalls-{year}.csv``.
    """
    titles = []
    issues = []
    audiences = []
    companies = []
    distributions = []
    recall_classes = []

    for link in dataframe['link']:
        driver.get(link)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        details = soup.find_all('dd', class_='paddingNone')
        details_alt = soup.find_all('div', class_='field--items')

        try:
            title = soup.find('h1', class_='gc-thickline').text.strip().replace('\n', '')
        except _ELEMENT_MISSING:
            title = None
        titles.append(title)

        # Fields with a fallback: use the second lookup only when the first
        # one found nothing (an empty string still counts as found).
        issue = _dd_text(details, 3)
        if issue is None:
            issue = _field_item_text(details_alt, 1)
        issues.append(issue)

        audience = _dd_text(details, 8)
        if audience is None:
            audience = _field_item_text(details_alt, 4)
        audiences.append(audience)

        companies.append(_dd_text(details, 6))
        distributions.append(_dd_text(details, 7))

        recall_class = _dd_text(details, 4)
        if recall_class is None:
            recall_class = _field_item_text(details_alt, 5)
        recall_classes.append(recall_class)

    recalls_ca = pd.DataFrame({'title': titles, 'issue': issues, 'audience': audiences,
                               'company': companies, 'distribution': distributions,
                               'recall_class': recall_classes})

    merged_02 = pd.merge(dataframe, recalls_ca, on='title', how='inner')
    merged_02 = merged_02.drop_duplicates('title')

    # to_csv() returns None when given a path, so write the file first and
    # return the dataframe itself.
    merged_02.to_csv(f'recalls-{year}.csv')
    return merged_02

In [9]:
# Scrape the 2011 recalls; the result is saved as 'recalls-2011.csv'.
# Arguments: the year (str or int) and the dataframe holding that year's links.
extract_recalls('2011', recalls_2011)