# Scrapping Recalls from Government of Canada Food Recalls and Safety Alerts
### Part I - Extracting Recall Links

Date of scrapping: 2022-08-25

In [1]:
from selenium import webdriver # Initialize web browser
from selenium.webdriver.common.by import By # Search HTML elements using specific parameters
from selenium.webdriver.common.keys import Keys # Provide keys in the keyboard like RETURN, F1, ALT etc.
from selenium.webdriver.support.ui import WebDriverWait # Wait for a page to load
from selenium.webdriver.support import expected_conditions as EC # Specify what you are looking for on a specific page in order to determine that the webpage has loaded.
from selenium.webdriver.support.ui import Select # Select dropdown box values
from bs4 import BeautifulSoup
import pandas as pd
import time

# Initialize Selenium Chromedriver
DRIVER_PATH = 'C:\\webdrivers\\chromedriver.exe'
driver = webdriver.Chrome(executable_path = DRIVER_PATH)
driver.get('https://recalls-rappels.canada.ca/en/search/site?search_api_fulltext=&archived=1&f%5B0%5D=category%3A144&page=0')

  driver = webdriver.Chrome(executable_path = DRIVER_PATH)


### Parsing HTML page

In [28]:
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')

page_links = soup.find_all('div', class_='search-result views-row')

<div class="search-result views-row"><div class="views-field views-field-field-recall-type"><span class="field-content"><img alt="Food alert or recall" class="home-recent-icon" src="/sites/default/files/2021-11/icon-food.svg"/>
<span class="homepage-recent"><a href="/en/alert-recall/mrakovic-fine-foods-brand-natural-chicken-burger-recalled-due-undeclared-egg" hreflang="en">Mrakovic Fine Foods brand Natural Chicken Burger recalled due to undeclared egg</a><br/>
<div class="search-excerpt-wrapper">
      Mrakovic Fine Foods brand Natural Chicken Burger recalled due to undeclared egg.

The recalled product has been sold in Ontario.


  </div>
<div class="type-wrapper">
<span class="label label-danger">Recall</span><span class="ar-type">Notification | 2022-08-24
    </span>
</div>
</span></span></div></div>

### Extracting URL

In [39]:
'https://recalls-rappels.canada.ca'+ page_links[0].find('a').get('href')

'https://recalls-rappels.canada.ca/en/alert-recall/mrakovic-fine-foods-brand-natural-chicken-burger-recalled-due-undeclared-egg'

### Extracting Recall Title

In [44]:
page_links[0].find('a').text

'Mrakovic Fine Foods brand Natural Chicken Burger recalled due to undeclared egg'

### Extracting Additional Info

In [47]:
page_links[0].find('div', class_='search-excerpt-wrapper').text.strip().replace('\n\n', ' ')

'Mrakovic Fine Foods brand Natural Chicken Burger recalled due to undeclared egg. The recalled product has been sold in Ontario.'

### Extracting Date of Recall

In [51]:
page_links[0].find('span', class_='ar-type').text.strip().replace('Notification | ', '')

'2022-08-24'

## Putting it all together

In [84]:
recalled = []

for i in range(0,311):
    driver.get(f'https://recalls-rappels.canada.ca/en/search/site?search_api_fulltext=&archived=1&f%5B0%5D=category%3A144&page={i}')
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'html.parser')
    # time.sleep(0.5)
    page_links = soup.find_all('div', class_='search-result views-row')
    
    for link in page_links:
        notification = {'link': 'https://recalls-rappels.canada.ca'+ link.find('a').get('href'),
                   'title': link.find('a').text,
                   'recall_info': link.find('div', class_='search-excerpt-wrapper').text.strip().replace('\n\n', ' '),
                   'date': link.find('span', class_='ar-type').text.strip().replace('Notification | ', '').replace('Food recall warning | ', ''),
                  }
        recalled.append(notification)

recalls = pd.DataFrame(recalled)
recalls

Unnamed: 0,link,title,recall_info,date
0,https://recalls-rappels.canada.ca/en/alert-rec...,Mrakovic Fine Foods brand Natural Chicken Burg...,Mrakovic Fine Foods brand Natural Chicken Burg...,2022-08-24
1,https://recalls-rappels.canada.ca/en/alert-rec...,"Life120 brand ""Ricotta Di Bufala Campana Dop"" ...","Life120 brand ""Ricotta Di Bufala Campana Dop"" ...",2022-08-24
2,https://recalls-rappels.canada.ca/en/alert-rec...,Food Recall Warning (Allergen) – Mastro San Da...,The affected product is being recalled from th...,2022-08-19
3,https://recalls-rappels.canada.ca/en/alert-rec...,Food Recall Warning (Allergen) – Cache Cuisine...,The affected product is being recalled from th...,2022-08-19
4,https://recalls-rappels.canada.ca/en/alert-rec...,Delizia brand Vegetarian Ham recalled due to u...,Delizia brand Vegetarian Ham recalled due to u...,2022-08-19
...,...,...,...,...
4660,https://recalls-rappels.canada.ca/en/alert-rec...,Archived – Health Canada reminds parents of ...,Archived – Health Canada reminds parents of sc...,Information update | 2007-08-29
4661,https://recalls-rappels.canada.ca/en/alert-rec...,Archived - Health Canada has released its po...,Archived - Health Canada has released its posi...,Information update | 2007-08-10
4662,https://recalls-rappels.canada.ca/en/alert-rec...,Archived – Health Canada reminds Canadians a...,Archived – Health Canada reminds Canadians abo...,Information update | 2007-08-01
4663,https://recalls-rappels.canada.ca/en/alert-rec...,Archived - Health Canada reminds Canadians a...,Archived - Health Canada reminds Canadians abo...,Information update | 2007-06-21


In [86]:
recalls.isnull().sum()

link           0
title          0
recall_info    0
date           0
dtype: int64

### Save scrapped recalls to csv file
Succesfully scrapped all items from page 1 to 311. In total 4665 of 4680 items.

In [85]:
recalls.to_csv('recalls-links-2022.csv')

## Reference

In [54]:
print(type(page_links))
print(type(page_links[0]))
print(len(page_links))

<class 'bs4.element.ResultSet'>
<class 'bs4.element.Tag'>
15
