# Scraping UK asylum data 2021

This notebook scrapes the tribunal decisions data related to asylum applications from https://tribunalsdecisions.service.gov.uk/

Create a `*.sh` file with the commands below to set up the environment, then run it from the terminal (e.g. `./mac_env.sh`).

In [None]:
## To RUN IN SHELL

# Virtual environment 
conda env list
conda activate tfm

# Libraries needed
conda install selenium
conda install beautifulsoup4
conda install lxml
pip install pandas
# requests and wget are imported by the notebook, so they are installed explicitly here
pip install requests
pip install wget
pip install tqdm
pip install whois
pip install builtwith
conda install -c conda-forge geckodriver

In [2]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By

import requests
from bs4 import BeautifulSoup
import time
import re
import json
import pandas as pd
import whois
import sys
import datetime
from tqdm import tqdm
from datetime import timedelta

import concurrent.futures

# What environment am I using?
print(sys.executable)

/Users/albertamurgopacheco/anaconda3/bin/python


Create a function to obtain the data from the landing page using BeautifulSoup.

In [3]:
def getData(htmlSource):
    """
    getData gets the mouse over links to the tribunal decisions & the dates

    Links are taken from the raw HTML with a regular expression; dates are taken
    from the <td class="date"> cells found by BeautifulSoup. Both lists are paired
    positionally, so the result is truncated to the shorter of the two.

    :param htmlSource: Source HTML for the page 
    :return: data as a list of (link, date) tuples, date formatted yyyy-mm-dd
    """

    # SCRAPING MOUSE OVER LINKS TO DECISIONS

    # A single capture group makes findall return the decision identifiers directly
    # (the part of the href that follows "/utiac/").
    linksList = re.findall(r'<a href="/utiac/(.*?)"></a>', htmlSource)

    # SCRAPING DATES

    # Name the parser explicitly: the generic 'html' feature makes BeautifulSoup
    # guess (and warn), so results could differ between machines. lxml is the
    # parser installed by this notebook's environment setup.
    soup = BeautifulSoup(htmlSource, 'lxml')

    datesList = []
    for dateCell in soup.find_all("td", class_="date"):
        cellHtml = str(dateCell)
        # Look for the yyyy-mm-dd value itself rather than relying on its exact
        # character position inside the serialized cell.
        isoDate = re.search(r'\d{4}-\d{2}-\d{2}', cellHtml)
        # Fall back to the historical fixed-offset slice so the list keeps one
        # entry per cell even when no ISO date is present.
        datesList.append(isoDate.group(0) if isoDate else cellHtml[33:43])

    # Pair each link with its date: zip() merges the two lists into tuples.
    list_of_tuples = list(zip(linksList, datesList))

    return list_of_tuples

Create a function to obtain the data from the court's detailed decision page using BeautifulSoup.

In [4]:
def getDetailedData(htmlSource):
    """
    getDetailedData gets all the linked tribunal decisions data

    This is still a stub: the parsing logic has not been written yet. It used to
    end with ``return df`` although no ``df`` is created inside the function, so a
    call either failed with NameError or silently returned whatever global ``df``
    happened to exist in the kernel. It now fails explicitly instead.

    :param htmlSource: Source HTML for the page 
    :return: data
    :raises NotImplementedError: always, until the parsing logic is implemented
    """

    # START WITH URL

    raise NotImplementedError("getDetailedData is not implemented yet")

Now that we have defined all the necessary functions, we can open a browser and start scraping.

In [5]:
# Using selenium, open the tribunal decision's website in a firefox window
driver = webdriver.Firefox()
driver.get("https://tribunalsdecisions.service.gov.uk/")

# Getting current page title
get_title = driver.title

# Printing the title of this URL and checking we landed on the expected site
print(get_title)
assert "Tribunal decisions" in driver.title

# Pause BEFORE reading the page source: sleeping after the capture cannot
# influence what was captured.
time.sleep(2)

# Getting current URL source code
get_source = driver.page_source

# Show only the beginning of the HTML; printing the whole page floods the output
print(get_source[:1000])

# 1. GET DATES
# 2. Build function that gets data per page
# 3. Loop over 1667 pages and append the lists
# 4. Build dataframe or iterate over each list to 1) open url 2) extract additional content, and 3) download file using class doc-file



Tribunal decisions
<html><head>
<title>Tribunal decisions</title>
<meta content="width=device-width, initial-scale=1.0" name="viewport">
<link href="/assets/favicon-5aa00a9f9e6dc48d4e92f306a94e9496.ico" rel="shortcut icon" type="image/vnd.microsoft.icon">
<!-- For third-generation iPad with high-resolution Retina display: -->
<link href="/assets/apple-touch-icon-144x144.png" rel="apple-touch-icon-precomposed" sizes="144x144">
<!-- For iPhone with high-resolution Retina display: -->
<link href="/assets/apple-touch-icon-114x114.png" rel="apple-touch-icon-precomposed" sizes="114x114">
<!-- For first- and second-generation iPad: -->
<link href="/assets/apple-touch-icon-72x72.png" rel="apple-touch-icon-precomposed" sizes="72x72">
<!-- For non-Retina iPhone, iPod Touch, and Android 2.1+ devices: -->
<link href="/assets/apple-touch-icon-57x57.png" rel="apple-touch-icon-precomposed">
<script async="" src="//www.google-analytics.com/analytics.js"></script><script>
  (function(){if(navigator.use

  Obtain a dataframe with the link and the date of each of the tribunal decisions.

In [6]:

# Scrape the current results page, then browse to the next one

# Number of result pages to scrape in this run; raise it to cover more of the site
MAX_PAGES = 4

wait = WebDriverWait(driver, 10)

# Accumulates the (link, date) tuples returned by getData for every visited page
decisions = []

for _ in range(MAX_PAGES):
    # Getting current URL source code
    get_source = driver.page_source
    # Scrape the data of this page and append it to the running list
    decisions += getData(get_source)

    # Click on next page; stop when no clickable "next" link shows up in time
    try:
        next_link = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'next_page')))
        next_link.click()

    except TimeoutException:
        break

# Convert the list of tuples into pandas Dataframe
df = pd.DataFrame(decisions, columns = ['Link', 'Date'])

df

Unnamed: 0,Link,Date
0,jr-01652-2020,2021-09-02
1,pa-02841-2020,2021-08-26
2,dc-00058-2020,2021-08-26
3,pa-03026-2019-d6bc81a0-1d79-46b6-a248-07ad0d28...,2021-08-25
4,pa-01403-2020,2021-08-25
...,...,...
114,hu-00794-2020,2021-08-03
115,ea-03919-2019,2021-08-03
116,ea-02814-2019,2021-08-03
117,dc-00010-2020,2021-08-03


Iterate over all the links in the dataframe. For each link:

1. Check that the website replies with HTTP status 200
2. Scrape the detailed data
3. Download the file

In [None]:

# Loop through all the decision links and open them one by one

# Only the exception actually used is imported (no star import)
from selenium.common.exceptions import StaleElementReferenceException

# The 'Link' column holds the identifier that follows /utiac/ in the decision URL
DECISION_BASE_URL = "https://tribunalsdecisions.service.gov.uk/utiac/"

for link in df['Link']:
    # Reuse the browser opened earlier in the notebook (`driver`)
    driver.get(DECISION_BASE_URL + link)
    # Scrap here
    time.sleep(3)

# Wait helper that polls once per second and tolerates elements going stale
wait = WebDriverWait(driver, 10, poll_frequency=1, ignored_exceptions=[StaleElementReferenceException])

# TODO: replace the placeholder locator with a real XPath before enabling this line;
# the placeholder text is not a valid locator, so the wait could never succeed.
# element = wait.until(EC.element_to_be_clickable((By.XPATH, "xPath that you want to click")))

In [83]:
# TODO: parse each detailed page (this cannot be done via JSON). Use regex to find the
# data of interest, build the JSON, and download the files.

import urllib.request
import wget

url = "https://tribunalsdecisions.service.gov.uk/utiac/pa-11571-2019"

# Prefix shared by every decision document link
DOC_BASE_URL = "https://moj-tribunals-documents-prod.s3.amazonaws.com/decision/doc_file/"

# Folder where the downloaded documents are stored
RAW_DATA_DIR = '/Users/albertamurgopacheco/Documents/GitHub/TFM/data/raw'

# The date has to be a string: written bare, 2021-11-17 is integer subtraction (= 1993)
PARAMS = {
    "Publication date:": "2021-11-17"
}
response = requests.get(url = url, params = PARAMS)

print(response)

# if response status code is 200 OK, then
if response.status_code == 200:
    # page HTML as text
    data = response.text

    # One capture group, so findall returns the document paths directly
    docPaths = re.findall(r'<a class="doc-file" href="https://moj-tribunals-documents-prod.s3.amazonaws.com/decision/doc_file/(.*?)"', data)

    if docPaths:
        doc = DOC_BASE_URL + docPaths[0]
        print(docPaths[0])
        print(doc)
        # Download doc file
        filename = wget.download(doc, RAW_DATA_DIR)

        # Report the document's content type. The original line read it from an
        # undefined name `r` (presumably a response for the document URL left over
        # from a deleted cell); a HEAD request gets the header without
        # downloading the file a second time.
        print(requests.head(doc).headers.get('content-type'))
    else:
        # Pages without a document link used to fail with IndexError
        print("No doc-file link found in " + url)

    

<Response [200]>
73188/PA115712019.doc
https://moj-tribunals-documents-prod.s3.amazonaws.com/decision/doc_file/73188/PA115712019.doc
application/msword


In [65]:
# find_element(By.XPATH, ...) replaces find_element_by_xpath, which was removed
# from Selenium in 4.3; By is imported at the top of the notebook.

# Click dropdown menu for language selection
driver.find_element(By.XPATH, "/html/body/div[1]/div[1]/nav/div[4]").click()

# Select language EN
driver.find_element(By.XPATH, "/html/body/div[1]/div[1]/nav/div[4]/div/div[1]").click()

NoSuchWindowException: Message: Browsing context has been discarded


In [None]:
# Set the PERIOD "START" date by clicking through the date picker.
# Every element clicked lives under the same container, so only the tail of the
# XPath changes from one step to the next.
START_PICKER = "/html/body/div[1]/div[3]/div/div/div[1]/div[2]/div/div[2]/div/div[1]/div/div[1]"

for step in (
    "/div[1]/input",           # Click on PERIOD time selection "START"
    "/div[2]/header/span[2]",  # Click on month selection
    "/div[3]/span[1]",         # Click on January
    "/div[2]/div/span[13]",    # Click on 1st
):
    # find_element(By.XPATH, ...) replaces find_element_by_xpath (removed in Selenium 4.3)
    driver.find_element(By.XPATH, START_PICKER + step).click()


In [None]:
# Set the PERIOD "END" date by clicking through the date picker.
# Every element clicked lives under the same container, so only the tail of the
# XPath changes from one step to the next.
END_PICKER = "/html/body/div[1]/div[3]/div/div/div[1]/div[2]/div/div[2]/div/div[1]/div/div[4]"

for step in (
    "/div[1]/input",           # Click on PERIOD time selection "END"
    "/div[2]/header/span[2]",  # Click on month selection
    "/div[3]/header/span[2]",  # Click on year
    "/div[4]/span[2]",         # Click on 2021
    "/div[3]/span[12]",        # Click on December
    "/div[2]/div/span[41]",    # Click on 31st
):
    # find_element(By.XPATH, ...) replaces find_element_by_xpath (removed in Selenium 4.3)
    driver.find_element(By.XPATH, END_PICKER + step).click()


In [None]:
# Click on More Races to get the full list on the screen & Wait for Visibility of Races
MORE_RACES_BUTTON = "/html/body/div[1]/div[3]/div/div/div[2]/main/div/button"

for _ in range(43):
    # find_elements returns an empty list instead of raising when the button is
    # absent, which implements the "check whether the button exists" guard that
    # the original comment asked for.
    buttons = driver.find_elements(By.XPATH, MORE_RACES_BUTTON)
    if not buttons:
        break
    buttons[0].click()
    time.sleep(5)

In [None]:
# Pause first so the page can settle, then capture the current page source
# (sleeping after the capture cannot change what was captured)
time.sleep(2)
get_source = driver.page_source

In [None]:
# Parse the captured page with BeautifulSoup so later cells can query it.
# The parser is named explicitly: the generic 'html' feature makes BeautifulSoup
# guess (and warn); lxml is the parser installed by this notebook's setup.
soup = BeautifulSoup(get_source, 'lxml')

In [None]:
# Race names: the text found between the opening and closing tags of each
# <h5 data-v-f3c4ac1c="" class="itra-green"> heading in the page source
racesList = re.compile(r'(?<=<h5 data-v-f3c4ac1c="" class="itra-green">)(.*?)(?=</h5>)').findall(get_source)

# Show how many names were found, followed by the names themselves
print(len(racesList), racesList, sep="\n")

In [None]:
# Collect the href of every <a> tag with CSS class "card ontop" that has an href
links = list(map(lambda tag: tag['href'], soup.find_all('a', "card ontop", href=True)))

# Show how many links were found, followed by the links themselves
print(len(links), links, sep="\n")

In [None]:
# Every <span class="icon-text-grey icon-bold"> value, in page order
myList = re.compile(r'(?<=<span class="icon-text-grey icon-bold">)(.*?)(?=</span>)').findall(get_source)

# The values are read as repeating triples: distance, elevation gain, elevation loss
distancesList, gainList, lossList = (myList[offset::3] for offset in range(3))

# One count per list, in the order distance / gain / loss
for metricValues in (distancesList, gainList, lossList):
    print(len(metricValues))

# Race dates come from the grey <span> with the inline style matched below
datesList = re.compile(r'(?<=<span data-v-f3c4ac1c="" class="itra-grey" style="margin-top: 0.2rem; margin-left: 0.2rem; margin-right: 2rem; font-size: 80%;">)(.*?)(?=</span>)').findall(get_source)
print(len(datesList))

# Loop
# Visit race page in Itra
# Scraping www, place, topology, number of participants

In [None]:
# Number of finishers per race, e.g. the 370 in
# <span class="icon-finisher icon-bold">370</span>
finishersList = re.compile(r'(?<=<span class="icon-finisher icon-bold">)(.*?)(?=</span>)').findall(get_source)
print(finishersList)

# Cell output: how many finisher values were found
len(finishersList)

In [None]:
# Combine the six scraped lists position by position into one tuple per race;
# zip() stops at the shortest list
list_of_tuples = [
    *zip(racesList, links, distancesList, gainList, lossList, datesList)
]

# One row per race, one column per scraped attribute
df = pd.DataFrame(
    list_of_tuples,
    columns=['Name', 'Link', 'Distance', 'Gain', 'Loss', 'Date'],
)
df



In [None]:
# TODO: the data must be saved in a suitable format: JSON? Or CSV directly, since the
# assignment statement requires CSV.

# Convert results to json and save the file
# NOTE(review): `sample` is not defined in any visible cell of this notebook, so this
# cell fails with NameError on a fresh kernel; confirm which object should be written.
with open('result.json', 'w') as fp:
    json.dump(sample, fp)

In [None]:





# NOTE(review): iList, iiList and iiList_of_tuples are not defined in any visible
# cell of this notebook. These appear to be debugging leftovers; confirm where the
# names come from, or remove the cell.
print(iList)
print(iiList)
print(iiList_of_tuples)

In [None]:
# Now we tackle the data on each race's specific page

# List with race websites
websitesList = []

# List with race place and country
locationList = []

# List containing the table for each website (as a list)
tableList = []

for i in tqdm(links):
    print(i)
    # Using selenium, open firefox window with the ITRA website
    driver2 = webdriver.Firefox()

    try:
        # Loading the page is inside the try so a page-load timeout is handled too
        driver2.get(i)
        time.sleep(3)

        # Click dropdown menu for language selection
        # (find_element(By.XPATH, ...) replaces find_element_by_xpath, removed in Selenium 4.3)
        driver2.find_element(By.XPATH, "/html/body/div[1]/div[1]/nav/div[4]").click()

        # Select language EN
        driver2.find_element(By.XPATH, "/html/body/div[1]/div[1]/nav/div[4]/div/div[1]").click()

        time.sleep(15)

        # Getting current URL source code
        get_source2 = driver2.page_source

    except TimeoutException:
        # if link is broken, skip it and move on to the next one
        print("Time out exception.")
        continue

    finally:
        # Always shut this browser down, also when the link is skipped or an
        # unexpected error is raised; otherwise one Firefox per link stays open.
        driver2.quit()

    # if you reach this point, the link is valid, and you can 'do stuff' on the page

    # Parse the race page; the parser is named explicitly so BeautifulSoup does not guess
    soup2 = BeautifulSoup(get_source2, 'lxml')

    # List containing the race website (when available) and "facebook", "twitter" and other info
    hrefList = [a['href'] for a in soup2.find_all('a', {'rel': "ugc"}, href=True)]

    # Remove the links that contain "facebook" or "twitter" or "@"
    hrefList = [x for x in hrefList if not any(token in x for token in ("facebook", "twitter", "@"))]

    print(len(hrefList))
    print(hrefList)
    websitesList.append(hrefList)

    # Scrape the table with additional data: data labels (first) and content
    labelsList  = re.findall(r'(?<=<div class="colinforace1">)(.*?)(?=</div>)', get_source2)
    contentList = re.findall(r'(?<=<div class="colinforace2 mbb">)(.*?)(?=</div>)', get_source2)

    labels_content_list_of_tuples = list(zip(labelsList[:13], contentList[:13]))
    print(labels_content_list_of_tuples)
    tableList.append(labels_content_list_of_tuples)

    # Scrape the location: text of the first <p>, or None when the page has no <p>
    # (keeps the three result lists the same length instead of raising AttributeError)
    firstParagraph = soup2.find('p')
    location = firstParagraph.getText() if firstParagraph is not None else None
    print(location)
    locationList.append(location)

    # Check scraped data is saved in a manner that can be directly matched
    print(len(websitesList))
    print(len(locationList))
    print(len(tableList))

print(websitesList)

In [None]:
from urllib.request import urlopen

# Download the site's robots.txt and print it as UTF-8 text
with urlopen("https://itra.run/robots.txt") as stream:
    print(str(stream.read(), encoding="utf-8"))

In [None]:
# `stream` is the response object bound by the `with urlopen(...)` block in the
# previous cell; this prints the object itself, not the robots.txt text.
print(stream)