# Scraping UK asylum data (25 October 2021)

This notebook scraps the tribunal decision's data related to asylum applications from https://tribunalsdecisions.service.gov.uk/

The script mac_env.sh should be run from the terminal to install the necessary libraries and to activate the environment.

In [1]:
import requests
from bs4 import BeautifulSoup
import time
import re
import json
import pickle
import pandas as pd
import whois
import sys
import datetime
from tqdm import tqdm
from datetime import timedelta
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import *
from selenium.common.exceptions import TimeoutException
import urllib.request
import wget
import concurrent.futures
import tqdm

import sys
IN_COLAB = 'google.colab' in sys.modules


# What environment am I using?
print(f'Current environment: {sys.executable}')

# Change the current working directory
os.chdir('/Users/albertamurgopacheco/Documents/GitHub/TFM')
# What's my working directory?
print(f'Current working directory: {os.getcwd()}')


Current environment: /Users/albertamurgopacheco/anaconda3/envs/tfm/bin/python
Current working directory: /Users/albertamurgopacheco/Documents/GitHub/TFM


In [2]:
# Define working directories in colab and local execution

if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/gdrive')
    docs_path = '/content/gdrive/MyDrive/TFM/data/raw'
    input_path = '/content/gdrive/MyDrive/TFM'
    output_path = '/content/gdrive/MyDrive/TFM/output'

else:
    docs_path = './data/raw'
    input_path = '.'
    output_path = './output'

Function to scrap the data in the UIATC landing page using beautifulSoup.

In [3]:
def getData(htmlSource):
    """
    getData gets the mouse over links to the tribunal decisions & the dates

    :param htmlSource: Source HTML for the page 
    :return: data as a list of tuples
    """
    
    # Scraping tribunal decision names with BeautifulSoup
    soup = BeautifulSoup(htmlSource, 'html')
    
    # SCRAPING MOUSE OVER LINKS
    linksList = re.findall(r'(<a href="/utiac/(.*?)">)', htmlSource)
    linksList = [i[1] for i in linksList]
    linksList = list(set(linksList))
    #print("Number of links:",len(linksList))
    #print(linksList)
    
    # SCRAPING DATES

    # Find by class the dates and store in list
    datesList = list(soup.find_all("td", class_="date"))
    # Convert the list elements to string  
    datesList = [str(i) for i in datesList]
    # Slice the part of string including data (date format yyyy-mm-dd)
    datesList = [i[33:43] for i in datesList]
    #print(datesList)
    #print("Number of dates %s",len(datesList))

    # Assign data to tuples: # get the list of tuples from two lists and merge them by using zip()
    tuplesList = list(zip(linksList, datesList))
    
    return tuplesList

Create a function to obtain the data from the court's decision detailed page using beautifulSoup.     TO DO: 2. Try with different files make sure it works, 3. Capture exceptions and 204 responses. 4. Create function, 5. How am I storing the dicts? In a list? 5. Try function with just a few obs. https://stackoverflow.com/questions/20638006/convert-list-of-dictionaries-to-a-pandas-dataframe



In [4]:
def getDetailedData(url):
    """
    getDetailedData gets the detailed data linked to a tribunal decision
    and saves a doc file in /Users/albertamurgopacheco/Documents/GitHub/TFM/data/raw

    :param url: url (link) to the page containing the detailed info 
    :return: dictionary 
    """
    
    # START WITH URL
    try:
        response = requests.get(url = url)
        
        # if response status code is 200 OK, then
        if response.status_code == 200:
        # load the data
            data = response.text
            soup = BeautifulSoup(data, 'html')
    
            # Scrape the reference number
            refList = list(soup.find_all("h1"))
            # Convert the list elements to string  
            refList = [str(i) for i in refList]
            # Remove leading <h1> and trailing </h1>
            refList = [i.replace('</h1>', '') for i in refList]
            refList = [i.replace('<h1>', '') for i in refList]
            #print(refList)
    
            # Find the link (docLink) to the document
            lnk = re.findall(r'(<a class="doc-file" href="https://moj-tribunals-documents-prod.s3.amazonaws.com/decision/doc_file/(.*?)")', data)
            # Build link using the second element in regex result (list of tuples)
            docLink = "https://moj-tribunals-documents-prod.s3.amazonaws.com/decision/doc_file/" + lnk[0][1]
            # Download files to raw folder
            try:
                filename = wget.download(url = docLink, out = docs_path)
                downloaded = "Yes"
            # Handle download exceptions 
            except Exception as err:
                print("Could not download file {}".format(docLink))
                print(err)
                downloaded = "No"
                pass
     
            # Find detailed information
            res = [item.get_text() for item in soup.select("span")]
            # Remove \xa0 from strings
            res = [elem if '\xa0' not in elem else elem.replace('\xa0', '') for elem in res]
            # Remove trailing and leading spaces and \n
            res = [elem.strip() for elem in res]
            #print(res)

            # Split list of results into two lists (keys & values)
            keysList = res[::2] # Keys: Elements from res starting from 0 iterating by 2
            valuesList = res[1::2] # Values: Elements from res starting from 1 iterating by 2
            #print(keysList)
            #print(valuesList)
    
            # Create dictionary with results (resDict)
            zip_iterator = zip(keysList, valuesList)
            resDict = dict(zip_iterator)
    
            # Add reference number and link to document to the dictionary
            resDict["Document"] = docLink
            resDict["Reference"] = refList
            resDict["Download"] = downloaded
            resDict["File"] = lnk[0][1]
            #print(resDict)
            
        else:
            resDict = {"URL not working:": str(url)}
            print(f"URL not working: {url}")
            
    except requests.exceptions.RequestException as e:  # Capture exceptions
        print (e.response.text)
        raise SystemExit(e)   
    
    return resDict  

Now that we have defined all the necessary functions, we can open a browser and start scraping.

In [5]:
# Using selenium, open the tribunal decision's website in firefox
driver = webdriver.Firefox()
driver.get("https://tribunalsdecisions.service.gov.uk/")

# Getting current URL source code 
get_title = driver.title 
  
# Printing the title of the URL 
print(get_title) 
assert "Tribunal decisions" in driver.title

# Getting current URL source code 
get_source = driver.page_source
time.sleep(2)
#print(get_source)

Tribunal decisions


  Obtain a dataframe with the link and the date of each of the tribunal decisions. 
  1. GET DATES
 2. Build function that gets data per page
 3. Loop over 1667 pages and append the lists
 4. Build dataframe or iterate over each list to 1) open url 2) extract additional content, and 3) download file using class doc-file




In [6]:
# Scrape current page data and browse to next page

# List of tuples to store the results from getData()
a = []

#while True:
i=1
while i<1178:

    # Getting current URL source code 
    get_source = driver.page_source
    # Scrape the data
    b = getData(get_source)
    # Append list data b to list data a
    a += b
    i+=1
    
    # Click on next page
    try:
        delay = 15 # seconds
        #element_present = EC.presence_of_element_located((By.CLASS_NAME, 'next_page'))
        #WebDriverWait(driver, delay).until(element_present)
        
        wait = WebDriverWait(driver, delay)
        element = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'next_page')))
        element.click()
            # wait       
    except TimeoutException:
        print("Loading took too much time!")
        break


In [9]:
# List of links with the decision files
decisionLinks = [tple[0] for tple in a]

# Number of urls to crap detailed data
print(f'Number of urls to scrap detailed data from: {len(decisionLinks)}')

Number of urls to scrap detailed data from: 35286


Now, we scrape the detailed data on each tribunal decission.

In [47]:
# Create a list of urls from links
urls = [ "https://tribunalsdecisions.service.gov.uk/utiac/"+decision for decision in decisionLinks]
# Item 35014 crashes the loop
urls.pop(35014)

# List of dict where each dict contains scraped detailed data
scrapedList = []

# Scrap detailed data from all urls
for url in urls:
    scrapedItem = getDetailedData(url)
    #print(scrapedItem)
    scrapedList.append(scrapedItem)


Could not download file https://moj-tribunals-documents-prod.s3.amazonaws.com/decision/doc_file/51943/AA055522014.doc
HTTP Error 500: Internal Server Error
Could not download file https://moj-tribunals-documents-prod.s3.amazonaws.com/decision/doc_file/40081/IA083642010___IA083692010___IA083752010.DOC
HTTP Error 403: Forbidden


IndexError: list index out of range

In [62]:
# Attempted download item urls[35014] crashes the loop
# scraping remaining items

# Scrap detailed data from all urls
for url in urls[-245:]:
    scrapedItem = getDetailedData(url)
    #print(scrapedItem)
    scrapedList.append(scrapedItem)

In [79]:
# Number of scraped court decisions
print(f'A total of {len(scrapedList)} elements have been scraped')

# Number of documents scraped



A total of 35257 elements have been scraped


'https://tribunalsdecisions.service.gov.uk/utiac/2003-ukiat-7478'

Could not download file https://moj-tribunals-documents-prod.s3.amazonaws.com/decision/doc_file/51943/AA055522014.doc
HTTP Error 500: Internal Server Error
Could not download file https://moj-tribunals-documents-prod.s3.amazonaws.com/decision/doc_file/40081/IA083642010___IA083692010___IA083752010.DOC
HTTP Error 403: Forbidden

https://tribunalsdecisions.service.gov.uk/utiac/2003-ukiat-7478

urls[35014]

In [22]:
# Save as json
jsonData = json.dumps(scrapedList)
# Save jsonData as jsonFile in working directory
jsonFile = open("jsonData.json", "w")
jsonFile.write(jsonData)
jsonFile.close()

# Open json
parsed = json.loads(jsonData)
print(json.dumps(parsed[30000], indent = 4, sort_keys = True))
print(len(scrapedList))

{
    "Appellant name:": "",
    "Case title:": "",
    "Country:": "",
    "Document": "https://moj-tribunals-documents-prod.s3.amazonaws.com/decision/doc_file/42595/OA140542013.doc",
    "Download": "Yes",
    "Hearing date:": "24 Jul 2014",
    "Judges:": "",
    "Last updated on:": "5 Dec 2014",
    "Promulgation date:": "12 Aug 2014",
    "Publication date:": "5 Dec 2014",
    "Reference": [
        "OA/14054/2013"
    ],
    "Status of case:": "Unreported"
}
35004


In [23]:
# Save as pickle
with open('pickleData.pkl', 'wb') as f:
    pickle.dump(scrapedList, f, protocol = pickle.HIGHEST_PROTOCOL)

# Open pickle file
with open('pickleData.pkl', 'rb') as f:
    d = pickle.load(f)


In [10]:
import json
# Open jsonData file
jsonData_path = os.path.join(os.getcwd(), 'jsonData.json')
with open(jsonData_path) as json_file:
    data = json.load(json_file)
print(data[0])

{'Case title:': '', 'Appellant name:': '', 'Status of case:': 'Unreported', 'Hearing date:': '10 Aug 2021', 'Promulgation date:': '17 Sep 2021', 'Publication date:': '4 Oct 2021', 'Last updated on:': '4 Oct 2021', 'Country:': '', 'Judges:': '', 'Document': 'https://moj-tribunals-documents-prod.s3.amazonaws.com/decision/doc_file/73573/PA027032020.doc', 'Reference': ['PA/02703/2020'], 'Download': 'Yes'}
