# Web scraping to get some sentencing info
Using BeautifulSoup web scraping to get some details from https://www.sentencingcouncil.org.uk

In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
def get_soup(url):
    """Return Beatiful Soup object from web page"""
    response = requests.get(url)
    return BeautifulSoup(response.content, "html.parser")

def get_sentence_item(soup, identifier, tag = "div", index=-1):
    """Extract text from particular item in supplied soup object."""
    # Get text from page
    item = soup.find(tag, identifier).contents[index]
    # Remove: (1) leading/trailing spaces (2) tab chars 
    item = item.strip().replace("\t", "")
    return item

In [3]:
base_url = "https://www.sentencingcouncil.org.uk"

## Get Offence Descriptions and URLs
Can pre-populate offences with URLs before iterating over each one to extract details from associated pages.

In [4]:
page_soup = get_soup(base_url + "/offences/")
list_of_offence_links = page_soup.find("ul", {"class": "offences-filter-list"}).findAll("a")
offences = {link.string.strip(): {"path":link.get('href')} for link in list_of_offence_links}

#### Get some details for each offence extracted above

In [6]:
for description, detail in offences.items():
    print(description, " => ", detail.get("path"))
    page_url = base_url + detail.get("path")
    page_soup = get_soup(page_url)
    
    # Extract some details from each page
    ## act = page_soup.find("div", {"class": "offence-act"}).contents[-1]
    
    detail["act"] = get_sentence_item(page_soup, {"class": "offence-act"})
    detail["date"] = get_sentence_item(page_soup, {"class": "offence-effective-date"})
    
    # First paragraph on page has useful info
    info = page_soup.find("p").contents
    # Extract the text, avoiding any tags
    info_text = ", ".join([item for item in info if not str(item).startswith("<")])
    detail["info_text"] = info_text
    
    # Limit scope of run
    if description.startswith("B"):
        break

Abstracting electricity  =>  /offences/magistrates-court/item/abstracting-electricity
Abuse of position of trust: causing a child to watch a sexual act  =>  /offences/magistrates-court/item/abuse-of-position-of-trust-causing-a-child-to-watch-a-sexual-act
Abuse of position of trust: causing or inciting a child to engage in sexual activity  =>  /offences/magistrates-court/item/abuse-of-position-of-trust-causing-or-inciting-a-child-to-engage-in-sexual-activity
Abuse of position of trust: sexual activity in the presence of a child/ Abuse of position of trust: causing a child to watch a sexual act  =>  /offences/magistrates-court/item/abuse-of-position-of-trust-sexual-activity-in-the-presence-of-a-child
Abuse of position of trust: sexual activity with a child/ Abuse of position of trust: causing or inciting a child to engage in sexual activity  =>  /offences/magistrates-court/item/abuse-of-position-of-trust-sexual-activity-with-a-child
Administering a substance with intent  =>  /offences/ma

In [7]:
detail["info_text"]

', Social Security Administration Act 1992 (section 111A), , Tax Credits Act 2002 (section 35), , Theft Act 1968 (section 17), Triable either way, Maximum: 7 years’ custody, Offence range: Discharge – 6 years 6 months’ custody'

In [6]:
offences.get('Abstracting electricity')
offences.get('Benefit Fraud')

{'path': '/offences/magistrates-court/item/benefit-fraud',
 'act': 'Common law, Fraud Act 2006, s.1, Social Security Administration Act 1992, s.111A, Social Security Administration Act 1992, s.112, Tax Credits Act 2002, s.35, Theft Act 1968, s.17',
 'date': '1 October 2014',
 'info_text': ', Social Security Administration Act 1992 (section 111A), , Tax Credits Act 2002 (section 35), , Theft Act 1968 (section 17), Triable either way, Maximum: 7 years’ custody, Offence range: Discharge – 6 years 6 months’ custody'}

#### Using Pandas dataframe to display offence details more neatly

In [7]:
import pandas as pd
pd.set_option('display.max_colwidth', 80)
pd.set_option('display.max_rows', None)
df = pd.DataFrame()
df["Descriptions"] = offences.keys()
df["URLS"] = offences.values()
display(df)

Unnamed: 0,Descriptions,URLS
0,Abstracting electricity,"{'path': '/offences/magistrates-court/item/abstracting-electricity', 'act': ..."
1,Abuse of position of trust: causing a child to watch a sexual act,{'path': '/offences/magistrates-court/item/abuse-of-position-of-trust-causin...
2,Abuse of position of trust: causing or inciting a child to engage in sexual ...,{'path': '/offences/magistrates-court/item/abuse-of-position-of-trust-causin...
3,Abuse of position of trust: sexual activity in the presence of a child/ Abus...,{'path': '/offences/magistrates-court/item/abuse-of-position-of-trust-sexual...
4,Abuse of position of trust: sexual activity with a child/ Abuse of position ...,{'path': '/offences/magistrates-court/item/abuse-of-position-of-trust-sexual...
5,Administering a substance with intent,{'path': '/offences/magistrates-court/item/administering-a-substance-with-in...
6,Affray,"{'path': '/offences/magistrates-court/item/affray', 'act': 'Public Order Act..."
7,Alcohol sale offences (Revised 2017),{'path': '/offences/magistrates-court/item/alcohol-sale-offences-revised-201...
8,Animal cruelty (Revised 2017),"{'path': '/offences/magistrates-court/item/animal-cruelty-revised-2017', 'ac..."
9,Arranging or facilitating sexual exploitation of a child,{'path': '/offences/magistrates-court/item/arranging-or-facilitating-sexual-...
