In [154]:
# Imports

import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime

In [162]:

# Print the current local date and time so the saved output records when this run happened.
print(datetime.today())

2021-04-14 21:31:46.480494


### Let's scrape State Court data (past 3 months)!

In [46]:
# NOTE(review): `Court` is not defined or imported in any visible cell of this notebook,
# so this line fails on a fresh kernel — presumably it came from a deleted cell or an
# external module; confirm or remove.
# `supreme_court` is also re-bound further down to a BeautifulSoup object.
supreme_court = Court('supreme')

In [47]:
# NOTE(review): `pull_urls()` is a method of the `Court` object created in the previous
# cell; its definition is not visible here, so what it fetches/returns cannot be confirmed.
supreme_court.pull_urls()

<a id='step1'></a>
### 1) Fetch the State Court content by URL.


In [50]:
# Target State Court page:
url = "https://www.lawnet.sg/lawnet/web/lawnet/free-resources?p_p_id=freeresources_WAR_lawnet3baseportlet&p_p_lifecycle=0&p_p_state=normal&p_p_mode=view&p_p_col_id=column-1&p_p_col_pos=2&p_p_col_count=3&_freeresources_WAR_lawnet3baseportlet_action=subordinate"

# Establishing the connection to the web page.
# requests waits indefinitely by default, so give up after 30 seconds instead of hanging the kernel.
response = requests.get(url, timeout=30)

# You can use status codes to understand how the target server responds to your request.
# Ex., 200 = OK, 400 = Bad Request, 403 = Forbidden, 404 = Not Found.
print(response.status_code)

# Stop here on a 4xx/5xx answer rather than parsing an error page in the cells below.
response.raise_for_status()

# Pull the HTML string out of requests and convert it to a Python string.
html = response.text

200


<a id='step2'></a>
### 2) Parse the HTML document with Beautiful Soup.

This step lets us navigate and search the elements of the document by tag name and attributes (for example with `find()` and `find_all()`).

In [51]:
# Build a searchable tree from the State Court listing page, using the lxml parser.
state_court = BeautifulSoup(markup=html, features='lxml')

In [54]:
# Read the previously compiled State Court judgments and remember the title and
# date stored in the row labelled 0, which the scraping cell compares against.
court_full = pd.read_csv('../data/statecourt_compiled.csv')
last_entry = court_full.loc[0, 'title']
last_date = court_full.loc[0, 'date']
last_date

'05 Apr 2021'

In [151]:
# Collect the date, title, and link of the State Court judgments listed on LawNet,
# page by page, stopping as soon as we reach the judgment recorded in
# `last_entry` / `last_date` (row 0 of the compiled CSV).

# List to store results (one dict per judgment)
results_list = []

# Base URL of a judgment's content page; the document id is appended to it
domain = "https://www.lawnet.sg/lawnet/web/lawnet/free-resources?p_p_id=freeresources_WAR_lawnet3baseportlet&p_p_lifecycle=1&p_p_state=normal&p_p_mode=view&p_p_col_id=column-1&p_p_col_pos=2&p_p_col_count=3&_freeresources_WAR_lawnet3baseportlet_action=openContentPage&_freeresources_WAR_lawnet3baseportlet_docId="

# Get number of pages: keep only the digits found in the 'lastPageActive' element.
# NOTE(review): dropping the first "3" assumes that digit comes from the URL inside
# the element (as the original author states) — fragile; confirm against the live markup.
page = str(state_court.find_all('li', {'class': 'lastPageActive'}))
page = ''.join(filter(str.isdigit, page))
last_page = int(page.replace("3","",1))

# Becomes True once the already-compiled judgment is seen, which ends the scrape
reached_known_entry = False

# Create counter for current page
current_page = 1
# Loop until the last page, or until we hit the already-compiled judgment
while current_page <= last_page and not reached_known_entry:
    url1 = url+"&_freeresources_WAR_lawnet3baseportlet_page="+str(current_page)
    # Fetch the page; time out rather than hang, and fail loudly on an HTTP error
    # so a bad page is not silently skipped.
    response1 = requests.get(url1, timeout=30)
    response1.raise_for_status()
    state_court1 = BeautifulSoup(response1.text, 'lxml')
    # Get the relevant elements (date, name, link)
    for results_holder in state_court1.find_all('ul', {'class': 'searchResultsHolder'}):
        # Skip the first <li>, as before; slicing (unlike pop(0)) tolerates an empty list
        for element in results_holder.find_all('li')[1:]:
            date_tag = element.find('p', {'class': 'resultsDate'})
            a_href = element.find('a')
            # only store "full" rows of data
            if date_tag is None or a_href is None:
                continue
            date = date_tag.text
            title = a_href.text.strip()
            # Titles are stored stripped, so compare the stripped title
            if title == last_entry and date == last_date:
                reached_known_entry = True
                break
            # The href looks like javascript:viewContent('<docId>'); keep only the id
            link = str(a_href['href'])
            link = link.replace("javascript:viewContent","")
            link = link.strip("')(")
            results_list.append({'date': date, 'title': title, 'link': domain+link})
        if reached_known_entry:
            break
    # Raise page counter
    current_page += 1

# Name the columns explicitly so the frame still has them when nothing new was found
state_court_df = pd.DataFrame(results_list, columns=['date', 'title', 'link'])


In [12]:
# Keep only criminal cases, i.e. judgments whose title mentions "Public Prosecutor"
is_criminal_case = state_court_df['title'].str.contains("Public Prosecutor")
state_court_df = state_court_df[is_criminal_case]


<a id='supreme-step1'></a>
### 1) Fetch the Supreme Court content by URL.



In [13]:
# Target Supreme Court page:
url = "https://www.lawnet.sg/lawnet/web/lawnet/free-resources?p_p_id=freeresources_WAR_lawnet3baseportlet&p_p_lifecycle=0&p_p_state=normal&p_p_mode=view&p_p_col_id=column-1&p_p_col_pos=2&p_p_col_count=3&_freeresources_WAR_lawnet3baseportlet_action=supreme"

# Establishing the connection to the web page.
# requests waits indefinitely by default, so give up after 30 seconds instead of hanging the kernel.
response = requests.get(url, timeout=30)

# You can use status codes to understand how the target server responds to your request.
# Ex., 200 = OK, 400 = Bad Request, 403 = Forbidden, 404 = Not Found.
print(response.status_code)

# Stop here on a 4xx/5xx answer rather than parsing an error page in the cells below.
response.raise_for_status()

# Pull the HTML string out of requests and convert it to a Python string.
html = response.text

200


<a id='supreme-step2'></a>
### 2) Parse the HTML document with Beautiful Soup.

This step lets us navigate and search the elements of the document by tag name and attributes (for example with `find()` and `find_all()`).

In [14]:
# Build a searchable tree from the Supreme Court listing page, using the lxml parser.
supreme_court = BeautifulSoup(markup=html, features='lxml')

In [15]:
# Collect the date, title, and link of every Supreme Court judgment listed on
# LawNet, walking through all result pages.

# List to store results (one dict per judgment)
results_list = []

# Base URL of a judgment's content page; the document id is appended to it
domain = "https://www.lawnet.sg/lawnet/web/lawnet/free-resources?p_p_id=freeresources_WAR_lawnet3baseportlet&p_p_lifecycle=1&p_p_state=normal&p_p_mode=view&p_p_col_id=column-1&p_p_col_pos=2&p_p_col_count=3&_freeresources_WAR_lawnet3baseportlet_action=openContentPage&_freeresources_WAR_lawnet3baseportlet_docId="

# Get number of pages: keep only the digits found in the 'lastPageActive' element.
# NOTE(review): dropping the first "3" assumes that digit comes from the URL inside
# the element (as the original author states) — fragile; confirm against the live markup.
page = str(supreme_court.find_all('li', {'class': 'lastPageActive'}))
page = ''.join(filter(str.isdigit, page))
last_page = int(page.replace("3","",1))

# Visit every result page, from the first to the last
for current_page in range(1, last_page + 1):
    url1 = url+"&_freeresources_WAR_lawnet3baseportlet_page="+str(current_page)
    # Fetch the page; time out rather than hang, and fail loudly on an HTTP error
    # so a bad page is not silently skipped.
    response1 = requests.get(url1, timeout=30)
    response1.raise_for_status()
    supreme_court1 = BeautifulSoup(response1.text, 'lxml')
    # Get the relevant elements (date, name, link)
    for results_holder in supreme_court1.find_all('ul', {'class': 'searchResultsHolder'}):
        for element in results_holder.find_all('li'):
            date_tag = element.find('p', {'class': 'resultsDate'})
            a_href = element.find('a')
            # only store "full" rows of data: skip items lacking a date or a link
            # instead of raising AttributeError on a missing tag
            if date_tag is None or a_href is None:
                continue
            # The href looks like javascript:viewContent('<docId>'); keep only the id
            link = str(a_href['href'])
            link = link.replace("javascript:viewContent","")
            link = link.strip("')(")
            results_list.append({
                'date': date_tag.text,
                # stripped, to match how the State Court cell stores titles
                'title': a_href.text.strip(),
                'link': domain+link,
            })

# Name the columns explicitly so the frame still has them when no rows were found
supreme_court_df = pd.DataFrame(results_list, columns=['date', 'title', 'link'])

In [16]:
# Keep only criminal cases, i.e. judgments whose title mentions "Public Prosecutor"
is_criminal_case = supreme_court_df['title'].str.contains("Public Prosecutor")
supreme_court_df = supreme_court_df[is_criminal_case]

In [17]:
# Read the two compiled judgment databases (State Court and Supreme Court) from disk
state_compiled_path = '../data/statecourt_compiled.csv'
supreme_compiled_path = '../data/supremecourt_compiled.csv'

state_court_full = pd.read_csv(state_compiled_path)
supreme_court_full = pd.read_csv(supreme_compiled_path)


In [18]:
# Compare with compiled databases and only keep entries whose link is not stored yet
state_court_df = state_court_df[~state_court_df['link'].isin(state_court_full['link'])]
supreme_court_df = supreme_court_df[~supreme_court_df['link'].isin(supreme_court_full['link'])]

# Put the new entries on top of the compiled ones.
# concat keeps the row order, whereas merge(how='outer') sorts the join keys
# lexicographically; the cell that reads row 0 of the compiled State Court file as
# the "last" entry depends on the new rows staying first.
# The new rows were just filtered by link, so they cannot duplicate compiled rows.
if not state_court_df.empty:
    state_court_full = pd.concat([state_court_df, state_court_full], ignore_index=True)
if not supreme_court_df.empty:
    supreme_court_full = pd.concat([supreme_court_df, supreme_court_full], ignore_index=True)

In [13]:
# Write the new entries and the refreshed compiled databases to CSV, without the index column
exports = [
    (state_court_df, '../data/statecourt.csv'),
    (supreme_court_df, '../data/supremecourt.csv'),
    (state_court_full, '../data/statecourt_compiled.csv'),
    (supreme_court_full, '../data/supremecourt_compiled.csv'),
]
for frame, destination in exports:
    frame.to_csv(destination, index=False)

FileNotFoundError: [Errno 2] No such file or directory: '../data/statecourt_2021-04-13.csv'

In [12]:
# Print a summary of state_court_df: row count, columns, non-null counts and dtypes.
state_court_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48 entries, 1 to 71
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   date    48 non-null     object
 1   title   48 non-null     object
 2   link    48 non-null     object
dtypes: object(3)
memory usage: 1.5+ KB


---
<a id='follow-links'></a>
### Following Links for More Results

One hundred results is pretty good, but what if we want more? We need to follow the "next" links and find new pages to grab. With Scrapy, this is done in the **`parse()`** method of a spider class, which then needs to return another type of object: a request for the next page, in addition to the scraped items.

See [Stack Overflow](https://stackoverflow.com/questions/30152261/make-scrapy-follow-links-and-collect-data) for details!
