In [1]:
import re, os, glob, csv, math, requests, time, sys, random, json, datetime, urllib#, dryscrape
import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from sklearn.utils import shuffle      
import numpy as np
from nordvpn_switcher import initialize_VPN,rotate_VPN,terminate_VPN
from selenium import webdriver 
from webdriver_manager.chrome import ChromeDriverManager
# https://github.com/canlii/API_documentation/blob/master/EN.md

""" custom functions:"""
from webscraping_kit import write_htmlfile, write_json_tofile, write_driverhtmlfile
from webscraping_kit import read_jsoncsv, read_htmlfile, read_htmlresponse
from webscraping_kit import build_queries, query_canlii_api, save_canlii_response, parse_caseid, parse_canlii_responses, format_canlii_caseurls
from webscraping_kit import parse_tabledata, get_tablerescount, get_tabledata
from webscraping_kit import rmnl, get_apikeys, get_origin
""" 
------------------------------------------------------------------------------------------------------------------
webscraping functions: parse_tabledata(soups,casefolder), get_tablerescount(soup), get_tabledata(tablelink,tablefile)
------------------------------------------------------------------------------------------------------------------
canlii api functions: build_queries(canlii_key,start,end), query_canlii_api(queries,apioutfile,headers), save_canlii_response(canliiapiresponse,file), 
                      parse_caseid(bscanliiapiresponse), parse_canlii_responses(apioutfile,casefolder), format_canlii_caseurls(row)
------------------------------------------------------------------------------------------------------------------
other functions: get_origin(row,tablecids)
"""     

# Every project path hangs off the directory the notebook is started from.
mainpath = os.getcwd() + '/'
tablefolder = f"{mainpath}data/tables/"   # saved yearly listing tables (one .html per year)
casefolder = f"{mainpath}data/cases/"     # handed to the parsing helpers as the case output folder

# The CanLII API key and the request headers are read from apikeys.txt rather
# than being written into the notebook.
apikeys_path = f"{mainpath}apikeys.txt"
canlii_key = get_apikeys(apikeys_path, 'canlii_key')
headers = get_apikeys(apikeys_path, 'headers')

<h2> 1.  create an index of cases to collect (2 ways) </h2>

<h3> a) webscraping CanLII </h3>
- Create the index by web-scraping CanLII's year-by-year listing pages for Landlord and Tenant Board (ONLTB) decisions. <br>

<h4> i. get total count of cases for tribunal and note date </h4>

In [2]:
# Read the total number of ONLTB decisions that CanLII reports, taken from the
# Ontario landing page (the table of legislation, courts, boards and tribunals
# that carries per-tribunal case counts).
canliilaw_casecountfile = mainpath + 'data/1_canliicasecount.html'
canliilawlink = "https://www.canlii.org/en/on/"

# The page was downloaded once and saved; the original fetch is kept for reference:
# canliilawresponse = requests.get(canliilawlink,headers=headers)
# write_htmlfile(canliilawresponse,canliilaw_casecountfile)
# print(canliilawresponse)
# canliilawsoup = bs(canliilawresponse.text,'html.parser')
canliilawsoup = read_htmlfile(canliilaw_casecountfile)

# One <div> per tribunal; keep those whose text names the Landlord and Tenant Board.
tribunal_rows = canliilawsoup.find_all('div', {'class': 'row row-stripped py-1 ml-0 tribunalRow'})
onltb_rows = [row for row in tribunal_rows if 'Landlord and Tenant Board' in row.text]

# The first match is the ONLTB row (IndexError if the page layout changed and
# nothing matched). Split its text on newlines and drop the blank fragments.
onltb_lines = onltb_rows[0].text.split("\n")
stripped_lines = (line.strip() for line in onltb_lines)
canliionltb_casecount = [line for line in stripped_lines if line]

print(canliionltb_casecount) #['ONLTB', 'Landlord and Tenant Board', '2009 -', '2022-01-06', '36,437']
# Record when the count was read so it can be compared with later collections.
print(datetime.datetime.now())

# ['ONLTB', 'Landlord and Tenant Board', '2009 -', '2022-01-06', '36,437']
# 2022-01-07 15:49:47.217678

['ONLTB', 'Landlord and Tenant Board', '2009 -', '2022-01-06', '36,437']
2022-01-08 09:52:38.914691


<h4> ii. save yearly tables of links to each case </h4>

In [3]:
# One CanLII listing page per decision year, 2006 through 2021 inclusive.
tablelinkprefix = "https://www.canlii.org/en/on/onltb/nav/date/"
tablelinkyears = [f"{tablelinkprefix}{year}/" for year in range(2006, 2022)]

# tdf: listing url -> year -> local file the page is saved to.
tdf = pd.DataFrame({'tablelink': tablelinkyears})
# The year is the last path segment before the trailing slash.
tdf['year'] = [int(link.split('/')[-2]) for link in tdf['tablelink']]
tdf['tablefile'] = [tablefolder + str(year) + '.html' for year in tdf['year']]

# url -> output file lookup that drives the download loop.
tabledict = {link: outfile for link, outfile in zip(tdf['tablelink'], tdf['tablefile'])}
print(len(tabledict))

16


In [None]:
# Download every yearly listing page through a rotating NordVPN connection.
# This cell performs network I/O and sleeps 10 s per page, so it is slow; the
# saved files under tablefolder are what the rest of the notebook reads.
settings = initialize_VPN(save=0,area_input=['complete rotation'],skip_settings=1)

try:
    # rotate_VPN(settings,google_check=1)
    rotate_VPN(settings)

    # k is the listing url, v the local file it is saved to (see tabledict).
    for j,(k,v) in enumerate(tabledict.items()):
        get_tabledata(k,v)
        print('sleeping for 10 seconds...')
        time.sleep(10)  # pause between requests
        print(j)
        print('resuming...')

    # 2021 was missing after the first pass and was re-fetched on its own with:
    # rotate_VPN(settings)
    # get_tabledata(tablelinkprefix + '2021/', tablefolder + '2021.html')
finally:
    # Always close the VPN session, even when a download raises part-way
    # through; otherwise the machine is left connected to the VPN.
    terminate_VPN(settings)

In [4]:
# Parse the saved yearly tables into one case index (tydf), compare its size
# with the total CanLII reports, and write the index to disk.

# glob.glob returns matches in arbitrary (filesystem) order; sort so the files
# are parsed in the same order on every run and on every machine.
tablefiles = sorted(glob.glob(tablefolder + '*.html'))

# [path, parsed page] pairs, the form parse_tabledata is called with.
soups = [[f, read_htmlfile(f)] for f in tablefiles]

tydf = parse_tabledata(soups,casefolder)
tydf['table_result_count'] = tydf['table_result_count'].astype(int)

# Three numbers that should agree: the count shown on CanLII's landing page,
# the sum of the per-table result counts (each table counted once), and the
# number of distinct case ids actually parsed.
print('There should be '+ str(re.sub(',','',canliionltb_casecount[-1]).strip()) + ' cases.')
print('The sum of all result counts: '+ str(tydf[['tablefile','table_result_count']].drop_duplicates()['table_result_count'].sum()))
print('There are '+ str(len(set(tydf['cid'].tolist()))) + ' cases.')

# Attach the listing url each row came from, then fix the column order for export.
tydf['tableurl'] = tydf['tablefile'].map(dict(zip(tdf['tablefile'],tdf['tablelink'])))
tydf = tydf[['cid','fileno','year','tablefile','tableurl','table_result_count','caseoutfile','caseurl']]
tydf.to_csv(mainpath + 'output/1_tablecaseidx.csv',sep='\t',index=False,quoting=csv.QUOTE_ALL)

There should be 36437 cases.
The sum of all result counts: 36435
There are 36435 cases.


<h3> b) using the CanLII API </h3>
- Create the index and the list of individual case URLs by querying the CanLII API. <br>

In [5]:
# Build the CanLII API queries for 2006 through 2021 (16 were produced in the
# recorded run, the same count as the yearly listing tables).
queries = build_queries(canlii_key,2006,2021)
print(len(queries))
# Show one query as a sanity check, but mask the api_key value first: cell
# output is saved with the notebook, so printing the raw url would store the
# secret in the file and in version control.
print(re.sub(r'(api_key=)[^&\s]+', r'\g<1>CANLII_API_KEY', str(queries[-1])))

# File the API responses are saved to; the download itself was run once and is
# left commented out so re-running the notebook does not hit the API again.
apioutfile = mainpath + 'data/1_apicaseidx.csv'
# query_canlii_api(queries,apioutfile,headers)

16
https://api.canlii.org/v1/caseBrowse/en/onltb/?offset=0&resultCount=10000&decisionDateBefore=2022-01-01&decisionDateAfter=2020-12-31&api_key=CANLII_API_KEY

In [6]:
# Turn the saved API responses into a case index (tapidf) and report its size.
tapidf = parse_canlii_responses(apioutfile, casefolder)
api_index_shape = tapidf.shape
print(api_index_shape)

(33205, 6)


In [7]:
# Compare the distinct case ids found by each collection method.
table_cid_set = set(tydf['cid'].tolist())
api_cid_set = set(tapidf['cid'].tolist())

tablecids = list(table_cid_set)
print(len(tablecids))
apicids = list(api_cid_set)
print(len(apicids))

# Ids present in one collection but absent from the other.
only_webscraped = table_cid_set - api_cid_set
only_api = api_cid_set - table_cid_set
print("There are "+str(len(only_webscraped))+" cases collected by webscraping, not available through the canlii api.")
print("There are "+str(len(only_api))+" cases collected by api, not available through webscraping.")

36435
33205
There are 3295 cases collected by webscraping, not available through the canlii api.
There are 65 cases collected by api, not available through webscraping.


In [8]:
# Write the API-derived case index as a tab-separated file with every field quoted.
apicaseidx_outfile = mainpath + 'output/1_apicaseidx.csv'
tapidf.to_csv(
    apicaseidx_outfile,
    sep='\t',
    index=False,
    quoting=csv.QUOTE_ALL,
)

<h2> 2. merge results from both collections </h2>

In [9]:
# Bring both indexes to the same four columns, then stack them and drop exact
# duplicate rows so a case found by both methods appears once.
shared_columns = ['cid','year','caseoutfile','caseurl']

# Re-root every scraped case url on the canonical ONLTB document prefix,
# keeping only what follows the first '/doc/'.
tydf['caseurl'] = [
    "https://www.canlii.org/en/on/onltb/doc/" + url.split('/doc/',1)[-1]
    for url in tydf['caseurl']
]
tydf = tydf[shared_columns]
# Leave out the rows whose url points at a 2005 document.
is_2005_doc = tydf['caseurl'].str.contains('/doc/2005/2005canlii')
tydf = tydf[~is_2005_doc]

tapidf = tapidf[shared_columns]
# NOTE(review): this single case id is excluded without a recorded reason - confirm why.
keep_api_row = tapidf['cid'] != '2011canlii92502'
tapidf = tapidf[keep_api_row]

df = pd.concat([tydf, tapidf]).drop_duplicates()
print(df.shape)

# Sanity check: each source index should be fully represented in the merged frame.
for source_index in (tapidf, tydf):
    print(source_index.shape)
    print(df[df['cid'].isin(source_index['cid'].tolist())].shape)

(36499, 4)
(33204, 4)
(33204, 4)
(36434, 4)
(36434, 4)


In [11]:
# Label every merged case with where it was found: both collections,
# webscraping only, or the API only.
tablecids = list(set(tydf['cid'].tolist()))
apicids = list(set(tapidf['cid'].tolist()))
# Test membership against a set: checking each of the ~36k table ids against
# the ~33k-element apicids *list* is quadratic. Same elements, same order.
apicid_lookup = set(apicids)
tablematches = [x for x in tablecids if x in apicid_lookup]

# Interim label: 'both' when the id was found by both methods, otherwise 'one'.
df['source'] = np.where(df['cid'].isin(tablematches),'both','one')

# NOTE(review): get_origin receives the whole row (including the interim
# 'source' value above) plus the scraped ids, and presumably resolves 'one'
# into 'webscrape' or 'api' - confirm against webscraping_kit before removing
# the interim assignment.
df['source'] = df.apply(lambda x: get_origin(x,tablecids),axis = 1)
print(df.shape)
print(df['source'].value_counts())
"""(36499, 5)
both         33139
webscrape     3295
api             65"""

(36499, 5)
both         33139
webscrape     3295
api             65
Name: source, dtype: int64


In [12]:
# Write the merged, source-labelled case index as a tab-separated file with every field quoted.
caseidx_outfile = mainpath + 'output/1_caseidx.csv'
df.to_csv(
    caseidx_outfile,
    sep='\t',
    index=False,
    quoting=csv.QUOTE_ALL,
)