In [1]:
import re, os, glob, csv, math, requests, time, sys, random, json, datetime, urllib, nltk
import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from sklearn.utils import shuffle      
import numpy as np
from nordvpn_switcher import initialize_VPN,rotate_VPN,terminate_VPN
import multiprocessing as mp
from tqdm import tqdm
from webscraping_kit import write_htmlfile, write_json_tofile, write_driverhtmlfile, rmnl
from webscraping_kit import read_jsoncsv, read_htmlfile, read_htmlresponse, read_driverresponse, get_cases
from string import digits
from stop_words import get_stop_words
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.util import everygrams
from nltk.tokenize import sent_tokenize
from htmlparsing_kit import *

def update(*a):
    """Advance the notebook-level progress bar ``pbar`` by one step.

    Used as the ``callback`` of ``pool.apply_async``; whatever positional
    arguments the pool passes in are accepted and ignored.
    """
    pbar.update()
    
def get_errortype(x):
    """Map a raw error string to a coarse error tag.

    Checks are applied in this order, first match wins:
      * contains "404 Error"                  -> 'not_found'
      * starts with 'ERROR_Request failed'    -> 'failed_request'
      * starts with 'ERROR_Timed out waiting' -> 'timed_out'
      * equals 'no_error'                     -> 'no_error'
      * anything else                         -> 'parsing_error'
    """
    # The substring test comes first, so a failed request that mentions a 404
    # is still reported as 'not_found'.
    if "404 Error" in x:
        return 'not_found'

    prefix_tags = (
        ('ERROR_Request failed', 'failed_request'),
        ('ERROR_Timed out waiting', 'timed_out'),
    )
    for prefix, label in prefix_tags:
        if x.startswith(prefix):
            return label

    return 'no_error' if x == 'no_error' else 'parsing_error'

def get_cleanjudloc(texts,ldict,jdict):
    """Build one "<cid>_<ldict result>|<cid>_<jdict result>" string per input text.

    texts : iterable of strings whose part before the first '_' is the case id.
    ldict, jdict : lookup dicts, each forwarded to ``map_cleanjl`` together with
        the full text. ``map_cleanjl`` is not defined in this notebook
        (presumably it comes from htmlparsing_kit); it must return a string.

    Returns a list with one entry per text, in input order.
    """
    tagged = []
    for text in texts:
        cid = text.split('_',1)[0].strip()
        # ldict lookup first, jdict lookup second: the output order is fixed.
        pieces = [cid + '_' + map_cleanjl(text,lookup) for lookup in (ldict,jdict)]
        tagged.append('|'.join(pieces))
    return tagged

def parse_parallelresults(jlresults):
    """Turn worker output strings into a ``{cid: value}`` dict.

    Each input string is a '|'-separated list of "<cid>_<payload>" parts.
    The key is the text before the first '_' of the whole string. The value is
    the stripped, non-empty payloads (text after the first '_' of each part)
    re-joined with '|'. A later string with the same cid overwrites an earlier
    one.

    The original ran ``re.sub('|', '', part)`` on every payload. The unescaped
    '|' is an empty alternation that matches only the empty string, so the
    call changed nothing; and because the parts are produced by ``split('|')``
    they can never contain a pipe anyway. The call is dropped here.
    """
    parsed = {}
    for raw in jlresults:
        payloads = (part.split('_',1)[-1].strip() for part in raw.split('|'))
        cid = raw.split('_',1)[0].strip()
        parsed[cid] = '|'.join(p for p in payloads if p)
    return parsed

def preprocess_list(textchunk,stopwordslist,pattern):
    """Preprocess every text in a chunk, keeping its case-id prefix.

    Each input is expected to look like "<cid>_<text>". The full string
    (prefix included) is handed to ``preprocess`` — not defined in this
    notebook, presumably from htmlparsing_kit — and the result is returned as
    "<cid>_<preprocessed>". Output order follows input order.
    """
    return [
        text.split('_',1)[0].strip() + '_' + preprocess(text,stopwordslist,pattern)
        for text in textchunk
    ]

def process_parallel(chunks,functiontype,ldict,jdict,stopwordslist,pattern,pbar):
    """Run one worker task per chunk on a 10-process pool and wait for all of them.

    Parameters
    ----------
    chunks : sequence of lists of "<cid>_<text>" strings; one pool task per list.
    functiontype : "get_cleanjudloc" runs ``get_cleanjudloc(chunk, ldict, jdict)``;
        any other value runs ``preprocess_list(chunk, stopwordslist, pattern)``.
    ldict, jdict : lookup dicts, used only in "get_cleanjudloc" mode.
    stopwordslist, pattern : used only in the preprocessing mode.
    pbar : progress bar exposing ``update()``; ticked once per task that
        finishes successfully.

    Returns
    -------
    list of ``AsyncResult`` in submission order. Call ``.get()`` on each to
    obtain the worker's list (or to re-raise the worker's exception).

    Fixes over the previous version: the ``chunks`` argument is actually used
    (the global ``textchunks`` was indexed before), the progress callback
    ticks the ``pbar`` that was passed in rather than a global, and the pool
    is closed and joined even if submitting a task raises.
    """
    if functiontype == "get_cleanjudloc":
        worker, extra_args = get_cleanjudloc, (ldict, jdict)
    else:
        worker, extra_args = preprocess_list, (stopwordslist, pattern)

    def _tick(_result):
        # Runs in the parent process when a task completes successfully.
        pbar.update()

    pool = mp.Pool(processes=10)
    jlresults = []
    try:
        for chunk in chunks:
            jlresults.append(
                pool.apply_async(worker, args=(chunk,) + extra_args, callback=_tick)
            )
    finally:
        # close() stops new submissions; join() blocks until queued tasks finish.
        pool.close()
        pool.join()
    return jlresults

def check_membertype(member):
    """Return a role tag for the first known role phrase found in ``member``.

    Matching is case-insensitive substring search, tried in the order listed
    below; the first phrase that occurs wins (so any "vice cha..." spelling
    beats "member", and "member" beats the officer roles). If no phrase
    occurs the result is 'missing' (no leading underscore).
    """
    lowered = member.lower()
    rules = (
        ('vice chair', '_vicechair'),
        ('vice-chair', '_vicechair'),
        ('vice char', '_vicechair'),   # misspelling seen in the data
        ('vice cha', '_vicechair'),    # truncated spelling
        ('member', '_member'),
        ('dispute resolution officer', '_disputeresolutionofficer'),
        ('hearing officer', '_hearingofficer'),
        ('hearings officer', '_hearingofficer'),
        ('associate chair', '_associatechair'),
    )
    return next((tag for phrase, tag in rules if phrase in lowered), 'missing')

# ---- Paths and the case-file index (everything is relative to the working dir) ----
mainpath = os.getcwd() + '/'
metadataoutfile = mainpath + 'output/3_metadata.csv'
metadatamissingoutfile = mainpath + 'output/3_metadata_missing.csv'
df = pd.read_csv(mainpath + 'output/2_casefileidx.csv',sep='\t')[
    ['cid','year','source','caseurl','goodcasefiles']
]

# ---- Text-preprocessing resources ----
porter = PorterStemmer()
remove_digits = str.maketrans('', '', digits)
stopwordslist = create_stopwordslist()
# NOTE(review): the stop words are joined into the alternation unescaped; if the
# list can contain regex metacharacters, wrap each word in re.escape — confirm.
stopword_alternation = r'|'.join(stopwordslist)
pattern = re.compile(r'\b(' + stopword_alternation + r')\b\s*')
nltk.download('punkt')
nltk.download('wordnet')

# ---- Case HTML files, chunked with divide_chunks(..., 300) ----
goodhtmlfiles = df['goodcasefiles'].tolist()
print(len(goodhtmlfiles))

htmlchunks = list(divide_chunks(goodhtmlfiles,300))
print(len(htmlchunks))
# Sanity check: flattening the chunks should give back the original count.
print(len([path for chunk in htmlchunks for path in chunk]))

36499
122
36499


[nltk_data] Downloading package punkt to /home/bunds/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/bunds/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/bunds/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/bunds/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
""" using parallel processing to extract the metadata from the htmls"""
# NOTE(review): this extraction step is commented out. The next cell loads
# `metadataoutfile` with read_jsoncsv, so presumably this was run once and its
# output file is being reused — confirm before relying on Restart & Run All.
# pool = mp.Pool(processes=10)
# pbar = tqdm(htmlchunks)
    
# for i in range(pbar.total):
#     pool.apply_async(read_htmlfiles, args=(htmlchunks[i],metadataoutfile), callback=update)
    
# pool.close()
# pool.join()

In [2]:
# Load the per-case metadata records and attach each case's URL and HTML path.
metadicts = read_jsoncsv(metadataoutfile)
print(len(metadicts))
mdf = pd.DataFrame(metadicts)
# mdf['tablefileno'] = mdf['cid'].map(dict(zip(df['cid'],df['fileno'])))
mdf['caseurl'] = mdf['cid'].map(dict(zip(df['cid'],df['caseurl'])))
mdf['file'] = mdf['cid'].map(dict(zip(df['cid'],df['goodcasefiles'])))
# Empty / whitespace-only cells and NaNs all become the sentinel 'missing'.
mdf.replace(r'^\s*$', np.nan, regex=True,inplace=True)
mdf.fillna('missing',inplace=True)
print(f"There should be {df.shape[0]} cases.")
print(f"There are {mdf.shape[0]} cases.")

# Flag index rows whose cid has no metadata record.
df['missingcases'] = np.where(df['cid'].isin(mdf['cid']),'present','missing')
print(df['missingcases'].value_counts())

mdf['errtype'] = mdf['error'].apply(get_errortype)
print(mdf['errtype'].value_counts())
# The two parsing_error cases are in French: ['2020canlii122218','2021canlii114717']

# Cases that timed out or whose request failed; each gets a path under
# data/errorcases/ and is paired with its case URL in caselinkdict.
missing = mdf[mdf['errtype'].isin(['timed_out','failed_request'])].copy()
missing['file'] = missing['cid'].apply(lambda cid: mainpath + 'data/errorcases/'+cid + '.html')
caselinkdict = dict(zip(missing['file'],missing['caseurl']))
print(len(caselinkdict))

36499
There should be 36499 cases.
There are 36499 cases.
present    36499
Name: missingcases, dtype: int64
no_error          36417
not_found            36
timed_out            32
failed_request       12
parsing_error         2
Name: errtype, dtype: int64
44


In [None]:
# NOTE(review): disabled re-scrape of the cases in `caselinkdict` (the timed_out /
# failed_request cases collected in the previous cell) through a rotating VPN.
# Presumably it was run once and the downloaded pages are reused — confirm.
# settings = initialize_VPN(save=0,area_input=['complete rotation'],skip_settings=1)

# rotate_VPN(settings,google_check=1)
# get_cases(caselinkdict,settings)
# terminate_VPN(settings)

In [None]:
# NOTE(review): disabled re-parse of the re-scraped files into `metadatamissingoutfile`;
# the next cell reads that file, so it presumably already exists on disk — confirm.
# read_htmlfiles(list(caselinkdict.keys()),metadatamissingoutfile)

In [3]:
# Load the metadata parsed from the re-scraped error cases and normalise it the
# same way as the main metadata frame.
metadictsmissing = read_jsoncsv(metadatamissingoutfile)
print(len(metadictsmissing))
mdfm = pd.DataFrame(metadictsmissing)
# mdf['tablefileno'] = mdf['cid'].map(dict(zip(df['cid'],df['fileno'])))
mdfm['caseurl'] = mdfm['cid'].map(dict(zip(missing['cid'],missing['caseurl'])))
mdfm['file'] = mdfm['cid'].map(dict(zip(missing['cid'],missing['file'])))
mdfm.replace(r'^\s*$', np.nan, regex=True,inplace=True)
mdfm.fillna('missing',inplace=True)
mdfm['errtype'] = mdfm['error'].apply(get_errortype)

# Replace the timed_out / failed_request rows of mdf with their re-scraped versions.
# ignore_index=True gives metadf a fresh 0..n-1 index: without it the rows keep the
# labels of their source frames, so mdfm's labels collide with mdf's and any later
# index-aligned operation on metadf becomes ambiguous.
metadf = pd.concat(
    [mdfm, mdf[~mdf['errtype'].isin(['timed_out','failed_request'])]],
    ignore_index=True,
)
print(metadf.shape)
print(metadf['errtype'].value_counts())

44
(36499, 13)
no_error         36461
not_found           36
parsing_error        2
Name: errtype, dtype: int64


In [4]:
# Attach the source label from the case index, then show the counts per source
# and per (errtype, source) pair.
metadf['source'] = metadf['cid'].map(dict(zip(df['cid'],df['source'])))
print(metadf['source'].value_counts(), end='\n\n')
print(metadf[['errtype','source']].value_counts())
# Sanity check: tablecaseidx.csv should have 36,437 rows; minus 1 for the case from
# 2005 and minus 2 for the cases missing from the canlii tables gives 36,434.
# (both) 33139 + (webscrape) 3295 = 36434

both         33139
webscrape     3295
api             65
Name: source, dtype: int64

errtype        source   
no_error       both         33117
               webscrape     3293
               api             51
not_found      both            20
               api             14
               webscrape        2
parsing_error  both             2
dtype: int64


In [5]:
# Lookup from office markers (office code, street address, office name) to a
# location label. Insertion order is kept exactly as before.
ldict = {
    'Central-RO': 'Central_Mississauga',
    '3 Robert Speck Pkwy': 'Central_Mississauga',
    'Central Regional Office': 'Central_Mississauga',
    'Eastern-RO': 'Eastern_Ottawa',
    '255 Albert St': 'Eastern_Ottawa',
    'Eastern Regional Office': 'Eastern_Ottawa',
    'Head Office': 'HeadOffice_Toronto',
    '777 Bay St': 'HeadOffice_Toronto',
    'Northern-RO': 'Northern_Sudbury',
    '199 Larch St': 'Northern_Sudbury',
    'Northern Regional Office': 'Northern_Sudbury',
    'South West-RO': 'SouthWest_London',
    '150 Dufferin Ave': 'SouthWest_London',
    'South West Regional Office': 'SouthWest_London',
    'Southern-RO': 'Southern_Hamilton',
    'Southern Regional Office': 'Southern_Hamilton',
    '119 King St': 'Southern_Hamilton',
    'Toronto East-RO': 'TOEast',
    'Toronto East Regional Office': 'TOEast',
    '2275 Midland Ave': 'TOEast',
    'Toronto North-RO': 'TONorth',
    'Toronto North Regional Office': 'TONorth',
    '47 Sheppard Ave': 'TONorth',
    'Toronto South-RO': 'TOSouth',
    'Toronto South Regional Office': 'TOSouth',
    '25 Grosvenor St': 'TOSouth',
    '15 Grosvenor St': 'TOSouth',
    '79 St. Clair Ave': 'TOSouth',
}

# Judge-name lookup from the annotation file: 'judge' -> 'jnew', skipping rows
# whose 'jnew' is the sentinel 'missing'.
ajdf = pd.read_csv(mainpath+'output/annotated_judges.csv',sep='\t')
ajdf = ajdf[ajdf['jnew'].ne('missing')]
jdict = dict(zip(ajdf['judge'],ajdf['jnew']))

In [6]:
# Prefix every case text with its cid ("<cid>_<text>"). The pool workers
# (get_cleanjudloc / preprocess_list) recover the cid by splitting on the first '_'.
metadf['jltext'] = metadf['cid'] + '_' + metadf['text']

In [7]:
# Split the cid-prefixed texts into chunks with divide_chunks(..., 100); the
# progress bar below reports 365 chunks.
textchunks = list(divide_chunks(metadf['jltext'].tolist(),100))
# The bar wraps textchunks only to get its total; it is advanced through update()
# calls rather than by iteration. The `update` callback defined at the top of the
# notebook reads this global name `pbar`.
pbar = tqdm(textchunks)

  0%|                                                   | 0/365 [00:00<?, ?it/s]

In [8]:
""" using parallel processing to preprocess the case texts"""
# functiontype "preprocess_list" sends each chunk to preprocess_list with
# stopwordslist and pattern; ldict and jdict are passed only to fill the signature.
# The return value is a list of pending AsyncResult handles, resolved in the next cell.
ctresults = process_parallel(textchunks,"preprocess_list",ldict,jdict,stopwordslist,pattern,pbar)

 99%|████████████████████████████████████████▊| 363/365 [00:34<00:00, 17.77it/s]

In [9]:
# Resolve every AsyncResult and flatten the per-chunk lists into one list of
# "<cid>_<cleantext>" strings.
ctresults = [entry for handle in ctresults for entry in handle.get()]
ctdict = parse_parallelresults(ctresults)
print(len(ctdict))

# Map the cleaned text back onto the frame by cid; blanks and NaNs become 'missing'.
metadf['cleantext'] = metadf['cid'].map(ctdict)
metadf.replace(r'^\s*$', np.nan, regex=True,inplace=True)
metadf.fillna('missing',inplace=True)
print(metadf[metadf['cleantext']=='missing'].shape)

36499


100%|█████████████████████████████████████████| 365/365 [00:50<00:00, 17.77it/s]

(0, 16)


In [10]:
# Re-chunk the cid-prefixed texts for the judge/location pass (same call as for
# the preprocessing pass).
textchunks = list(divide_chunks(metadf['jltext'].tolist(),100))
# A fresh bar is created here; the previous one was already advanced by the
# preprocessing pass.
pbar = tqdm(textchunks)


100%|█████████████████████████████████████████| 365/365 [00:59<00:00,  6.15it/s][A


In [11]:
""" using parallel processing to extract judge names and locations"""
# Dispatch one get_cleanjudloc task per chunk, then resolve the handles and
# flatten the per-chunk lists into one list of "<cid>_<loc>|<cid>_<judge>" strings.
jlresults = process_parallel(textchunks,"get_cleanjudloc",ldict,jdict,stopwordslist,pattern,pbar)
jlresults = [entry for handle in jlresults for entry in handle.get()]
jldict = parse_parallelresults(jlresults)
print(len(jldict))


  0%|                                         | 1/365 [00:12<1:15:35, 12.46s/it][A
  1%|▏                                          | 2/365 [00:16<47:03,  7.78s/it][A
  1%|▎                                          | 3/365 [00:17<26:32,  4.40s/it][A
  1%|▍                                          | 4/365 [00:18<19:32,  3.25s/it][A
  1%|▌                                          | 5/365 [00:19<14:48,  2.47s/it][A
  2%|▊                                          | 7/365 [00:21<08:54,  1.49s/it][A
  2%|▉                                          | 8/365 [00:24<12:14,  2.06s/it][A
  3%|█▏                                        | 10/365 [00:24<07:24,  1.25s/it][A
  3%|█▎                                        | 11/365 [00:31<14:36,  2.48s/it][A
  3%|█▍                                        | 12/365 [00:33<14:19,  2.44s/it][A
  4%|█▌                                        | 14/365 [00:37<12:17,  2.10s/it][A
  4%|█▋                                        | 15/365 [00:38<11:19,  1.94

36499


In [13]:
# Attach the combined "<loc>|<judge>" string per case, then split it into a
# location column and a member (judge) column.
metadf['judgeloc'] = metadf['cid'].map(jldict)
metadf.replace(r'^\s*$', np.nan, regex=True,inplace=True)
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: the chained in-place form does not update metadf under pandas
# Copy-on-Write, and the .split('|') calls below would then fail on NaN.
metadf['judgeloc'] = metadf['judgeloc'].fillna('')

# Build the set of location labels once instead of rebuilding list(ldict.values())
# for every token of every row.
locnames = set(ldict.values())
metadf['member'] = metadf['judgeloc'].apply(
    lambda x: '_'.join([i for i in x.split('|') if i not in locnames]))
metadf['loc'] = metadf['judgeloc'].apply(
    lambda x: '_'.join([i for i in x.split('|') if i in locnames]))

# Blanks and NaNs become the sentinel 'missing' before counting the gaps.
metadf.replace(r'^\s*$', np.nan, regex=True,inplace=True)
metadf.fillna('missing',inplace=True)
print(metadf[metadf['member']=='missing'].shape)
print(metadf[metadf['loc']=='missing'].shape)

(6261, 17)
(3487, 17)


In [25]:
# Classify the signing member's role from the second half of each case text.
# x[len(x)//2:] selects the same slice as x[-math.ceil(len(x)/2):], including
# for the empty string.
metadf['membertype'] = metadf['text'].apply(lambda x: check_membertype(x[len(x)//2:]))
metadf.fillna('missing',inplace=True)
print(metadf[metadf['membertype']=='missing'].shape)

(3022, 19)


In [26]:
# Build a cid -> fileno lookup from both case indexes. API rows are used only for
# cids that do not appear in the table index.
apidf = pd.read_csv(mainpath + 'output/1_apicaseidx.csv',sep='\t')
webdf = pd.read_csv(mainpath + 'output/1_tablecaseidx.csv',sep='\t')
apidf = apidf[~apidf['cid'].isin(webdf['cid'])]
apiwebdf = pd.concat([frame[['cid','fileno']] for frame in (apidf, webdf)])
filenodict = dict(zip(apiwebdf['cid'],apiwebdf['fileno']))

metadf['fileno'] = metadf['cid'].map(filenodict)
metadf.fillna('missing',inplace=True)
print(metadf[metadf['fileno']=='missing'].shape)

(0, 19)


In [28]:
# Write the cleaned metadata as a tab-separated file, every field quoted, without
# the index column.
metadf.to_csv(mainpath + 'output/3_metaclean.csv',sep='\t',index=False,quoting=csv.QUOTE_ALL)