In [2]:
# Data manipulation
import pandas as pd
import numpy as np

In [3]:
# Webscraping
import glob
import requests
from itertools import chain
from tqdm import tqdm
import datetime

In [4]:
# Parsing and pre-processing
from glob import glob
import os 
import re

from pdfminer.high_level import extract_text
import pdfplumber
from langdetect import detect, DetectorFactory

In [5]:
import sys
sys.path.append(f'../../python')
from parser_commitments import parse_pdf_section, clean_parsed_pdf_section

In [6]:
pdf_dir = f"../../../data/pdfs/"
art8_1_pdf_list = glob(os.path.join(pdf_dir, "art8.1/*/*.pdf")) #36
art8_1_pdf_list.sort(key=lambda x: os.path.getctime(x))

art8_2_pdf_list = glob(os.path.join(pdf_dir, "art8.2/*/*.pdf")) #72
art8_2_pdf_list.sort(key=lambda x: os.path.getctime(x))

art8_3_pdf_list = glob(os.path.join(pdf_dir, "art8.3/*/*.pdf")) #14
art8_3_pdf_list.sort(key=lambda x: os.path.getctime(x))

art6_1_pdf_list = glob(os.path.join(pdf_dir, "art6.1/*/*.pdf"))
art6_1_pdf_list.sort(key=lambda x: os.path.getctime(x))

art6_0_pdf_list = glob(os.path.join(pdf_dir, "art6.0/*/*.pdf"))
art6_0_pdf_list.sort(key=lambda x: os.path.getctime(x))

art9_3_pdf_list = glob(os.path.join(pdf_dir, "art9.3/*/*.pdf"))
art9_3_pdf_list.sort(key=lambda x: os.path.getctime(x))


#### Article 8.1 (36 cases)

In [7]:
df_81=parse_pdf_section(art8_1_pdf_list)

100%|██████████| 47/47 [11:21<00:00, 14.50s/it]


In [8]:
df_81_clean = clean_parsed_pdf_section(df_81)

In [9]:
# save json file name
date = datetime.date.today().strftime('%Y_%m_%d')

file_name = f"../../../data/parsed/parsed_art8.1_{date}.json"
if os.path.exists(file_name):
    os.remove(file_name)

# save file as json
df_81_clean.to_json(file_name)

#### Article 8.2 (73 cases)

In [8]:
# create dataframe
df_82 = pd.DataFrame(art8_2_pdf_list, columns = ['pdf_file'])

df_82['id'] = df_82.index
df_82['batch'] = ((df_82['id'])/20).astype(int)

In [9]:
df_82['batch'].value_counts()

0    20
1    20
2    20
3    20
4    20
5    14
Name: batch, dtype: int64

In [10]:
# parse pdf
for i in range(0,6): # 0-5
    globals()[f'df_8.2_{i}'] = clean_parsed_pdf_section(parse_pdf_section(tqdm(df_82[df_82['batch']==i]['pdf_file'])))
    print("Batch {} completed".format(i))

    # save json file name
    date = datetime.date.today().strftime('%Y_%m_%d')

    file_name = f"../../../data/parsed/parsed_art8.2_batch{i}_{date}.json"
    if os.path.exists(file_name):
        os.remove(file_name)

    # save file as json
    globals()[f'df_8.2_{i}'].to_json(file_name)

100%|██████████| 20/20 [07:55<00:00, 23.80s/it]
100%|██████████| 20/20 [07:55<00:00, 23.79s/it]


Batch 0 completed


100%|██████████| 20/20 [11:55<00:00, 35.77s/it]
100%|██████████| 20/20 [11:55<00:00, 35.77s/it]


Batch 1 completed


100%|██████████| 20/20 [12:20<00:00, 37.04s/it]
100%|██████████| 20/20 [12:20<00:00, 37.04s/it]


Batch 2 completed


100%|██████████| 20/20 [09:50<00:00, 29.52s/it]
100%|██████████| 20/20 [09:50<00:00, 29.52s/it]


Batch 3 completed


100%|██████████| 20/20 [04:29<00:00, 13.45s/it]
100%|██████████| 20/20 [04:29<00:00, 13.45s/it]


Batch 4 completed


100%|██████████| 14/14 [5:02:04<00:00, 1294.59s/it]  
100%|██████████| 14/14 [5:02:04<00:00, 1294.59s/it]


Batch 5 completed


#### Article 8.3 (13 cases)

In [6]:
df_83 = parse_pdf_section(art8_3_pdf_list)

100%|██████████| 14/14 [11:28<00:00, 49.21s/it]


In [None]:
df_83_clean = clean_parsed_pdf_section(df_83)

In [13]:
# save json file name
date = datetime.date.today().strftime('%Y_%m_%d')

file_name = f"../../../data/parsed/parsed_art8.3_{date}.json"
if os.path.exists(file_name):
    os.remove(file_name)

# save file as json
df_83_clean.to_json(file_name)

#### Article 6.1(b)_cc (240 cases)

In [6]:
# create dataframe
df_61 = pd.DataFrame(art6_1_pdf_list, columns = ['pdf_file'])

df_61['id'] = df_61.index
df_61['batch'] = ((df_61['id'])/100).astype(int)

In [12]:
df_61['batch'].value_counts()

0    100
1    100
2    100
3     59
Name: batch, dtype: int64

In [7]:
# parse pdf
for i in range(0,4): # 0-3
    globals()[f'df_6.1_{i}'] = clean_parsed_pdf_section(parse_pdf_section(tqdm(df_61[df_61['batch']==i]['pdf_file'])))
    print("Batch {} completed".format(i))

    # save json file name
    date = datetime.date.today().strftime('%Y_%m_%d')

    file_name = f"../../../data/parsed/parsed_art6.1_batch{i}_{date}.json"
    if os.path.exists(file_name):
        os.remove(file_name)

    # save file as json
    globals()[f'df_6.1_{i}'].to_json(file_name)

100%|██████████| 100/100 [10:03<00:00,  6.03s/it]
100%|██████████| 100/100 [10:03<00:00,  6.03s/it]


Batch 0 completed


100%|██████████| 100/100 [13:46<00:00,  8.27s/it]
100%|██████████| 100/100 [13:46<00:00,  8.27s/it]


Batch 1 completed


100%|██████████| 100/100 [12:28<00:00,  7.48s/it]
100%|██████████| 100/100 [12:28<00:00,  7.48s/it]


Batch 2 completed


100%|██████████| 59/59 [06:51<00:00,  6.97s/it]
100%|██████████| 59/59 [06:51<00:00,  6.97s/it]

Batch 3 completed





#### Article 6(b)_uc (5734 cases)

In [14]:
# create dataframe
df_60 = pd.DataFrame(art6_0_pdf_list, columns = ['pdf_file'])

df_60['id'] = df_60.index
df_60['batch'] = ((df_60['id'])/100).astype(int)

In [15]:
df_60.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4908 entries, 0 to 4907
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   pdf_file  4908 non-null   object
 1   id        4908 non-null   int64 
 2   batch     4908 non-null   int32 
dtypes: int32(1), int64(1), object(1)
memory usage: 96.0+ KB


In [17]:
len(df_60['batch'].value_counts())

50

In [18]:
# parse pdf
for i in range(0,25): #0-24
    globals()[f'df_6.0_{i}'] = clean_parsed_pdf_section(parse_pdf_section(tqdm(df_60[df_60['batch']==i]['pdf_file'])))
    print("Batch {} completed".format(i))

    # save json file name
    date = datetime.date.today().strftime('%Y_%m_%d')

    file_name = f"../../../data/parsed/parsed_art6.0_batch{i}_{date}.json"
    if os.path.exists(file_name):
        os.remove(file_name)

    # save file as json
    globals()[f'df_6.0_{i}'].to_json(file_name)

100%|██████████| 100/100 [01:59<00:00,  1.20s/it]
100%|██████████| 100/100 [01:59<00:00,  1.20s/it]


Batch 0 completed


100%|██████████| 100/100 [02:24<00:00,  1.44s/it]
100%|██████████| 100/100 [02:24<00:00,  1.44s/it]


Batch 1 completed


100%|██████████| 100/100 [01:33<00:00,  1.07it/s]
100%|██████████| 100/100 [01:33<00:00,  1.07it/s]


Batch 2 completed


100%|██████████| 100/100 [05:01<00:00,  3.01s/it]
100%|██████████| 100/100 [05:01<00:00,  3.01s/it]


Batch 3 completed


100%|██████████| 100/100 [03:36<00:00,  2.17s/it]
100%|██████████| 100/100 [03:36<00:00,  2.17s/it]


Batch 4 completed


100%|██████████| 100/100 [03:39<00:00,  2.20s/it]
100%|██████████| 100/100 [03:39<00:00,  2.20s/it]


Batch 5 completed


100%|██████████| 100/100 [03:16<00:00,  1.96s/it]
100%|██████████| 100/100 [03:16<00:00,  1.96s/it]


Batch 6 completed


100%|██████████| 100/100 [02:10<00:00,  1.30s/it]
100%|██████████| 100/100 [02:10<00:00,  1.30s/it]


Batch 7 completed


100%|██████████| 100/100 [01:56<00:00,  1.17s/it]
100%|██████████| 100/100 [01:56<00:00,  1.17s/it]


Batch 8 completed


100%|██████████| 100/100 [00:53<00:00,  1.86it/s]
100%|██████████| 100/100 [00:53<00:00,  1.86it/s]


Batch 9 completed


100%|██████████| 100/100 [01:07<00:00,  1.49it/s]
100%|██████████| 100/100 [01:07<00:00,  1.49it/s]


Batch 10 completed


100%|██████████| 100/100 [02:19<00:00,  1.39s/it]
100%|██████████| 100/100 [02:19<00:00,  1.39s/it]


Batch 11 completed


100%|██████████| 100/100 [01:15<00:00,  1.33it/s]
100%|██████████| 100/100 [01:15<00:00,  1.33it/s]


Batch 12 completed


100%|██████████| 100/100 [01:14<00:00,  1.34it/s]
100%|██████████| 100/100 [01:14<00:00,  1.34it/s]


Batch 13 completed


100%|██████████| 100/100 [01:11<00:00,  1.40it/s]
100%|██████████| 100/100 [01:11<00:00,  1.40it/s]


Batch 14 completed


100%|██████████| 100/100 [02:45<00:00,  1.65s/it]
100%|██████████| 100/100 [02:45<00:00,  1.65s/it]


Batch 15 completed


100%|██████████| 100/100 [01:12<00:00,  1.38it/s]
100%|██████████| 100/100 [01:12<00:00,  1.38it/s]


Batch 16 completed


100%|██████████| 100/100 [01:08<00:00,  1.46it/s]
100%|██████████| 100/100 [01:08<00:00,  1.46it/s]


Batch 17 completed


100%|██████████| 100/100 [01:14<00:00,  1.34it/s]
100%|██████████| 100/100 [01:14<00:00,  1.34it/s]


Batch 18 completed


100%|██████████| 100/100 [02:20<00:00,  1.40s/it]
100%|██████████| 100/100 [02:20<00:00,  1.40s/it]


Batch 19 completed


100%|██████████| 100/100 [01:18<00:00,  1.27it/s]
100%|██████████| 100/100 [01:18<00:00,  1.27it/s]


Batch 20 completed


100%|██████████| 100/100 [01:15<00:00,  1.33it/s]
100%|██████████| 100/100 [01:15<00:00,  1.33it/s]


Batch 21 completed


100%|██████████| 100/100 [01:15<00:00,  1.32it/s]
100%|██████████| 100/100 [01:15<00:00,  1.32it/s]


Batch 22 completed


100%|██████████| 100/100 [02:18<00:00,  1.39s/it]
100%|██████████| 100/100 [02:18<00:00,  1.39s/it]


Batch 23 completed


100%|██████████| 100/100 [02:05<00:00,  1.26s/it]
100%|██████████| 100/100 [02:05<00:00,  1.26s/it]

Batch 24 completed





In [22]:
# parse pdf
for i in range(25,51): #25-50
    globals()[f'df_6.0_{i}'] = clean_parsed_pdf_section(parse_pdf_section(tqdm(df_60[df_60['batch']==i]['pdf_file'])))
    print("Batch {} completed".format(i))

    # save json file name
    date = datetime.date.today().strftime('%Y_%m_%d')

    file_name = f"../../../data/parsed/parsed_art6.0_batch{i}_{date}.json"
    if os.path.exists(file_name):
        os.remove(file_name)

    # save file as json
    globals()[f'df_6.0_{i}'].to_json(file_name)

100%|██████████| 100/100 [01:28<00:00,  1.13it/s]
100%|██████████| 100/100 [01:28<00:00,  1.13it/s]


Batch 25 completed


100%|██████████| 100/100 [01:53<00:00,  1.14s/it]
100%|██████████| 100/100 [01:53<00:00,  1.14s/it]


Batch 26 completed


100%|██████████| 100/100 [02:14<00:00,  1.35s/it]
100%|██████████| 100/100 [02:14<00:00,  1.35s/it]


Batch 27 completed


100%|██████████| 100/100 [01:11<00:00,  1.40it/s]
100%|██████████| 100/100 [01:11<00:00,  1.40it/s]


Batch 28 completed


100%|██████████| 100/100 [01:44<00:00,  1.04s/it]
100%|██████████| 100/100 [01:44<00:00,  1.04s/it]


Batch 29 completed


100%|██████████| 100/100 [01:22<00:00,  1.21it/s]
100%|██████████| 100/100 [01:22<00:00,  1.21it/s]


Batch 30 completed


100%|██████████| 100/100 [01:41<00:00,  1.02s/it]
100%|██████████| 100/100 [01:41<00:00,  1.02s/it]


Batch 31 completed


100%|██████████| 100/100 [01:45<00:00,  1.05s/it]
100%|██████████| 100/100 [01:45<00:00,  1.05s/it]


Batch 32 completed


100%|██████████| 100/100 [01:54<00:00,  1.14s/it]
100%|██████████| 100/100 [01:54<00:00,  1.14s/it]


Batch 33 completed


100%|██████████| 100/100 [01:45<00:00,  1.05s/it]
100%|██████████| 100/100 [01:45<00:00,  1.05s/it]


Batch 34 completed


100%|██████████| 100/100 [02:10<00:00,  1.31s/it]
100%|██████████| 100/100 [02:10<00:00,  1.31s/it]


Batch 35 completed


100%|██████████| 100/100 [02:19<00:00,  1.40s/it]
100%|██████████| 100/100 [02:19<00:00,  1.40s/it]


Batch 36 completed


100%|██████████| 100/100 [02:01<00:00,  1.22s/it]
100%|██████████| 100/100 [02:01<00:00,  1.22s/it]


Batch 37 completed


100%|██████████| 100/100 [01:38<00:00,  1.01it/s]
100%|██████████| 100/100 [01:38<00:00,  1.01it/s]


Batch 38 completed


100%|██████████| 100/100 [01:12<00:00,  1.38it/s]
100%|██████████| 100/100 [01:12<00:00,  1.38it/s]


Batch 39 completed


100%|██████████| 100/100 [01:57<00:00,  1.17s/it]
100%|██████████| 100/100 [01:57<00:00,  1.17s/it]


Batch 40 completed


100%|██████████| 100/100 [01:19<00:00,  1.25it/s]
100%|██████████| 100/100 [01:19<00:00,  1.25it/s]


Batch 41 completed


100%|██████████| 100/100 [02:05<00:00,  1.25s/it]
100%|██████████| 100/100 [02:05<00:00,  1.25s/it]


Batch 42 completed


100%|██████████| 100/100 [01:53<00:00,  1.13s/it]
100%|██████████| 100/100 [01:53<00:00,  1.13s/it]


Batch 43 completed


100%|██████████| 100/100 [02:57<00:00,  1.77s/it]
100%|██████████| 100/100 [02:57<00:00,  1.77s/it]


Batch 44 completed


100%|██████████| 100/100 [01:27<00:00,  1.15it/s]
100%|██████████| 100/100 [01:27<00:00,  1.15it/s]


Batch 45 completed


100%|██████████| 100/100 [02:33<00:00,  1.53s/it]
100%|██████████| 100/100 [02:33<00:00,  1.53s/it]


Batch 46 completed


100%|██████████| 100/100 [01:42<00:00,  1.03s/it]
100%|██████████| 100/100 [01:42<00:00,  1.03s/it]


Batch 47 completed


100%|██████████| 100/100 [02:17<00:00,  1.37s/it]
100%|██████████| 100/100 [02:17<00:00,  1.37s/it]


Batch 48 completed


100%|██████████| 8/8 [00:06<00:00,  1.17it/s]
100%|██████████| 8/8 [00:06<00:00,  1.17it/s]


Batch 49 completed


0it [00:00, ?it/s]
0it [00:00, ?it/s]

Batch 50 completed





#### Article 9.3 (129 cases)

In [11]:
# create dataframe
df_93 = pd.DataFrame(art9_3_pdf_list, columns = ['pdf_file'])

df_93['id'] = df_93.index
df_93['batch'] = ((df_93['id'])/100).astype(int)

In [12]:
df_93['batch'].value_counts()

0    100
1     92
Name: batch, dtype: int64

In [15]:
# parse pdf
for i in range(0,1): #0-1
    globals()[f'df_9.3_{i}'] = clean_parsed_pdf_section(parse_pdf_section(tqdm(df_93[df_93['batch']==i]['pdf_file'])))
    print("Batch {} completed".format(i))

    # save json file name
    date = datetime.date.today().strftime('%Y_%m_%d')

    file_name = f"../../../data/parsed/parsed_art9.3_batch{i}_{date}.json"
    if os.path.exists(file_name):
        os.remove(file_name)

    # save file as json
    globals()[f'df_9.3_{i}'].to_json(file_name)

100%|██████████| 100/100 [04:53<00:00,  2.93s/it]
100%|██████████| 100/100 [04:53<00:00,  2.93s/it]


Batch 0 completed


In [13]:
# parse pdf
for i in range(1,2): #0-1
    globals()[f'df_9.3_{i}'] = clean_parsed_pdf_section(parse_pdf_section(tqdm(df_93[df_93['batch']==i]['pdf_file'])))
    print("Batch {} completed".format(i))

    # save json file name
    date = datetime.date.today().strftime('%Y_%m_%d')

    file_name = f"../../../data/parsed/parsed_art9.3_batch{i}_{date}.json"
    if os.path.exists(file_name):
        os.remove(file_name)

    # save file as json
    globals()[f'df_9.3_{i}'].to_json(file_name)

100%|██████████| 92/92 [09:26<00:00,  6.15s/it]
100%|██████████| 92/92 [09:26<00:00,  6.15s/it]


Batch 1 completed


# Append all dataframes

In [23]:
import glob
import os 

path = f"../../../data/parsed/*.json"
files = glob.glob(path)

In [24]:
data_merged = pd.concat([pd.read_json(f) for f in files])

In [25]:
data_merged=data_merged.reset_index(drop=True)

In [26]:
# save json file name
date = datetime.date.today().strftime('%Y_%m_%d')

file_name = f"../../../data/processed/data_merged_{date}.json"
if os.path.exists(file_name):
    os.remove(file_name)

# save file as json
data_merged.to_json(file_name)

In [40]:
# no. of cases by article_txt
data_merged.groupby('article_62')['case_num'].nunique().sort_index()

article_62
None                            5074
inconjunctionwithart6(2)         145
inconjunctionwitharticle6(2)      17
Name: case_num, dtype: int64

In [37]:
# no. of cases by article_txt
data_merged.groupby('article_txt')['case_num'].nunique().sort_index()

article_txt
None                90
article17(2)         8
article21            1
article22            4
article22(1)         1
article22(3)        15
article232           1
article4             1
article4(4)         14
article6             5
article6(1)          9
article6(1)(a)       1
article6(1)(b)    4956
article6(2)         55
article6(4)          1
article7(3)         31
article8(1)         33
article8(2)         63
article8(3)         10
article8(4)          1
article9             3
article9(2)          1
article9(3)         24
article9(3)(b)       2
Name: case_num, dtype: int64

In [38]:
# no of documents by article_new
data_merged.groupby('article_new')['file'].nunique().sort_index()

article_new
None               129
article17(2)         8
article21            1
article22(1)         1
article232           1
article4             1
article6             5
article6(1)(a)       1
article6(1)(b)    4806
article6(2)        219
article6(4)          1
article7(3)         32
article8(1)         33
article8(2)         64
article8(3)         10
article8(4)          1
article9             3
article9(2)          1
referral            92
Name: file, dtype: int64

In [36]:
# no of rows by article
data_merged['article_new'].value_counts().sort_index()

None                469
article17(2)         45
article21             9
article22(1)          8
article232            4
article4              1
article6              5
article6(1)(a)        3
article6(1)(b)    11105
article6(2)        1888
article6(4)          14
article7(3)         170
article8(1)         257
article8(2)         821
article8(3)         105
article8(4)           1
article9             17
article9(2)           6
referral            875
Name: article_new, dtype: int64

In [39]:
# no of cases by year
data_merged.groupby('year')['case_num'].nunique().sort_index()

year
2004.0    111
2005.0    251
2006.0    310
2007.0    344
2008.0    267
2009.0    204
2010.0    224
2011.0    269
2012.0    198
2013.0    238
2014.0    259
2015.0    272
2016.0    318
2017.0    280
2018.0    349
2019.0    338
2020.0    322
2021.0    355
2022.0    301
2023.0      6
Name: case_num, dtype: int64