In [1]:
# Data visualization
import matplotlib.pyplot as plt 

# Data manipulation
import pandas as pd
import numpy as np
import csv
from zipfile import ZipFile

In [2]:
# Webscraping
import glob
import requests
from bs4 import BeautifulSoup
import time
import datetime
from pandas.core.common import flatten
import os
from itertools import chain
from tqdm import tqdm
import json
import urllib.request

In [3]:
# Parsing and pre-processing
from glob import glob
import os 
import re

from pdfminer.high_level import extract_text
import pdfplumber
from langdetect import detect, DetectorFactory

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from string import punctuation
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [4]:
import sys
sys.path.append(f'../../python')
from parser import parse_pdf, parse_pdf_section, clean_parsed_pdf_section

In [5]:
pdf_dir = f"../../../data/pdfs/"
art8_1_pdf_list = glob(os.path.join(pdf_dir, "art8.1/*/*.pdf")) #36
art8_1_pdf_list.sort(key=lambda x: os.path.getctime(x))

art8_2_pdf_list = glob(os.path.join(pdf_dir, "art8.2/*/*.pdf")) #72
art8_2_pdf_list.sort(key=lambda x: os.path.getctime(x))

art8_3_pdf_list = glob(os.path.join(pdf_dir, "art8.3/*/*.pdf")) #14
art8_3_pdf_list.sort(key=lambda x: os.path.getctime(x))

art6_1_pdf_list = glob(os.path.join(pdf_dir, "art6.1/*/*.pdf"))
art6_1_pdf_list.sort(key=lambda x: os.path.getctime(x))

art6_0_pdf_list = glob(os.path.join(pdf_dir, "art6.0/*/*.pdf"))
art6_0_pdf_list.sort(key=lambda x: os.path.getctime(x))

art9_3_pdf_list = glob(os.path.join(pdf_dir, "art9.3/*/*.pdf"))
art9_3_pdf_list.sort(key=lambda x: os.path.getctime(x))


### Fix parser by section (try)

In [6]:
df = parse_pdf_section(art8_1_pdf_list[0:2])

100%|██████████| 2/2 [00:09<00:00,  4.56s/it]


In [7]:
df_clean = clean_parsed_pdf_section(df)

In [8]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37 entries, 0 to 36
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   date          37 non-null     object
 1   year          37 non-null     object
 2   len_pdf       37 non-null     object
 3   article       37 non-null     object
 4   article_txt   37 non-null     object
 5   article_62    37 non-null     object
 6   case_num      37 non-null     object
 7   filename      37 non-null     object
 8   section_text  37 non-null     object
 9   bsn_act       37 non-null     object
 10  simp_text     37 non-null     object
 11  section       37 non-null     object
dtypes: object(12)
memory usage: 3.6+ KB


In [9]:
df_60 = parse_pdf_section(art6_0_pdf_list[0:2])

100%|██████████| 2/2 [00:00<00:00,  3.93it/s]


In [10]:
df_60_clean = clean_parsed_pdf_section(df_60)

In [12]:
df_60_clean

Unnamed: 0,date,year,len_pdf,article,article_txt,article_62,case_num,filename,section_text,bsn_act,simp_text,section
0,Date: 01/12/2020,2020,3,art6.0,article6(1)(b),,M.10011,art6.0\M.10011\m10011_48_3,,EUROPEAN COMMISSION \nDG Competition \n \n \n ...,: \n — ORIX is a multinational integrated fi...,
1,Date: 17/11/2020,2020,3,art6.0,article6(1)(b),,M.10000,art6.0\M.10000\m10000_41_3,,\n \nEUROPEAN COMMISSION \nDG Competition \n ...,: \n PreZero provides waste disposal and rec...,


In [None]:
#TODO add code to those that do not detect subheadings right

#### Article 8.1 (36 cases)

In [13]:
df_81=parse_pdf_section(art8_1_pdf_list)

100%|██████████| 47/47 [08:59<00:00, 11.49s/it]


In [14]:
df_81_clean = clean_parsed_pdf_section(df_81)

In [15]:
# save json file name
date = datetime.date.today().strftime('%Y_%m_%d')

file_name = f"../../../data/parsed/parsed_art8.1_{date}.json"
if os.path.exists(file_name):
    os.remove(file_name)

# save file as json
df_81_clean.to_json(file_name)

#### Article 8.2 (73 cases)

In [8]:
# create dataframe
df_82 = pd.DataFrame(art8_2_pdf_list, columns = ['pdf_file'])

df_82['id'] = df_82.index
df_82['batch'] = ((df_82['id'])/20).astype(int)

In [9]:
df_82['batch'].value_counts()

0    20
1    20
2    20
3    20
4    20
5    14
Name: batch, dtype: int64

In [10]:
# parse pdf
for i in range(0,6): # 0-5
    globals()[f'df_8.2_{i}'] = clean_parsed_pdf_section(parse_pdf_section(tqdm(df_82[df_82['batch']==i]['pdf_file'])))
    print("Batch {} completed".format(i))

    # save json file name
    date = datetime.date.today().strftime('%Y_%m_%d')

    file_name = f"../../../data/parsed/parsed_art8.2_batch{i}_{date}.json"
    if os.path.exists(file_name):
        os.remove(file_name)

    # save file as json
    globals()[f'df_8.2_{i}'].to_json(file_name)

100%|██████████| 20/20 [07:55<00:00, 23.80s/it]
100%|██████████| 20/20 [07:55<00:00, 23.79s/it]


Batch 0 completed


100%|██████████| 20/20 [11:55<00:00, 35.77s/it]
100%|██████████| 20/20 [11:55<00:00, 35.77s/it]


Batch 1 completed


100%|██████████| 20/20 [12:20<00:00, 37.04s/it]
100%|██████████| 20/20 [12:20<00:00, 37.04s/it]


Batch 2 completed


100%|██████████| 20/20 [09:50<00:00, 29.52s/it]
100%|██████████| 20/20 [09:50<00:00, 29.52s/it]


Batch 3 completed


100%|██████████| 20/20 [04:29<00:00, 13.45s/it]
100%|██████████| 20/20 [04:29<00:00, 13.45s/it]


Batch 4 completed


100%|██████████| 14/14 [5:02:04<00:00, 1294.59s/it]  
100%|██████████| 14/14 [5:02:04<00:00, 1294.59s/it]


Batch 5 completed


#### Article 8.3 (13 cases)

In [6]:
df_83 = parse_pdf_section(art8_3_pdf_list)

100%|██████████| 14/14 [11:28<00:00, 49.21s/it]


In [None]:
df_83_clean = clean_parsed_pdf_section(df_83)

In [13]:
# save json file name
date = datetime.date.today().strftime('%Y_%m_%d')

file_name = f"../../../data/parsed/parsed_art8.3_{date}.json"
if os.path.exists(file_name):
    os.remove(file_name)

# save file as json
df_83_clean.to_json(file_name)

#### Article 6.1(b)_cc (240 cases)

In [6]:
# create dataframe
df_61 = pd.DataFrame(art6_1_pdf_list, columns = ['pdf_file'])

df_61['id'] = df_61.index
df_61['batch'] = ((df_61['id'])/100).astype(int)

In [12]:
df_61['batch'].value_counts()

0    100
1    100
2    100
3     59
Name: batch, dtype: int64

In [7]:
# parse pdf
for i in range(0,4): # 0-3
    globals()[f'df_6.1_{i}'] = clean_parsed_pdf_section(parse_pdf_section(tqdm(df_61[df_61['batch']==i]['pdf_file'])))
    print("Batch {} completed".format(i))

    # save json file name
    date = datetime.date.today().strftime('%Y_%m_%d')

    file_name = f"../../../data/parsed/parsed_art6.1_batch{i}_{date}.json"
    if os.path.exists(file_name):
        os.remove(file_name)

    # save file as json
    globals()[f'df_6.1_{i}'].to_json(file_name)

100%|██████████| 100/100 [10:03<00:00,  6.03s/it]
100%|██████████| 100/100 [10:03<00:00,  6.03s/it]


Batch 0 completed


100%|██████████| 100/100 [13:46<00:00,  8.27s/it]
100%|██████████| 100/100 [13:46<00:00,  8.27s/it]


Batch 1 completed


100%|██████████| 100/100 [12:28<00:00,  7.48s/it]
100%|██████████| 100/100 [12:28<00:00,  7.48s/it]


Batch 2 completed


100%|██████████| 59/59 [06:51<00:00,  6.97s/it]
100%|██████████| 59/59 [06:51<00:00,  6.97s/it]

Batch 3 completed





#### Article 6(b)_uc (5734 cases)

In [6]:
# create dataframe
df_60 = pd.DataFrame(art6_0_pdf_list, columns = ['pdf_file'])

df_60['id'] = df_60.index
df_60['batch'] = ((df_60['id'])/50).astype(int)

In [7]:
df_60.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4908 entries, 0 to 4907
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   pdf_file  4908 non-null   object
 1   id        4908 non-null   int64 
 2   batch     4908 non-null   int32 
dtypes: int32(1), int64(1), object(1)
memory usage: 96.0+ KB


In [8]:
df_60['batch'].value_counts()

0     50
74    50
72    50
71    50
70    50
      ..
30    50
29    50
28    50
27    50
98     8
Name: batch, Length: 99, dtype: int64

In [10]:
len(df_60['batch'].value_counts())

99

In [9]:
# parse pdf
for i in range(0,100): #0-99
    globals()[f'df_6.0_{i}'] = clean_parsed_pdf_section(parse_pdf_section(tqdm(df_60[df_60['batch']==i]['pdf_file'])))
    print("Batch {} completed".format(i))

    # save json file name
    date = datetime.date.today().strftime('%Y_%m_%d')

    file_name = f"../../../data/parsed/parsed_art6.0_batch{i}_{date}.json"
    if os.path.exists(file_name):
        os.remove(file_name)

    # save file as json
    globals()[f'df_6.0_{i}'].to_json(file_name)

100%|██████████| 50/50 [00:44<00:00,  1.11it/s]
100%|██████████| 50/50 [00:44<00:00,  1.11it/s]


Batch 1 completed


100%|██████████| 50/50 [00:51<00:00,  1.02s/it]
100%|██████████| 50/50 [00:51<00:00,  1.02s/it]


Batch 2 completed


100%|██████████| 50/50 [00:31<00:00,  1.57it/s]
100%|██████████| 50/50 [00:31<00:00,  1.57it/s]


Batch 3 completed


100%|██████████| 50/50 [00:30<00:00,  1.65it/s]
100%|██████████| 50/50 [00:30<00:00,  1.65it/s]


Batch 4 completed


100%|██████████| 50/50 [00:30<00:00,  1.64it/s]
100%|██████████| 50/50 [00:30<00:00,  1.64it/s]


Batch 5 completed


100%|██████████| 50/50 [01:18<00:00,  1.56s/it]
100%|██████████| 50/50 [01:18<00:00,  1.56s/it]


Batch 6 completed


100%|██████████| 50/50 [02:01<00:00,  2.42s/it]
100%|██████████| 50/50 [02:01<00:00,  2.42s/it]


Batch 7 completed


100%|██████████| 50/50 [01:24<00:00,  1.69s/it]
100%|██████████| 50/50 [01:24<00:00,  1.69s/it]


Batch 8 completed


100%|██████████| 50/50 [00:42<00:00,  1.18it/s]
100%|██████████| 50/50 [00:42<00:00,  1.18it/s]


Batch 9 completed


100%|██████████| 50/50 [01:39<00:00,  2.00s/it]
100%|██████████| 50/50 [01:39<00:00,  2.00s/it]


Batch 10 completed


100%|██████████| 50/50 [00:51<00:00,  1.03s/it]
100%|██████████| 50/50 [00:51<00:00,  1.03s/it]


Batch 11 completed


100%|██████████| 50/50 [00:41<00:00,  1.19it/s]
100%|██████████| 50/50 [00:41<00:00,  1.19it/s]


Batch 12 completed


100%|██████████| 50/50 [01:23<00:00,  1.66s/it]
100%|██████████| 50/50 [01:23<00:00,  1.66s/it]


Batch 13 completed


100%|██████████| 50/50 [00:41<00:00,  1.19it/s]
100%|██████████| 50/50 [00:41<00:00,  1.19it/s]


Batch 14 completed


100%|██████████| 50/50 [00:38<00:00,  1.29it/s]
100%|██████████| 50/50 [00:38<00:00,  1.29it/s]


Batch 15 completed


100%|██████████| 50/50 [00:45<00:00,  1.09it/s]
100%|██████████| 50/50 [00:45<00:00,  1.09it/s]


Batch 16 completed


100%|██████████| 50/50 [00:27<00:00,  1.80it/s]
100%|██████████| 50/50 [00:27<00:00,  1.80it/s]


Batch 17 completed


100%|██████████| 50/50 [00:18<00:00,  2.67it/s]
100%|██████████| 50/50 [00:18<00:00,  2.67it/s]


Batch 18 completed


100%|██████████| 50/50 [00:17<00:00,  2.91it/s]
100%|██████████| 50/50 [00:17<00:00,  2.91it/s]


Batch 19 completed


100%|██████████| 50/50 [00:22<00:00,  2.24it/s]
100%|██████████| 50/50 [00:22<00:00,  2.24it/s]


Batch 20 completed


100%|██████████| 50/50 [00:20<00:00,  2.41it/s]
100%|██████████| 50/50 [00:20<00:00,  2.41it/s]


Batch 21 completed


100%|██████████| 50/50 [00:23<00:00,  2.11it/s]
100%|██████████| 50/50 [00:23<00:00,  2.11it/s]


Batch 22 completed


100%|██████████| 50/50 [01:18<00:00,  1.57s/it]
100%|██████████| 50/50 [01:18<00:00,  1.57s/it]


Batch 23 completed


100%|██████████| 50/50 [00:26<00:00,  1.88it/s]
100%|██████████| 50/50 [00:26<00:00,  1.88it/s]


Batch 24 completed


100%|██████████| 50/50 [00:21<00:00,  2.30it/s]
100%|██████████| 50/50 [00:21<00:00,  2.30it/s]


Batch 25 completed


100%|██████████| 50/50 [00:22<00:00,  2.19it/s]
100%|██████████| 50/50 [00:22<00:00,  2.19it/s]


Batch 26 completed


100%|██████████| 50/50 [00:25<00:00,  1.97it/s]
100%|██████████| 50/50 [00:25<00:00,  1.97it/s]


Batch 27 completed


100%|██████████| 50/50 [00:25<00:00,  1.98it/s]
100%|██████████| 50/50 [00:25<00:00,  1.98it/s]


Batch 28 completed


100%|██████████| 50/50 [00:21<00:00,  2.30it/s]
100%|██████████| 50/50 [00:21<00:00,  2.30it/s]


Batch 29 completed


100%|██████████| 50/50 [00:32<00:00,  1.52it/s]
100%|██████████| 50/50 [00:32<00:00,  1.52it/s]


Batch 30 completed


100%|██████████| 50/50 [01:16<00:00,  1.53s/it]
100%|██████████| 50/50 [01:16<00:00,  1.53s/it]


Batch 31 completed


100%|██████████| 50/50 [00:25<00:00,  1.99it/s]
100%|██████████| 50/50 [00:25<00:00,  1.99it/s]


Batch 32 completed


100%|██████████| 50/50 [1:30:45<00:00, 108.92s/it]   
100%|██████████| 50/50 [1:30:45<00:00, 108.92s/it]


Batch 33 completed


100%|██████████| 50/50 [00:30<00:00,  1.62it/s]
100%|██████████| 50/50 [00:30<00:00,  1.62it/s]


Batch 34 completed


100%|██████████| 50/50 [00:23<00:00,  2.15it/s]
100%|██████████| 50/50 [00:23<00:00,  2.15it/s]


Batch 35 completed


100%|██████████| 50/50 [00:28<00:00,  1.74it/s]
100%|██████████| 50/50 [00:28<00:00,  1.74it/s]


Batch 36 completed


100%|██████████| 50/50 [00:21<00:00,  2.30it/s]
100%|██████████| 50/50 [00:21<00:00,  2.30it/s]


Batch 37 completed


100%|██████████| 50/50 [2:01:43<00:00, 146.08s/it]    
100%|██████████| 50/50 [2:01:43<00:00, 146.07s/it]


Batch 38 completed


100%|██████████| 50/50 [01:08<00:00,  1.38s/it]
100%|██████████| 50/50 [01:08<00:00,  1.38s/it]


Batch 39 completed


100%|██████████| 50/50 [00:27<00:00,  1.80it/s]
100%|██████████| 50/50 [00:27<00:00,  1.80it/s]


Batch 40 completed


100%|██████████| 50/50 [00:26<00:00,  1.91it/s]
100%|██████████| 50/50 [00:26<00:00,  1.91it/s]


Batch 41 completed


100%|██████████| 50/50 [00:25<00:00,  1.99it/s]
100%|██████████| 50/50 [00:25<00:00,  2.00it/s]


Batch 42 completed


100%|██████████| 50/50 [00:22<00:00,  2.18it/s]
100%|██████████| 50/50 [00:22<00:00,  2.18it/s]


Batch 43 completed


100%|██████████| 50/50 [00:30<00:00,  1.63it/s]
100%|██████████| 50/50 [00:30<00:00,  1.64it/s]


Batch 44 completed


100%|██████████| 50/50 [00:19<00:00,  2.52it/s]
100%|██████████| 50/50 [00:19<00:00,  2.52it/s]


Batch 45 completed


100%|██████████| 50/50 [00:41<00:00,  1.19it/s]
100%|██████████| 50/50 [00:41<00:00,  1.19it/s]


Batch 46 completed


100%|██████████| 50/50 [00:49<00:00,  1.02it/s]
100%|██████████| 50/50 [00:49<00:00,  1.02it/s]


Batch 47 completed


100%|██████████| 50/50 [00:46<00:00,  1.08it/s]
100%|██████████| 50/50 [00:46<00:00,  1.08it/s]


Batch 48 completed


100%|██████████| 50/50 [00:31<00:00,  1.60it/s]
100%|██████████| 50/50 [00:31<00:00,  1.60it/s]


Batch 49 completed


100%|██████████| 50/50 [00:28<00:00,  1.74it/s]
100%|██████████| 50/50 [00:28<00:00,  1.74it/s]


Batch 50 completed


100%|██████████| 50/50 [00:33<00:00,  1.50it/s]
100%|██████████| 50/50 [00:33<00:00,  1.51it/s]


Batch 51 completed


100%|██████████| 50/50 [00:27<00:00,  1.83it/s]
100%|██████████| 50/50 [00:27<00:00,  1.83it/s]


Batch 52 completed


100%|██████████| 50/50 [00:54<00:00,  1.09s/it]
100%|██████████| 50/50 [00:54<00:00,  1.09s/it]


Batch 53 completed


100%|██████████| 50/50 [00:54<00:00,  1.08s/it]
100%|██████████| 50/50 [00:54<00:00,  1.08s/it]


Batch 54 completed


100%|██████████| 50/50 [09:05<00:00, 10.90s/it]   
100%|██████████| 50/50 [09:05<00:00, 10.90s/it]


Batch 55 completed


100%|██████████| 50/50 [00:17<00:00,  2.79it/s]
100%|██████████| 50/50 [00:17<00:00,  2.79it/s]


Batch 56 completed


100%|██████████| 50/50 [00:33<00:00,  1.51it/s]
100%|██████████| 50/50 [00:33<00:00,  1.51it/s]


Batch 57 completed


100%|██████████| 50/50 [00:34<00:00,  1.44it/s]
100%|██████████| 50/50 [00:34<00:00,  1.44it/s]


Batch 58 completed


100%|██████████| 50/50 [00:44<00:00,  1.12it/s]
100%|██████████| 50/50 [00:44<00:00,  1.12it/s]


Batch 59 completed


100%|██████████| 50/50 [00:36<00:00,  1.36it/s]
100%|██████████| 50/50 [00:36<00:00,  1.36it/s]


Batch 60 completed


100%|██████████| 50/50 [00:29<00:00,  1.71it/s]
100%|██████████| 50/50 [00:29<00:00,  1.71it/s]


Batch 61 completed


100%|██████████| 50/50 [00:42<00:00,  1.19it/s]
100%|██████████| 50/50 [00:42<00:00,  1.19it/s]


Batch 62 completed


100%|██████████| 50/50 [00:37<00:00,  1.34it/s]
100%|██████████| 50/50 [00:37<00:00,  1.34it/s]


Batch 63 completed


100%|██████████| 50/50 [00:39<00:00,  1.26it/s]
100%|██████████| 50/50 [00:39<00:00,  1.26it/s]


Batch 64 completed


100%|██████████| 50/50 [00:45<00:00,  1.11it/s]
100%|██████████| 50/50 [00:45<00:00,  1.11it/s]


Batch 65 completed


100%|██████████| 50/50 [01:02<00:00,  1.24s/it]
100%|██████████| 50/50 [01:02<00:00,  1.24s/it]


Batch 66 completed


100%|██████████| 50/50 [00:32<00:00,  1.53it/s]
100%|██████████| 50/50 [00:32<00:00,  1.53it/s]


Batch 67 completed


100%|██████████| 50/50 [00:41<00:00,  1.21it/s]
100%|██████████| 50/50 [00:41<00:00,  1.21it/s]


Batch 68 completed


100%|██████████| 50/50 [00:44<00:00,  1.14it/s]
100%|██████████| 50/50 [00:43<00:00,  1.14it/s]


Batch 69 completed


100%|██████████| 50/50 [00:29<00:00,  1.69it/s]
100%|██████████| 50/50 [00:29<00:00,  1.69it/s]


Batch 70 completed


100%|██████████| 50/50 [01:04<00:00,  1.29s/it]
100%|██████████| 50/50 [01:04<00:00,  1.29s/it]


Batch 71 completed


100%|██████████| 50/50 [01:04<00:00,  1.30s/it]
100%|██████████| 50/50 [01:04<00:00,  1.30s/it]


Batch 72 completed


100%|██████████| 50/50 [00:35<00:00,  1.43it/s]
100%|██████████| 50/50 [00:35<00:00,  1.43it/s]


Batch 73 completed


100%|██████████| 50/50 [00:52<00:00,  1.06s/it]
100%|██████████| 50/50 [00:52<00:00,  1.05s/it]


Batch 74 completed


100%|██████████| 50/50 [00:36<00:00,  1.36it/s]
100%|██████████| 50/50 [00:36<00:00,  1.36it/s]


Batch 75 completed


100%|██████████| 50/50 [00:51<00:00,  1.03s/it]
100%|██████████| 50/50 [00:51<00:00,  1.03s/it]


Batch 76 completed


100%|██████████| 50/50 [00:18<00:00,  2.65it/s]
100%|██████████| 50/50 [00:18<00:00,  2.65it/s]


Batch 77 completed


100%|██████████| 50/50 [00:25<00:00,  1.97it/s]
100%|██████████| 50/50 [00:25<00:00,  1.97it/s]


Batch 78 completed


100%|██████████| 50/50 [00:25<00:00,  1.95it/s]
100%|██████████| 50/50 [00:25<00:00,  1.95it/s]


Batch 79 completed


100%|██████████| 50/50 [00:36<00:00,  1.38it/s]
100%|██████████| 50/50 [00:36<00:00,  1.38it/s]


Batch 80 completed


100%|██████████| 50/50 [00:48<00:00,  1.04it/s]
100%|██████████| 50/50 [00:48<00:00,  1.04it/s]


Batch 81 completed


100%|██████████| 50/50 [00:26<00:00,  1.92it/s]
100%|██████████| 50/50 [00:26<00:00,  1.92it/s]


Batch 82 completed


100%|██████████| 50/50 [00:30<00:00,  1.64it/s]
100%|██████████| 50/50 [00:30<00:00,  1.64it/s]


Batch 83 completed


100%|██████████| 50/50 [00:42<00:00,  1.18it/s]
100%|██████████| 50/50 [00:42<00:00,  1.18it/s]


Batch 84 completed


100%|██████████| 50/50 [00:48<00:00,  1.02it/s]
100%|██████████| 50/50 [00:48<00:00,  1.02it/s]


Batch 85 completed


100%|██████████| 50/50 [00:41<00:00,  1.21it/s]
100%|██████████| 50/50 [00:41<00:00,  1.21it/s]


Batch 86 completed


100%|██████████| 50/50 [00:48<00:00,  1.04it/s]
100%|██████████| 50/50 [00:48<00:00,  1.04it/s]


Batch 87 completed


100%|██████████| 50/50 [01:35<00:00,  1.92s/it]
100%|██████████| 50/50 [01:35<00:00,  1.92s/it]


Batch 88 completed


100%|██████████| 50/50 [00:46<00:00,  1.08it/s]
100%|██████████| 50/50 [00:46<00:00,  1.08it/s]


Batch 89 completed


100%|██████████| 50/50 [00:35<00:00,  1.40it/s]
100%|██████████| 50/50 [00:35<00:00,  1.40it/s]


Batch 90 completed


100%|██████████| 50/50 [00:30<00:00,  1.62it/s]
100%|██████████| 50/50 [00:30<00:00,  1.62it/s]


Batch 91 completed


100%|██████████| 50/50 [01:14<00:00,  1.49s/it]
100%|██████████| 50/50 [01:14<00:00,  1.49s/it]


Batch 92 completed


100%|██████████| 50/50 [00:46<00:00,  1.08it/s]
100%|██████████| 50/50 [00:46<00:00,  1.08it/s]


Batch 93 completed


100%|██████████| 50/50 [00:44<00:00,  1.12it/s]
100%|██████████| 50/50 [00:44<00:00,  1.12it/s]


Batch 94 completed


100%|██████████| 50/50 [00:41<00:00,  1.20it/s]
100%|██████████| 50/50 [00:41<00:00,  1.20it/s]


Batch 95 completed


100%|██████████| 50/50 [01:01<00:00,  1.23s/it]
100%|██████████| 50/50 [01:01<00:00,  1.23s/it]


Batch 96 completed


100%|██████████| 50/50 [00:38<00:00,  1.29it/s]
100%|██████████| 50/50 [00:38<00:00,  1.29it/s]


Batch 97 completed


100%|██████████| 8/8 [00:05<00:00,  1.51it/s]
100%|██████████| 8/8 [00:05<00:00,  1.51it/s]


Batch 98 completed


0it [00:00, ?it/s]
0it [00:00, ?it/s]


KeyError: 'simp_text'

#### Article 9.3 (129 cases)

In [11]:
# create dataframe
df_93 = pd.DataFrame(art9_3_pdf_list, columns = ['pdf_file'])

df_93['id'] = df_93.index
df_93['batch'] = ((df_93['id'])/100).astype(int)

In [12]:
df_93['batch'].value_counts()

0    100
1     92
Name: batch, dtype: int64

In [15]:
# parse pdf
for i in range(0,1): #0-1
    globals()[f'df_9.3_{i}'] = clean_parsed_pdf_section(parse_pdf_section(tqdm(df_93[df_93['batch']==i]['pdf_file'])))
    print("Batch {} completed".format(i))

    # save json file name
    date = datetime.date.today().strftime('%Y_%m_%d')

    file_name = f"../../../data/parsed/parsed_art9.3_batch{i}_{date}.json"
    if os.path.exists(file_name):
        os.remove(file_name)

    # save file as json
    globals()[f'df_9.3_{i}'].to_json(file_name)

100%|██████████| 100/100 [04:53<00:00,  2.93s/it]
100%|██████████| 100/100 [04:53<00:00,  2.93s/it]


Batch 0 completed


In [13]:
# parse pdf
for i in range(1,2): #0-1
    globals()[f'df_9.3_{i}'] = clean_parsed_pdf_section(parse_pdf_section(tqdm(df_93[df_93['batch']==i]['pdf_file'])))
    print("Batch {} completed".format(i))

    # save json file name
    date = datetime.date.today().strftime('%Y_%m_%d')

    file_name = f"../../../data/parsed/parsed_art9.3_batch{i}_{date}.json"
    if os.path.exists(file_name):
        os.remove(file_name)

    # save file as json
    globals()[f'df_9.3_{i}'].to_json(file_name)

100%|██████████| 92/92 [09:26<00:00,  6.15s/it]
100%|██████████| 92/92 [09:26<00:00,  6.15s/it]


Batch 1 completed


# Append all dataframes

In [42]:
import glob
import os 

path = f"../../../data/parsed/*.json"
files = glob.glob(path)

In [190]:
data_merged = pd.concat([pd.read_json(f) for f in files])

In [191]:
data_merged=data_merged.reset_index(drop=True)

In [192]:
# change section_text to simp_text where len_pdf <= 5 or section == "None"
data_merged.loc[data_merged["len_pdf"] <= 5, "section_text"] = data_merged.loc[data_merged["len_pdf"] <= 5, "simp_text"]

In [193]:
import re

def _parse_simplified_text2(text):
    # Define the start and end patterns
    start_pattern = re.compile(r'(?i)business activities of the undertakings concerned are|business activities')
    end_pattern = re.compile(r'(?i)after examination of the notification|\n \n|\n\n')

    result = "None"
    try:
        # Find the start and end positions of the patterns
        start_pos = start_pattern.search(text).end()
        end_match = end_pattern.search(text[start_pos:])
        if end_match is None:
            end_pos = len(text)
        else:
            end_pos = start_pos + end_match.start()

        # Extract the text between the patterns
        result = text[start_pos:end_pos].strip()
    except AttributeError:
        pass
    except UnboundLocalError:
        pass

    return result


In [194]:
data_merged.loc[data_merged['section_text'] == "None", "section_text"] = data_merged.loc[data_merged['section_text'] == "None", "bsn_act"].apply(_parse_simplified_text2)

In [195]:
len(data_merged[data_merged['section_text'] == "None"])

47

In [196]:
# # Join the texts in 'section_text' for each row using '\n' as the separator
data_merged['sec_text'] = data_merged['section_text'].apply(lambda lst: '\n'.join(lst))

# Reset the index of the DataFrame
data_merged = data_merged.reset_index(drop=True)

In [197]:
len(data_merged[data_merged['sec_text'] == ""])

1

In [198]:
# Drop if section_text is "None"
data_merged = data_merged[data_merged['section_text'] != "None"].reset_index(drop=True)

In [199]:
# Drop if sec_text is ""
data_merged = data_merged[data_merged['sec_text'] != ""].reset_index(drop=True)

In [201]:
data_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28997 entries, 0 to 28996
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   date          28997 non-null  object 
 1   year          26085 non-null  float64
 2   len_pdf       28997 non-null  int64  
 3   article       28997 non-null  object 
 4   article_txt   28997 non-null  object 
 5   article_62    28997 non-null  object 
 6   case_num      28997 non-null  object 
 7   filename      28997 non-null  object 
 8   section_text  28997 non-null  object 
 9   bsn_act       28997 non-null  object 
 10  simp_text     28997 non-null  object 
 11  section       25587 non-null  object 
 12  sec_text      28997 non-null  object 
dtypes: float64(1), int64(1), object(11)
memory usage: 2.9+ MB


In [202]:
data_merged['sec_text']

0        EUROPEAN COMMISSION\nBrussels, 05.03.2021\nC(2...
1        In the published version of this decision,\nso...
2        Dear Sir or Madam,\n(1)  Following a referral ...
3        (2)  Microsoft is a global technology company,...
4        (4)  The Transaction will be implemented by me...
                               ...                        
28992    :\n \n\n\n\n \n \nf\no\nr\n \n \nC\no\nr\nd\n...
28993    :\n \n\n\n\n \n \nf\no\nr\n \nO\nn\ne\nx\n:\n...
28994    :\n \n\n\n\n \n \nR\nE\nW\nE\n \ni\ns\n \nm\n...
28995    :\n \n\n\n–\n \nf\no\nr\n \nC\na\nr\ng\ni\nl\n...
28996    :\n \n\n\n\n \n \nf\no\nr\n \nA\nM\nC\n:\n \n...
Name: sec_text, Length: 28997, dtype: object

In [203]:
# save json file name
date = datetime.date.today().strftime('%Y_%m_%d')

file_name = f"../../../data/processed/data_merged_{date}.json"
if os.path.exists(file_name):
    os.remove(file_name)

# save file as json
data_merged.to_json(file_name)