In [1]:
# Data visualization
import matplotlib.pyplot as plt 

# Data manipulation
import pandas as pd
import numpy as np
import csv
from zipfile import ZipFile

In [2]:
# Webscraping
import glob
import requests
from bs4 import BeautifulSoup
import time
import datetime
from pandas.core.common import flatten
import os
from itertools import chain
from tqdm import tqdm
import json
import urllib.request

In [3]:
# Parsing and pre-processing
from glob import glob
import os 
import re

from pdfminer.high_level import extract_text
import pdfplumber
from langdetect import detect, DetectorFactory

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from string import punctuation
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [4]:
import sys
sys.path.append(f'../../python')
from scraper import createFolder, get_merger_links, download_pdf, get_merger_info, parse_pdf

In [5]:
pdf_dir = f"../../../data/pdfs/"
art8_1_pdf_list = glob(os.path.join(pdf_dir, "art8.1/*/*.pdf")) #36
art8_1_pdf_list.sort(key=lambda x: os.path.getctime(x))

art8_2_pdf_list = glob(os.path.join(pdf_dir, "art8.2/*/*.pdf")) #72
art8_2_pdf_list.sort(key=lambda x: os.path.getctime(x))

art8_3_pdf_list = glob(os.path.join(pdf_dir, "art8.3/*/*.pdf")) #14
art8_3_pdf_list.sort(key=lambda x: os.path.getctime(x))

art6_1_pdf_list = glob(os.path.join(pdf_dir, "art6.1/*/*.pdf"))
art6_1_pdf_list.sort(key=lambda x: os.path.getctime(x))

art6_0_pdf_list = glob(os.path.join(pdf_dir, "art6.0/*/*.pdf"))
art6_0_pdf_list.sort(key=lambda x: os.path.getctime(x))

art9_3_pdf_list = glob(os.path.join(pdf_dir, "art9.3/*/*.pdf"))
art9_3_pdf_list.sort(key=lambda x: os.path.getctime(x))


In [6]:
art6_0_pdf_list

['../../../data/pdfs/art6.0\\M.10000\\m10000_41_3.pdf',
 '../../../data/pdfs/art6.0\\M.10011\\m10011_48_3.pdf',
 '../../../data/pdfs/art6.0\\M.10012\\m10012_75_3.pdf',
 '../../../data/pdfs/art6.0\\M.10001\\m10001_438_3.pdf',
 '../../../data/pdfs/art6.0\\M.10003\\m10003_69_3.pdf',
 '../../../data/pdfs/art6.0\\M.10004\\m10004_67_3.pdf',
 '../../../data/pdfs/art6.0\\M.10005\\m10005_109_3.pdf',
 '../../../data/pdfs/art6.0\\M.10010\\m10010_118_3.pdf',
 '../../../data/pdfs/art6.0\\M.10013\\m10013_79_3.pdf',
 '../../../data/pdfs/art6.0\\M.10014\\m10014_88_4.pdf',
 '../../../data/pdfs/art6.0\\M.10019\\m10019_113_3.pdf',
 '../../../data/pdfs/art6.0\\M.10021\\m10021_69_3.pdf',
 '../../../data/pdfs/art6.0\\M.10022\\m10022_108_3.pdf',
 '../../../data/pdfs/art6.0\\M.10023\\m10023_85_3.pdf',
 '../../../data/pdfs/art6.0\\M.10024\\m10024_136_3.pdf',
 '../../../data/pdfs/art6.0\\M.10027\\m10027_87_3.pdf',
 '../../../data/pdfs/art6.0\\M.10028\\m10028_137_3.pdf',
 '../../../data/pdfs/art6.0\\M.10029\\m10

#### Article 8.1 (36 cases)

In [14]:
df1=parse_pdf(art8_1_pdf_list)

100%|██████████| 47/47 [08:59<00:00, 11.48s/it]


In [15]:
# save json file name
date = datetime.date.today().strftime('%Y_%m_%d')

file_name = f"../../../data/parsed/parsed_art8.1_{date}.json"
if os.path.exists(file_name):
    os.remove(file_name)

# save file as json
df1.to_json(file_name)

#### Article 8.2 (73 cases)

In [14]:
# create dataframe
df_82 = pd.DataFrame(art8_2_pdf_list, columns = ['pdf_file'])

df_82['id'] = df_82.index
df_82['batch'] = ((df_82['id'])/100).astype(int)

In [15]:
df_82['batch'].value_counts()

0    100
1     14
Name: batch, dtype: int64

In [16]:
# parse pdf
for i in range(0,2): # 0-1
    globals()[f'df_8.2_{i}'] = parse_pdf(tqdm(df_82[df_82['batch']==i]['pdf_file']))
    print("Batch {} completed".format(i))

    # save json file name
    date = datetime.date.today().strftime('%Y_%m_%d')

    file_name = f"../../../data/parsed/parsed_art8.2_batch{i}_{date}.json"
    if os.path.exists(file_name):
        os.remove(file_name)

    # save file as json
    globals()[f'df_8.2_{i}'].to_json(file_name)

  4%|▍         | 4/100 [01:24<33:54, 21.20s/it]
  4%|▍         | 4/100 [01:24<33:54, 21.19s/it]


KeyboardInterrupt: 

#### Article 8.3 (13 cases)

In [6]:
df3 = parse_pdf(art8_3_pdf_list)

100%|██████████| 14/14 [11:28<00:00, 49.21s/it]


In [13]:
# save json file name
date = datetime.date.today().strftime('%Y_%m_%d')

file_name = f"../../../data/parsed/parsed_art8.3_{date}.json"
if os.path.exists(file_name):
    os.remove(file_name)

# save file as json
df3.to_json(file_name)

#### Article 6.1(b)_cc (240 cases)

In [6]:
# create dataframe
df_61 = pd.DataFrame(art6_1_pdf_list, columns = ['pdf_file'])

df_61['id'] = df_61.index
df_61['batch'] = ((df_61['id'])/100).astype(int)

In [12]:
df_61['batch'].value_counts()

0    100
1    100
2    100
3     59
Name: batch, dtype: int64

In [7]:
# parse pdf
for i in range(0,4): # 0-3
    globals()[f'df_6.1_{i}'] = parse_pdf(tqdm(df_61[df_61['batch']==i]['pdf_file']))
    print("Batch {} completed".format(i))

    # save json file name
    date = datetime.date.today().strftime('%Y_%m_%d')

    file_name = f"../../../data/parsed/parsed_art6.1_batch{i}_{date}.json"
    if os.path.exists(file_name):
        os.remove(file_name)

    # save file as json
    globals()[f'df_6.1_{i}'].to_json(file_name)

100%|██████████| 100/100 [10:03<00:00,  6.03s/it]
100%|██████████| 100/100 [10:03<00:00,  6.03s/it]


Batch 0 completed


100%|██████████| 100/100 [13:46<00:00,  8.27s/it]
100%|██████████| 100/100 [13:46<00:00,  8.27s/it]


Batch 1 completed


100%|██████████| 100/100 [12:28<00:00,  7.48s/it]
100%|██████████| 100/100 [12:28<00:00,  7.48s/it]


Batch 2 completed


100%|██████████| 59/59 [06:51<00:00,  6.97s/it]
100%|██████████| 59/59 [06:51<00:00,  6.97s/it]

Batch 3 completed





#### Article 6(b)_uc (5734 cases)

In [11]:
# create dataframe
df_60 = pd.DataFrame(art6_0_pdf_list, columns = ['pdf_file'])

df_60['id'] = df_60.index
df_60['batch'] = ((df_60['id'])/100).astype(int)

In [12]:
df_60.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4908 entries, 0 to 4907
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   pdf_file  4908 non-null   object
 1   id        4908 non-null   int64 
 2   batch     4908 non-null   int32 
dtypes: int32(1), int64(1), object(1)
memory usage: 96.0+ KB


In [13]:
df_60['batch'].value_counts()

0     100
37    100
27    100
28    100
29    100
30    100
31    100
32    100
33    100
34    100
35    100
36    100
38    100
1     100
39    100
40    100
41    100
42    100
43    100
44    100
45    100
46    100
47    100
48    100
26    100
25    100
24    100
23    100
2     100
3     100
4     100
5     100
6     100
7     100
8     100
9     100
10    100
11    100
12    100
13    100
14    100
15    100
16    100
17    100
18    100
19    100
20    100
21    100
22    100
49      8
Name: batch, dtype: int64

In [12]:
# parse pdf
for i in range(0,50): #0-49
    globals()[f'df_6.0_{i}'] = parse_pdf(tqdm(df_60[df_60['batch']==i]['pdf_file']))
    print("Batch {} completed".format(i))

    # save json file name
    date = datetime.date.today().strftime('%Y_%m_%d')

    file_name = f"../../../data/parsed/parsed_art6.0_batch{i}_{date}.json"
    if os.path.exists(file_name):
        os.remove(file_name)

    # save file as json
    globals()[f'df_6.0_{i}'].to_json(file_name)

100%|██████████| 100/100 [01:25<00:00,  1.18it/s]
100%|██████████| 100/100 [01:25<00:00,  1.18it/s]


Batch 0 completed


100%|██████████| 100/100 [01:38<00:00,  1.02it/s]
100%|██████████| 100/100 [01:38<00:00,  1.02it/s]


Batch 1 completed


100%|██████████| 100/100 [01:04<00:00,  1.55it/s]
100%|██████████| 100/100 [01:04<00:00,  1.55it/s]


Batch 2 completed


100%|██████████| 100/100 [03:37<00:00,  2.17s/it]
100%|██████████| 100/100 [03:36<00:00,  2.17s/it]


Batch 3 completed


100%|██████████| 100/100 [02:26<00:00,  1.46s/it]
100%|██████████| 100/100 [02:26<00:00,  1.46s/it]


Batch 4 completed


100%|██████████| 100/100 [02:50<00:00,  1.70s/it]
100%|██████████| 100/100 [02:50<00:00,  1.70s/it]


Batch 5 completed


100%|██████████| 100/100 [02:21<00:00,  1.41s/it]
100%|██████████| 100/100 [02:21<00:00,  1.41s/it]


Batch 6 completed


100%|██████████| 100/100 [01:32<00:00,  1.08it/s]
100%|██████████| 100/100 [01:32<00:00,  1.08it/s]


Batch 7 completed


100%|██████████| 100/100 [01:28<00:00,  1.13it/s]
100%|██████████| 100/100 [01:28<00:00,  1.13it/s]


Batch 8 completed


100%|██████████| 100/100 [00:44<00:00,  2.25it/s]
100%|██████████| 100/100 [00:44<00:00,  2.25it/s]


Batch 9 completed


100%|██████████| 100/100 [00:52<00:00,  1.92it/s]
100%|██████████| 100/100 [00:52<00:00,  1.92it/s]


Batch 10 completed


100%|██████████| 100/100 [01:45<00:00,  1.06s/it]
100%|██████████| 100/100 [01:45<00:00,  1.06s/it]


Batch 11 completed


100%|██████████| 100/100 [00:58<00:00,  1.70it/s]
100%|██████████| 100/100 [00:58<00:00,  1.70it/s]


Batch 12 completed


100%|██████████| 100/100 [01:00<00:00,  1.65it/s]
100%|██████████| 100/100 [01:00<00:00,  1.65it/s]


Batch 13 completed


100%|██████████| 100/100 [00:55<00:00,  1.79it/s]
100%|██████████| 100/100 [00:55<00:00,  1.79it/s]


Batch 14 completed


100%|██████████| 100/100 [02:10<00:00,  1.30s/it]
100%|██████████| 100/100 [02:10<00:00,  1.30s/it]


Batch 15 completed


100%|██████████| 100/100 [00:56<00:00,  1.76it/s]
100%|██████████| 100/100 [00:56<00:00,  1.76it/s]


Batch 16 completed


100%|██████████| 100/100 [00:55<00:00,  1.81it/s]
100%|██████████| 100/100 [00:55<00:00,  1.81it/s]


Batch 17 completed


100%|██████████| 100/100 [00:59<00:00,  1.68it/s]
100%|██████████| 100/100 [00:59<00:00,  1.68it/s]


Batch 18 completed


100%|██████████| 100/100 [01:49<00:00,  1.09s/it]
100%|██████████| 100/100 [01:49<00:00,  1.09s/it]


Batch 19 completed


100%|██████████| 100/100 [01:04<00:00,  1.54it/s]
100%|██████████| 100/100 [01:04<00:00,  1.54it/s]


Batch 20 completed


100%|██████████| 100/100 [00:58<00:00,  1.71it/s]
100%|██████████| 100/100 [00:58<00:00,  1.71it/s]


Batch 21 completed


100%|██████████| 100/100 [01:00<00:00,  1.66it/s]
100%|██████████| 100/100 [01:00<00:00,  1.66it/s]


Batch 22 completed


 41%|████      | 41/100 [00:45<01:05,  1.11s/it]
 41%|████      | 41/100 [00:45<01:05,  1.11s/it]


PSEOF: Unexpected EOF

In [16]:
# parse pdf
for i in range(23,50): #0-49
    globals()[f'df_6.0_{i}'] = parse_pdf(tqdm(df_60[df_60['batch']==i]['pdf_file']))
    print("Batch {} completed".format(i))

    # save json file name
    date = datetime.date.today().strftime('%Y_%m_%d')

    file_name = f"../../../data/parsed/parsed_art6.0_batch{i}_{date}.json"
    if os.path.exists(file_name):
        os.remove(file_name)

    # save file as json
    globals()[f'df_6.0_{i}'].to_json(file_name)

 41%|████      | 41/100 [00:45<00:45,  1.30it/s]

Skipping corrupted PDF file:  [Errno 2] No such file or directory: '../../../data/pdfs/art6.0\\M.10261\\m10261_55_3.pdf'


100%|██████████| 100/100 [01:39<00:00,  1.00it/s]
100%|██████████| 100/100 [01:39<00:00,  1.00it/s]


Batch 23 completed


100%|██████████| 100/100 [01:25<00:00,  1.16it/s]
100%|██████████| 100/100 [01:25<00:00,  1.16it/s]


Batch 24 completed


100%|██████████| 100/100 [01:05<00:00,  1.52it/s]
100%|██████████| 100/100 [01:05<00:00,  1.52it/s]


Batch 25 completed


100%|██████████| 100/100 [01:27<00:00,  1.14it/s]
100%|██████████| 100/100 [01:27<00:00,  1.14it/s]


Batch 26 completed


100%|██████████| 100/100 [01:47<00:00,  1.07s/it]
100%|██████████| 100/100 [01:47<00:00,  1.07s/it]


Batch 27 completed


100%|██████████| 100/100 [00:55<00:00,  1.82it/s]
100%|██████████| 100/100 [00:55<00:00,  1.82it/s]


Batch 28 completed


100%|██████████| 100/100 [01:23<00:00,  1.20it/s]
100%|██████████| 100/100 [01:23<00:00,  1.20it/s]


Batch 29 completed


100%|██████████| 100/100 [01:10<00:00,  1.41it/s]
100%|██████████| 100/100 [01:10<00:00,  1.41it/s]


Batch 30 completed


100%|██████████| 100/100 [01:28<00:00,  1.13it/s]
100%|██████████| 100/100 [01:28<00:00,  1.13it/s]


Batch 31 completed


100%|██████████| 100/100 [01:29<00:00,  1.11it/s]
100%|██████████| 100/100 [01:29<00:00,  1.11it/s]


Batch 32 completed


100%|██████████| 100/100 [01:42<00:00,  1.03s/it]
100%|██████████| 100/100 [01:42<00:00,  1.03s/it]


Batch 33 completed


100%|██████████| 100/100 [01:27<00:00,  1.14it/s]
100%|██████████| 100/100 [01:27<00:00,  1.14it/s]


Batch 34 completed


100%|██████████| 100/100 [01:53<00:00,  1.13s/it]
100%|██████████| 100/100 [01:53<00:00,  1.13s/it]


Batch 35 completed


100%|██████████| 100/100 [01:53<00:00,  1.14s/it]
100%|██████████| 100/100 [01:53<00:00,  1.14s/it]


Batch 36 completed


100%|██████████| 100/100 [01:38<00:00,  1.02it/s]
100%|██████████| 100/100 [01:38<00:00,  1.02it/s]


Batch 37 completed


100%|██████████| 100/100 [01:22<00:00,  1.21it/s]
100%|██████████| 100/100 [01:22<00:00,  1.21it/s]


Batch 38 completed


100%|██████████| 100/100 [00:55<00:00,  1.81it/s]
100%|██████████| 100/100 [00:55<00:00,  1.81it/s]


Batch 39 completed


100%|██████████| 100/100 [01:34<00:00,  1.06it/s]
100%|██████████| 100/100 [01:34<00:00,  1.06it/s]


Batch 40 completed


100%|██████████| 100/100 [01:04<00:00,  1.56it/s]
100%|██████████| 100/100 [01:04<00:00,  1.56it/s]


Batch 41 completed


100%|██████████| 100/100 [01:30<00:00,  1.11it/s]
100%|██████████| 100/100 [01:30<00:00,  1.11it/s]


Batch 42 completed


100%|██████████| 100/100 [01:48<00:00,  1.08s/it]
100%|██████████| 100/100 [01:48<00:00,  1.08s/it]


Batch 43 completed


100%|██████████| 100/100 [02:31<00:00,  1.52s/it]
100%|██████████| 100/100 [02:31<00:00,  1.52s/it]


Batch 44 completed


100%|██████████| 100/100 [01:13<00:00,  1.37it/s]
100%|██████████| 100/100 [01:13<00:00,  1.37it/s]


Batch 45 completed


100%|██████████| 100/100 [02:12<00:00,  1.33s/it]
100%|██████████| 100/100 [02:12<00:00,  1.33s/it]


Batch 46 completed


100%|██████████| 100/100 [01:36<00:00,  1.04it/s]
100%|██████████| 100/100 [01:36<00:00,  1.04it/s]


Batch 47 completed


100%|██████████| 100/100 [01:50<00:00,  1.10s/it]
100%|██████████| 100/100 [01:50<00:00,  1.10s/it]


Batch 48 completed


100%|██████████| 9/9 [00:06<00:00,  1.49it/s]
100%|██████████| 9/9 [00:06<00:00,  1.49it/s]

Batch 49 completed





In [91]:
df_60.tail

<bound method NDFrame.tail of                                                pdf_file    id  batch
0     ../../../data/pdfs/art6.0\M.10000\m10000_41_3.pdf     0      0
1     ../../../data/pdfs/art6.0\M.10001\m10001_438_3...     1      0
2     ../../../data/pdfs/art6.0\M.10003\m10003_69_3.pdf     2      0
3     ../../../data/pdfs/art6.0\M.10004\m10004_67_3.pdf     3      0
4     ../../../data/pdfs/art6.0\M.10005\m10005_109_3...     4      0
...                                                 ...   ...    ...
1242    ../../../data/pdfs/art6.0\M.9994\m9994_81_3.pdf  1242     12
1243   ../../../data/pdfs/art6.0\M.9995\m9995_148_3.pdf  1243     12
1244   ../../../data/pdfs/art6.0\M.9996\m9996_114_3.pdf  1244     12
1245    ../../../data/pdfs/art6.0\M.9998\m9998_51_3.pdf  1245     12
1246   ../../../data/pdfs/art6.0\M.9999\m9999_110_3.pdf  1246     12

[1247 rows x 3 columns]>

In [15]:
df_60[df_60['id']==2341]

Unnamed: 0,pdf_file,id,batch
2341,../../../data/pdfs/art6.0\M.10261\m10261_55_3.pdf,2341,23


In [108]:
df_60[df_60['pdf_file']=="../../../data/pdfs/art6.0\M.9935\m9935_130_3.pdf"]

Unnamed: 0,pdf_file,id,batch
1199,../../../data/pdfs/art6.0\M.9935\m9935_130_3.pdf,1199,11


#### Article 9.3 (129 cases)

In [6]:
# create dataframe
df_93 = pd.DataFrame(art9_3_pdf_list, columns = ['pdf_file'])

df_93['id'] = df_93.index
df_93['batch'] = ((df_93['id'])/100).astype(int)

In [7]:
df_93['batch'].value_counts()

0    100
1     93
Name: batch, dtype: int64

In [9]:
# parse pdf
for i in range(1,2): #0-1
    globals()[f'df_9.3_{i}'] = parse_pdf(tqdm(df_93[df_93['batch']==i]['pdf_file']))
    print("Batch {} completed".format(i))

    # save json file name
    date = datetime.date.today().strftime('%Y_%m_%d')

    file_name = f"../../../data/parsed/parsed_art9.3_batch{i}_{date}.json"
    if os.path.exists(file_name):
        os.remove(file_name)

    # save file as json
    globals()[f'df_9.3_{i}'].to_json(file_name)

100%|██████████| 93/93 [09:54<00:00,  6.39s/it]
100%|██████████| 93/93 [09:54<00:00,  6.39s/it]

Batch 1 completed





# Append all dataframes

In [17]:
import glob
import os 

path = f"../../../data/parsed/*.json"
files = glob.glob(path)

In [18]:
data_merged = pd.concat([pd.read_json(f) for f in files])

In [19]:
data_merged=data_merged.reset_index(drop=True)

In [20]:
data_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5635 entries, 0 to 5634
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   article   5635 non-null   object
 1   case_num  5635 non-null   object
 2   filename  5635 non-null   object
 3   text      5635 non-null   object
 4   lang      5635 non-null   object
dtypes: object(5)
memory usage: 220.2+ KB


In [21]:
data_merged['article'].value_counts()

art6.0    4908
art6.1     359
art9.3     193
art8.2     114
art8.1      47
art8.3      14
Name: article, dtype: int64

In [22]:
# save json file name
date = datetime.date.today().strftime('%Y_%m_%d')

file_name = f"../../../data/processed/data_merged_{date}.json"
if os.path.exists(file_name):
    os.remove(file_name)

# save file as json
data_merged.to_json(file_name)

In [23]:
data_merged['lang'].value_counts()

en    5623
fr       5
de       4
sv       2
es       1
Name: lang, dtype: int64

In [24]:
data_merged.head()

Unnamed: 0,article,case_num,filename,text,lang
0,art6.0,M.9004,art6.0\M.9004\m9004_65_3,\nEUROPEAN COMMISSION \nDG Competition \n \n ...,en
1,art6.0,M.9001,art6.0\M.9001\m9001_54_3,\nEUROPEAN COMMISSION \nDG Competition \n \n ...,en
2,art6.0,M.9000,art6.0\M.9000\m9000_104_3,\n \nEUROPEAN COMMISSION \nDG Competition \n ...,en
3,art6.0,M.8999,art6.0\M.8999\m8999_63_8,\nEUROPEAN COMMISSION \nDG Competition \n \n ...,en
4,art6.0,M.8998,art6.0\M.8998\m8998_71_3,\nEUROPEAN COMMISSION \nDG Competition \n \n ...,en


In [25]:
data_merged[data_merged['case_num'] == "M.10261"]

Unnamed: 0,article,case_num,filename,text,lang


In [26]:
data_merged['case_num'][1244]

'M.5183'

In [27]:
print(data_merged['text'][1246])

EN
Case No COMP/M.5181 (cid:150)
Delta Air Lines/ Northwest
Airlines
Only the English text is available and authentic.
REGULATION (EC) No 139/2004
MERGER PROCEDURE
Article 6(1)(b) NON-OPPOSITION
Date: 06/08/2008
In electronic form on the EUR-Lex website under document
number 32008M5181
Office for Official Publications of the European Communities
L-2985 Luxembourg COMMISSION OF THE EUROPEAN COMMUNITIES
Brussels, 06-VIII-2008
SG-Greffe (2008) D/205034
C(2008) 4359
In the published version of this decision, some PUBLIC VERSION
information has been omitted pursuant to Article
17(2) of Council Regulation (EC) No 139/2004
concerning  non-disclosure  of  business  secrets
and  other  confidential  information.  The
MERGER PROCEDURE
omissions are shown thus [(cid:133)]. Where possible
the information omitted has been replaced by ARTICLE 6(1)(b) DECISION
ranges of figures or a general description.
To the notifying party:
Dear Sir/ Madam,
Subject: Case  No  COMP/M.5181  (cid:150)  Delta  Air  Li