In [53]:
# Data visualization
import matplotlib.pyplot as plt 

# Data manipulation
import pandas as pd
import numpy as np
import csv
from zipfile import ZipFile

In [54]:
# Webscraping
import glob
import requests
from bs4 import BeautifulSoup
import time
import datetime
from pandas.core.common import flatten
import os
from itertools import chain
from tqdm import tqdm
import json
import urllib.request

In [55]:
# Parsing and pre-processing
from glob import glob
import os 
import re

from pdfminer.high_level import extract_text
import pdfplumber
from langdetect import detect, DetectorFactory

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from string import punctuation
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [56]:
import sys
sys.path.append(f'../../python')
from scraper import createFolder, get_merger_links, download_pdf, get_merger_info, parse_pdf

#### Parsing merger decisions PDFs to text

In [80]:
pdf_dir = f"../../../data/pdfs/"
art8_1_pdf_list = glob(os.path.join(pdf_dir, "art8.1/*/*.pdf")) #36
art8_1_pdf_list.sort(key=lambda x: os.path.getctime(x))

art8_2_pdf_list = glob(os.path.join(pdf_dir, "art8.2/*/*.pdf")) #72
art8_2_pdf_list.sort(key=lambda x: os.path.getctime(x))

art8_3_pdf_list = glob(os.path.join(pdf_dir, "art8.3/*/*.pdf")) #14
art8_3_pdf_list.sort(key=lambda x: os.path.getctime(x))

art6_1_pdf_list = glob(os.path.join(pdf_dir, "art6.1/*/*.pdf"))
art6_1_pdf_list.sort(key=lambda x: os.path.getctime(x))

art6_0_pdf_list = glob(os.path.join(pdf_dir, "art6.0/*/*.pdf"))
art6_0_pdf_list.sort(key=lambda x: os.path.getctime(x))


#### Article 8.1 (36 cases)

In [14]:
df1=parse_pdf(art8_1_pdf_list)

100%|██████████| 47/47 [08:59<00:00, 11.48s/it]


In [15]:
# save json file name
date = datetime.date.today().strftime('%Y_%m_%d')

file_name = f"../../../data/parsed/parsed_art8.1_{date}.json"
if os.path.exists(file_name):
    os.remove(file_name)

# save file as json
df1.to_json(file_name)

#### Article 8.2 (73 cases)

In [33]:
# create dataframe
df_82 = pd.DataFrame(art8_2_pdf_list, columns = ['pdf_file'])

df_82['id'] = df_82.index
df_82['batch'] = ((df_82['id'])/100).astype(int)

In [34]:
df_82['batch'].value_counts()

0    100
1     14
Name: batch, dtype: int64

In [35]:
# parse pdf
for i in range(0,2): # 0-1
    globals()[f'df_8.2_{i}'] = parse_pdf(tqdm(df_82[df_82['batch']==i]['pdf_file']))
    print("Batch {} completed".format(i))

    # save json file name
    date = datetime.date.today().strftime('%Y_%m_%d')

    file_name = f"../../../data/parsed/parsed_art8.2_batch{i}_{date}.json"
    if os.path.exists(file_name):
        os.remove(file_name)

    # save file as json
    globals()[f'df_8.2_{i}'].to_json(file_name)

100%|██████████| 100/100 [42:58<00:00, 25.78s/it]
100%|██████████| 100/100 [42:58<00:00, 25.78s/it]


Batch 0 completed


100%|██████████| 14/14 [11:07<00:00, 47.66s/it]
100%|██████████| 14/14 [11:07<00:00, 47.66s/it]

Batch 1 completed





#### Article 8.3 (13 cases)

In [6]:
df3 = parse_pdf(art8_3_pdf_list)

100%|██████████| 14/14 [11:28<00:00, 49.21s/it]


In [13]:
# save json file name
date = datetime.date.today().strftime('%Y_%m_%d')

file_name = f"../../../data/parsed/parsed_art8.3_{date}.json"
if os.path.exists(file_name):
    os.remove(file_name)

# save file as json
df3.to_json(file_name)

#### Article 6.1(b)_cc (240 cases)

In [6]:
# create dataframe
df_61 = pd.DataFrame(art6_1_pdf_list, columns = ['pdf_file'])

df_61['id'] = df_61.index
df_61['batch'] = ((df_61['id'])/100).astype(int)

In [12]:
df_61['batch'].value_counts()

0    100
1    100
2    100
3     59
Name: batch, dtype: int64

In [7]:
# parse pdf
for i in range(0,4): # 0-3
    globals()[f'df_6.1_{i}'] = parse_pdf(tqdm(df_61[df_61['batch']==i]['pdf_file']))
    print("Batch {} completed".format(i))

    # save json file name
    date = datetime.date.today().strftime('%Y_%m_%d')

    file_name = f"../../../data/parsed/parsed_art6.1_batch{i}_{date}.json"
    if os.path.exists(file_name):
        os.remove(file_name)

    # save file as json
    globals()[f'df_6.1_{i}'].to_json(file_name)

100%|██████████| 100/100 [10:03<00:00,  6.03s/it]
100%|██████████| 100/100 [10:03<00:00,  6.03s/it]


Batch 0 completed


100%|██████████| 100/100 [13:46<00:00,  8.27s/it]
100%|██████████| 100/100 [13:46<00:00,  8.27s/it]


Batch 1 completed


100%|██████████| 100/100 [12:28<00:00,  7.48s/it]
100%|██████████| 100/100 [12:28<00:00,  7.48s/it]


Batch 2 completed


100%|██████████| 59/59 [06:51<00:00,  6.97s/it]
100%|██████████| 59/59 [06:51<00:00,  6.97s/it]

Batch 3 completed





#### Article 6(b)_uc (5734 cases)

In [83]:
# create dataframe
df_60 = pd.DataFrame(art6_0_pdf_list, columns = ['pdf_file'])

df_60['id'] = df_60.index
df_60['batch'] = ((df_60['id'])/100).astype(int)

In [84]:
df_60['batch'].value_counts()

0     100
1     100
20    100
19    100
18    100
17    100
16    100
15    100
14    100
13    100
12    100
11    100
10    100
9     100
8     100
7     100
6     100
5     100
4     100
3     100
2     100
21     72
Name: batch, dtype: int64

In [85]:
# parse pdf
for i in range(0,13): #0-12
    globals()[f'df_6.0_{i}'] = parse_pdf(tqdm(df_60[df_60['batch']==i]['pdf_file']))
    print("Batch {} completed".format(i))

    # save json file name
    date = datetime.date.today().strftime('%Y_%m_%d')

    file_name = f"../../../data/parsed/parsed_art6.0_batch{i}_{date}.json"
    if os.path.exists(file_name):
        os.remove(file_name)

    # save file as json
    globals()[f'df_6.0_{i}'].to_json(file_name)

100%|██████████| 100/100 [01:26<00:00,  1.15it/s]
100%|██████████| 100/100 [01:26<00:00,  1.15it/s]


Batch 7 completed


100%|██████████| 100/100 [04:03<00:00,  2.44s/it]
100%|██████████| 100/100 [04:03<00:00,  2.44s/it]


Batch 8 completed


100%|██████████| 100/100 [02:35<00:00,  1.55s/it]
100%|██████████| 100/100 [02:35<00:00,  1.55s/it]


Batch 9 completed


100%|██████████| 100/100 [02:04<00:00,  1.24s/it]
100%|██████████| 100/100 [02:04<00:00,  1.24s/it]

Batch 10 completed





In [91]:
df_60.tail

<bound method NDFrame.tail of                                                pdf_file    id  batch
0     ../../../data/pdfs/art6.0\M.10000\m10000_41_3.pdf     0      0
1     ../../../data/pdfs/art6.0\M.10001\m10001_438_3...     1      0
2     ../../../data/pdfs/art6.0\M.10003\m10003_69_3.pdf     2      0
3     ../../../data/pdfs/art6.0\M.10004\m10004_67_3.pdf     3      0
4     ../../../data/pdfs/art6.0\M.10005\m10005_109_3...     4      0
...                                                 ...   ...    ...
1242    ../../../data/pdfs/art6.0\M.9994\m9994_81_3.pdf  1242     12
1243   ../../../data/pdfs/art6.0\M.9995\m9995_148_3.pdf  1243     12
1244   ../../../data/pdfs/art6.0\M.9996\m9996_114_3.pdf  1244     12
1245    ../../../data/pdfs/art6.0\M.9998\m9998_51_3.pdf  1245     12
1246   ../../../data/pdfs/art6.0\M.9999\m9999_110_3.pdf  1246     12

[1247 rows x 3 columns]>

In [102]:
df_60[df_60['id']==1200]

Unnamed: 0,pdf_file,id,batch
1200,../../../data/pdfs/art6.0\M.9936\m9936_441_3.pdf,1200,12


In [108]:
df_60[df_60['pdf_file']=="../../../data/pdfs/art6.0\M.9935\m9935_130_3.pdf"]

Unnamed: 0,pdf_file,id,batch
1199,../../../data/pdfs/art6.0\M.9935\m9935_130_3.pdf,1199,11


# Append all dataframes

In [39]:
import glob
import os 

path = f"../../../data/parsed/*.json"
files = glob.glob(path)

In [40]:
data_merged = pd.concat([pd.read_json(f) for f in files])

In [41]:
data_merged=data_merged.reset_index(drop=True)

In [42]:
data_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1778 entries, 0 to 1777
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   index     1778 non-null   int64 
 1   article   1778 non-null   object
 2   case_num  1778 non-null   object
 3   filename  1778 non-null   object
 4   text      1778 non-null   object
 5   lang      1778 non-null   object
dtypes: int64(1), object(5)
memory usage: 83.5+ KB


In [43]:
data_merged['article'].value_counts()

art6.0    1244
art6.1     359
art8.2     114
art8.1      47
art8.3      14
Name: article, dtype: int64

In [44]:
# save json file name
date = datetime.date.today().strftime('%Y_%m_%d')

file_name = f"../../../data/processed/data_merged_{date}.json"
if os.path.exists(file_name):
    os.remove(file_name)

# save file as json
data_merged.to_json(file_name)

In [46]:
data_merged['lang'].value_counts()

en    1773
fr       3
de       1
sv       1
Name: lang, dtype: int64

In [73]:
data_merged.head()

Unnamed: 0,index,article,case_num,filename,text,lang
0,0,art6.0,M.10568,art6.0\M.10568\M_10568_8208896_60_3,\nEUROPEAN COMMISSION \nDG Competition \n \n ...,en
1,1,art6.0,M.10566,art6.0\M.10566\M_10566_8213788_78_3,\nEUROPEAN COMMISSION \nDG Competition \n \n ...,en
2,2,art6.0,M.10565,art6.0\M.10565\M_10565_8405226_1032_3,EUROPEAN COMMISSION \nDG Competition \n \n \n ...,en
3,3,art6.0,M.10564,art6.0\M.10564\M_10564_8703987_272_6,"\nEUROPEAN COMMISSION \n \nBrussels, 17.12....",en
4,4,art6.0,M.10564,art6.0\M.10564\M_10564_8314902_264_3,\n \nEUROPEAN COMMISSION \nDG Competition \n ...,en


In [66]:
data_merged[data_merged['case_num'] == "M.9999"]

Unnamed: 0,index,article,case_num,filename,text,lang
300,0,art6.0,M.9999,art6.0\M.9999\m9999_110_3,\n \nEUROPEAN COMMISSION \nDG Competition \n ...,en


In [71]:
data_merged['case_num'][1244]

'M.5476'

In [51]:
print(data_merged['text'][1246])

EN 
 
 
  Case No COMP/M.5421 - 
PANASONIC/ SANYO 
 
 
 
 
Only the English text is available and authentic. 
 
 
 
REGULATION (EC) No 139/2004 
MERGER PROCEDURE 
 
 
 
Article 6(2) NON-OPPOSITION 
Date: 29/09/2009 
 
 
 
 
 
 
 
 
In electronic form on the EUR-Lex website under document 
number 32009M5421 
 
 
 
Office for Publications of the European Union 
L-2985 Luxembourg  EUROPEAN COMMISSION 
 
 
 
   
Brussels, 29/09/2009 
SG-Greffe(2009) D/5723 
C (209) 7572 
  In  the  published  version  of  this  decision,  some 
PUBLIC VERSION 
information has been omitted pursuant to Article 
  17(2)  of  Council  Regulation  (EC)  No  139/2004 
concerning non-disclosure of business secrets and 
MERGER PROCEDURE 
other confidential information. The omissions are 
  ARTICLE 6(1)(b) DECISION IN 
shown thus […]. Where possible the information 
  omitted has been replaced by ranges of figures or a  CONJUNCTION WITH 
general description.  ARTICLE 6(2) 
 
To the notifying party   
      
 
Dear 