## Software Requirements

In [None]:
!conda env create -f environment.yml

In [None]:
!conda activate thesis

In [1]:
# Data visualization
import matplotlib.pyplot as plt 

# Data manipulation
import pandas as pd
import numpy as np
import csv
from zipfile import ZipFile

In [2]:
# Webscraping
import glob
import requests
from bs4 import BeautifulSoup
import time
import datetime
from pandas.core.common import flatten
import os
from itertools import chain
from tqdm import tqdm
import json
import urllib.request

In [3]:
# Parsing and pre-processing
from glob import glob
import os 
import re

from pdfminer.high_level import extract_text
import pdfplumber
from langdetect import detect, DetectorFactory

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from string import punctuation
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [5]:
# Vector representations and embeddings
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim

In [6]:
# Logistic and XGboost
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, precision_score, recall_score
from xgboost import XGBClassifier
import pickle

In [7]:
# LSTM 
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader

import torch
import torch.nn as nn
from torch.nn import functional as F
import torch.optim as optim
import gc

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# BERT models
from torch.utils.data import TensorDataset, RandomSampler, SequentialSampler
import transformers
from transformers import AutoModel, BertTokenizerFast

In [9]:
# specify GPU
device = "cuda" if torch.cuda.is_available() else "cpu"

## Data Pre-processing

In [635]:
df = pd.read_json(r"../../../data/processed/data_merged_2023_02_01.json")

In [636]:
df['lang'].value_counts()

en    5623
fr       5
de       4
sv       2
es       1
Name: lang, dtype: int64

### Subset English merger decisions

In [637]:
df = df[df['lang']=="en"]
df = df.reset_index(drop=True)

In [638]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5623 entries, 0 to 5622
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   article   5623 non-null   object
 1   case_num  5623 non-null   object
 2   filename  5623 non-null   object
 3   text      5623 non-null   object
 4   lang      5623 non-null   object
dtypes: object(5)
memory usage: 219.8+ KB


In [639]:
# define regular expression pattern
pattern = r"\\([^\\]*)$"

# apply regular expression to each row in "filename" column
df['file'] = df['filename'].apply(lambda x: re.findall(pattern, x)[0])

In [640]:
df.drop_duplicates(subset='file', keep='first', inplace=True)
df=df.reset_index(drop=True)

In [641]:
df['id'] = df.index

In [642]:
df['article'] = df['article'].replace('art6.1', 'article6(2)')
df['article'] = df['article'].replace('art6.0', 'article6(1)(b)')
df['article'] = df['article'].replace('art8.1', 'article8(1)')
df['article'] = df['article'].replace('art8.2', 'article8(2)')
df['article'] = df['article'].replace('art8.3', 'article8(3)')
df['article'] = df['article'].replace('art9.3', 'referral')

In [643]:
# no of cases by article
df.groupby('article')['case_num'].nunique()

article
article6(1)(b)    4831
article6(2)        224
article8(1)         36
article8(2)         70
article8(3)         11
referral            24
Name: case_num, dtype: int64

In [644]:
# no of documents by article
df['article'].value_counts().sort_index()

article6(1)(b)    4899
article6(2)        358
article8(1)         47
article8(2)        111
article8(3)         14
referral            29
Name: article, dtype: int64

In [645]:
df[df['case_num']=="M.10431"][['article', 'filename', 'case_num']]

Unnamed: 0,article,filename,case_num
4986,article6(2),art6.1\M.10431\M_10431_8659694_2665_3,M.10431
4987,article6(2),art6.1\M.10431\M_10431_8414584_2641_3,M.10431


In [646]:
# no of documents by article
df['article'].value_counts().sort_index()

article6(1)(b)    4899
article6(2)        358
article8(1)         47
article8(2)        111
article8(3)         14
referral            29
Name: article, dtype: int64

In [647]:
df.text[0]

" \nEUROPEAN COMMISSION \nDG Competition \n \n \n \n \n  Case  M.9004  -  SL04  / \nAMBIENTA SGR / JV \n \n \n \n \nOnly the English text is available and authentic. \n \n \n \nREGULATION (EC) No 139/2004 \nMERGER PROCEDURE \n \n \n \nArticle 6(1)(b) NON-OPPOSITION \nDate: 09/08/2018 \n \n \n \n \n \n \n \n \nIn electronic form on the EUR-Lex website under document \nnumber 32018M9004 \n \n \n   \nEUROPEAN COMMISSION \n \nBrussels, 09.08.2018 \nC(2018) 5532 final \nPUBLIC VERSION \n \n \nTo the notifying parties \n \nSubject:  Case M.9004 – SL04/Ambienta Sgr/JV  \nCommission decision pursuant to Article 6(1)(b) of Council Regulation (EC) \n1 2\nNo 139/2004  and Article 57 of the Agreement on the European Economic Area   \nDear Sir or Madam, \n1.  On  18  July  2018,  the  European  Commission  received  notification  of  a  proposed \nconcentration pursuant to Article 4 of the Merger Regulation by which the undertakings \nS.L.04  S.à.r.l.  (‘SL04’,  Luxembourg),  ultimately  controlled

### Clean labels

In [648]:
#### TODO: regex sections depending on article

In [649]:
def article_match(txt):
    match = re.search(r"(?i)article\s*\d+\s*(\([^\)]+\))?\s*(\([^\)]+\))?", txt) 
    if match:
        first_match = match.group()
        return first_match.replace(" ", "").replace("\t", "").replace("\n", "").lower()
    else:
        return "None"

In [650]:
def article62(txt):
    match = re.search(r"(?i)IN\s+CONJUNCTION\s+WITH\s+ART(?:ICLE)?\s+\d+\(\d+\)", txt)  
    if match:
        first_match = match.group()
        return first_match.replace(" ", "").replace("\t", "").replace("\n", "").lower()
    else:
        return "None"


In [651]:
df['article_txt'] = df['text'].apply(article_match)

In [652]:
df['article_txt'] = df['article_txt'].replace("article6(1)", "article6(2)") # change to 6.2

In [653]:
df['article_62'] = df['text'].apply(article62)

In [654]:
df['article_62'].value_counts()

None                             5246
inconjunctionwithart6(2)          154
inconjunctionwitharticle6(2)       51
inconjunctionwitharticle6(1)        2
inconjunctionwitharticle22(3)       1
inconjunctionwithart3(4)            1
inconjunctionwitharticle3(4)        1
inconjunctionwitharticle4(1)        1
inconjunctionwitharticle18(4)       1
Name: article_62, dtype: int64

In [655]:
df['article_txt'] = np.where(df['article_62'].isin(['inconjunctionwitharticle6(2)', 'inconjunctionwithart6(2)']), 'article6(2)', df['article_txt'])

In [656]:
df['article_txt'].value_counts().sort_index()

None                 1
article11(1)         1
article17(2)        99
article21            1
article22           12
article22(1)         1
article22(3)        42
article232           1
article4             1
article4(4)         16
article4(5)          1
article57            2
article6(1)(a)       1
article6(1)(b)    4834
article6(2)        257
article6(4)          1
article7(3)         32
article8(1)         36
article8(2)         75
article8(3)         11
article8(4)          1
article9             3
article9(2)          1
article9(3)         26
article9(3)(b)       2
Name: article_txt, dtype: int64

In [657]:
# is referral if article == referral and not in 5 categories
mask = (df['article_txt'].isin(["article4(4)", "article22(3)", "article22", "article9(3)", "article9(3)(b)"]))
df.loc[mask, 'article_txt'] = "referral"

In [658]:
df['article_txt'].value_counts().sort_index()

None                 1
article11(1)         1
article17(2)        99
article21            1
article22(1)         1
article232           1
article4             1
article4(5)          1
article57            2
article6(1)(a)       1
article6(1)(b)    4834
article6(2)        257
article6(4)          1
article7(3)         32
article8(1)         36
article8(2)         75
article8(3)         11
article8(4)          1
article9             3
article9(2)          1
referral            98
Name: article_txt, dtype: int64

In [659]:
#TODO - retain decision with commitments document same article_txt as the first tag of the case num
# article_txt article17(2), 7(3) change to article

#17(2) get second instance

df.loc[df['article_txt'] == 'article7(3)', 'article_txt'] = df['article']
df.loc[df['article_txt'] == 'article17(2)', 'article_txt'] = df['article']

In [660]:
# subset based on article_txt
df1 = df[df['article_txt'].isin(["article6(1)(b)", "article6(2)", "article8(1)", "article8(2)", "article8(3)", "referral"])]

In [661]:
# no of cases by article
df1.groupby('article_txt')['case_num'].nunique()

article_txt
article6(1)(b)    4837
article6(2)        224
article8(1)         36
article8(2)         70
article8(3)         11
referral            62
Name: case_num, dtype: int64

In [662]:
# no of documents by article
df1['article_txt'].value_counts().sort_index()

article6(1)(b)    4865
article6(2)        333
article8(1)         37
article8(2)         96
article8(3)         11
referral           100
Name: article_txt, dtype: int64

In [663]:
df1['article_match'] = df1['article_txt'] == df1['article']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['article_match'] = df1['article_txt'] == df1['article']


In [664]:
df1['article_match'].value_counts()

True     5355
False      87
Name: article_match, dtype: int64

In [665]:
df1[df1['article_match']==False]['article_txt'].value_counts()

referral          76
article6(1)(b)     8
article6(2)        2
article8(2)        1
Name: article_txt, dtype: int64

In [666]:
df1[(df1['article_match'] == False) & (df1['article_txt'] == "article6(1)(b)")]

Unnamed: 0,article,case_num,filename,text,lang,file,id,article_txt,article_62,article_match
4907,article6(2),M.5364,art6.1\M.5364\m5364_620_5,\nEUROPEAN COMMISSION \nDG Competition \n \n ...,en,m5364_620_5,4907,article6(1)(b),,False
4922,article6(2),M.5020,art6.1\M.5020\m5020_20080711_20212_en,EN\nCase No COMP/M.5020 -\nLESAFFRE / GBI UK\n...,en,m5020_20080711_20212_en,4922,article6(1)(b),,False
4928,article6(2),M.4844,art6.1\M.4844\m4844_20071003_20212_en,Case No COMP/M.4844 - FORTIS / ABN AMRO ASSETS...,en,m4844_20071003_20212_en,4928,article6(1)(b),,False
5144,article6(2),M.8465,art6.1\M.8465\m8465_894_5,\nEUROPEAN COMMISSION \nDG Competition \n \n ...,en,m8465_894_5,5144,article6(1)(b),,False
5166,article6(2),M.8130,art6.1\M.8130\m8130_1247_5,\nEUROPEAN COMMISSION \nDG Competition \n \n ...,en,m8130_1247_5,5166,article6(1)(b),,False
5196,article6(2),M.7792,art6.1\M.7792\m7792_2313_3,EUROPEAN COMMISSION \nDG Competition \n \n \n ...,en,m7792_2313_3,5196,article6(1)(b),,False
5215,article6(2),M.9677,art6.1\M.9677\M_9677_8149323_3017_3,\n \nEUROPEAN COMMISSION \nDG Competition \n ...,en,M_9677_8149323_3017_3,5215,article6(1)(b),,False
5457,referral,M.8562,art9.3\M.8562\m8562_220_3,\n \nEUROPEAN COMMISSION \nDG Competition \n ...,en,m8562_220_3,5457,article6(1)(b),,False


In [667]:
df1.loc[(df1['article_match'] == False) & (df1['article_txt'] == "article6(1)(b)") & (df1['article'] == "article6(2)"), 'article_txt'] = "article6(2)"

In [668]:
# no of cases by article
df1.groupby('article_txt')['case_num'].nunique()

article_txt
article6(1)(b)    4830
article6(2)        226
article8(1)         36
article8(2)         70
article8(3)         11
referral            62
Name: case_num, dtype: int64

In [669]:
# no of documents by article
df1['article_txt'].value_counts().sort_index()

article6(1)(b)    4858
article6(2)        340
article8(1)         37
article8(2)         96
article8(3)         11
referral           100
Name: article_txt, dtype: int64

In [671]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5442 entries, 0 to 5457
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   article        5442 non-null   object
 1   case_num       5442 non-null   object
 2   filename       5442 non-null   object
 3   text           5442 non-null   object
 4   lang           5442 non-null   object
 5   file           5442 non-null   object
 6   id             5442 non-null   int64 
 7   article_txt    5442 non-null   object
 8   article_62     5442 non-null   object
 9   article_match  5442 non-null   bool  
dtypes: bool(1), int64(1), object(8)
memory usage: 430.5+ KB


In [672]:
# # save json file name
# date = datetime.date.today().strftime('%Y_%m_%d')

# file_name = f"../../../data/processed/df1_{date}.json"
# if os.path.exists(file_name):
#     os.remove(file_name)

# # save file as json
# df1.to_json(file_name)

### Extract specific sections

In [None]:
# TODO

### Removal of stopwords, punctuations, numeric characters

In [16]:
# nltk.download("stopwords")
# from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
# nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# TODO: retain digits, drop repeating terms based on TFIDF

In [675]:
def preprocess_corpus(texts):
    eng_stopwords = set(stopwords.words("english"))
    def remove_stops_digits(tokens):
        token_list =  [token.lower() for token in tokens if token not in eng_stopwords and token not in punctuation and token.isdigit() == False]
        processed_text = ' '.join(token_list)
        return processed_text
    return [remove_stops_digits(word_tokenize(text)) for text in texts]

In [677]:
df1['text_clean'] = preprocess_corpus(df1['text'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['text_clean'] = preprocess_corpus(df1['text'])


In [678]:
df1['text_clean'][1]

"european commission dg competition case m.9001 kuehne nagel temasek jv only english text available authentic regulation ec no 139/2004 merger procedure article b non-opposition date 24/07/2018 in electronic form eur-lex website document number 32018m9001 european commission brussels,24.7.2018 c final public version to notifying parties subject case m.9001 kuehne nagel/temasek/jv commission decision pursuant article b council regulation ec no 139/2004 article agreement european economic area dear sir madam on june european commission received notification proposed concentration pursuant article merger regulation kuehne nagel management ag `` k+n '' switzerland temasek holdings private limited `` temasek '' singapore acquire within meaning article b merger regulation joint control newly created joint venture `` jv '' singapore way purchase shares.3 the business activities undertakings concerned \uf02d k+n globally active logistics company main activities sea freight airfreight overland 

### Stemming and lemmatization

In [679]:
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

In [680]:
def stem_lemmatize(text):
    stemmed = [stemmer.stem(token) for token in word_tokenize(text)]
    lemmatized = [lemmatizer.lemmatize(token) for token in stemmed]
    processed_text = ' '.join(lemmatized)
    return processed_text

In [681]:
df1['text_clean'] = [stem_lemmatize(text) for text in df1['text_clean']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['text_clean'] = [stem_lemmatize(text) for text in df1['text_clean']]


In [682]:
df1['text_clean'][1]

"european commiss dg competit case m.9001 kuehn nagel temasek jv onli english text avail authent regul ec no 139/2004 merger procedur articl b non-opposit date 24/07/2018 in electron form eur-lex websit document number 32018m9001 european commiss brussels,24.7.2018 c final public version to notifi parti subject case m.9001 kuehn nagel/temasek/jv commiss decis pursuant articl b council regul ec no 139/2004 articl agreement european econom area dear sir madam on june european commiss receiv notif propos concentr pursuant articl merger regul kuehn nagel manag ag `` k+n `` switzerland temasek hold privat limit `` temasek `` singapor acquir within mean articl b merger regul joint control newli creat joint ventur `` jv `` singapor way purchas shares.3 the busi activ undertak concern \uf02d k+n global activ logist compani main activ sea freight airfreight overland forward well contract logist \uf02d temasek invest compani broad rang portfolio invest includ financi servic telecommun medium rea

In [683]:
# # save json file name
# date = datetime.date.today().strftime('%Y_%m_%d')

# file_name = f"../../../data/processed/pre-processed_0_{date}.json"
# if os.path.exists(file_name):
#     os.remove(file_name)

# # save file as json
# df1.to_json(file_name)

### Coreference resolution

In [2]:
# installing neuralcoref from source
#!git clone https://github.com/huggingface/neuralcoref.git
#!cd "D:\Desktop\Thesis\predicting-merger-decision-outcomes\src\python\notebook\neuralcoref"
# !pip install -r requirements.txt
# !pip install -e .
# !pip install spacy
# !pip install -U neuralcoref

In [687]:
import neuralcoref
import spacy

ModuleNotFoundError: No module named 'neuralcoref'

In [688]:
nlp = spacy.load('en_core_web_lg') 
neuralcoref.add_to_pipe(nlp)

KeyboardInterrupt: 

In [684]:
import spacy
nlp = spacy.load('en')

OSError: [E050] Can't find model 'en'. It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.

In [None]:
def coref_res(texts):
    doc = nlp(texts)
    clean = doc._.coref_resolved
    return clean

df['text_clean'] = [coref_res(text) for text in df['text_clean']]

### Drop based on TFIDF

In [None]:
#TODO: remove highly repeating words

### Language detection

In [None]:
for index, row in df.iterrows():
    df.at[index, 'language'] = detect(df.at[index, 'text_clean'])

df['language'].value_counts()

In [None]:
df = df[df['language']=="en"]