## Software Requirements

In [None]:
!conda env create -f environment.yml

In [None]:
!conda activate thesis

In [1]:
# Data visualization
import matplotlib.pyplot as plt 

# Data manipulation
import pandas as pd
import numpy as np
import csv
from zipfile import ZipFile

In [2]:
# Webscraping
import glob
import requests
from bs4 import BeautifulSoup
import time
import datetime
from pandas.core.common import flatten
import os
from itertools import chain
from tqdm import tqdm
import json
import urllib.request

In [3]:
# Parsing and pre-processing
from glob import glob
import os 
import re

from pdfminer.high_level import extract_text
import pdfplumber
from langdetect import detect, DetectorFactory

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from string import punctuation
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [4]:
import sys
sys.path.append(f'../../python')
from scraper import createFolder, get_merger_links, download_pdf, get_merger_info, parse_pdf

In [5]:
# Vector representations and embeddings
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim

In [6]:
# Logistic and XGboost
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, precision_score, recall_score
from xgboost import XGBClassifier
import pickle

In [7]:
# LSTM 
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader

import torch
import torch.nn as nn
from torch.nn import functional as F
import torch.optim as optim
import gc

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# BERT models
from torch.utils.data import TensorDataset, RandomSampler, SequentialSampler
import transformers
from transformers import AutoModel, BertTokenizerFast

In [9]:
# specify GPU
device = "cuda" if torch.cuda.is_available() else "cpu"

## Data Pre-processing

In [10]:
df = pd.read_json(r"../../../data/processed/data_merged_2023_01_27.json") 
df['id'] = df.index

In [11]:
df['lang'].value_counts()

en    1773
fr       3
de       1
sv       1
Name: lang, dtype: int64

### Subset English merger decisions

In [12]:
# df[df['lang'] !="en"]
df = df[df['lang']=="en"]


In [13]:
df['article'].value_counts()

art6.0    1240
art6.1     358
art8.2     114
art8.1      47
art8.3      14
Name: article, dtype: int64

In [14]:
df.text[0]

' \nEUROPEAN COMMISSION \nDG Competition \n \n \n \n \nCase M.10568 - IIF INTERNATIONAL \nHOLDING / G+E GETEC HOLDING \n \n \n \n \nOnly the English text is available and authentic. \n \n \n \nREGULATION (EC) No 139/2004 \nMERGER PROCEDURE \n \n \n \nArticle 6(1)(b) NON-OPPOSITION \nDate: 03/03/2022 \n \n \n \n \n \n \n \n \n \n \n \nIn electronic form on the EUR-Lex website under document \nnumber 32022M10568 \n   \nEUROPEAN COMMISSION \n \nBrussels, 3.3.2022 \nC(2022) 1432 final \nPUBLIC VERSION \nIIF Int’l Holding L.P. \nUgland House, South Church Street \nKY1 1104 George Town \nCayman Islands \nSubject:  Case M.10568 – IIF INTERNATIONAL HOLDING / G+E GETEC \nHOLDING \nCommission decision pursuant to Article 6(1)(b) of Council Regulation \n(EC) No 139/20041 and Article 57 of the Agreement on the European \nEconomic Area2  \nDear Sir or Madam, \n1.  On 9 February 2022, the European Commission received notification of a proposed \nconcentration  pursuant  to  Article  4  of  the  Merg

### Extract specific sections

In [15]:
#### TODO: regex sections depending on article

### Removal of stopwords, punctuations, numeric characters

In [16]:
nltk.download("stopwords")
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [18]:
def preprocess_corpus(texts):
    eng_stopwords = set(stopwords.words("english"))
    def remove_stops_digits(tokens):
        token_list =  [token.lower() for token in tokens if token not in eng_stopwords and token not in punctuation and token.isdigit() == False]
        processed_text = ' '.join(token_list)
        return processed_text
    return [remove_stops_digits(word_tokenize(text)) for text in texts]

In [19]:
df['text_clean'] = preprocess_corpus(df['text'])

In [20]:
df['text_clean'][1]

'european commission dg competition case m.10566 p66 h2 ee jv only english text available authentic regulation ec no 139/2004 merger procedure article b non-opposition date 04/03/2022 in electronic form eur-lex website document number 32022m10566 european commission brussels 4.3.2022 c final public version phillips city west blvd houston tx united states america h2 energy europe ag hohlstrasse 186/188 zürich switzerland subject case m.10566 – p66 h2 ee jv commission decision pursuant article b council regulation ec no 139/20041 article agreement european economic area2 dear sir madam on february european commission received notification proposed concentration pursuant article merger regulation undertakings phillips company “ p66 ” united states h2 energy europe ag “ h2 ee ” switzerland acquire within meaning article b merger regulation joint control whole undertaking newco austria way purchase shares.3 the business activities concerned undertakings \uf02d p66 energy company amongst oth

### Stemming and lemmatization

In [21]:
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

In [22]:
def stem_lemmatize(text):
    stemmed = [stemmer.stem(token) for token in word_tokenize(text)]
    lemmatized = [lemmatizer.lemmatize(token) for token in stemmed]
    processed_text = ' '.join(lemmatized)
    return processed_text

In [23]:
df['text_clean'] = [stem_lemmatize(text) for text in df['text_clean']]

In [24]:
df['text_clean'][1]

'european commiss dg competit case m.10566 p66 h2 ee jv onli english text avail authent regul ec no 139/2004 merger procedur articl b non-opposit date 04/03/2022 in electron form eur-lex websit document number 32022m10566 european commiss brussel 4.3.2022 c final public version phillip citi west blvd houston tx unit state america h2 energi europ ag hohlstrass 186/188 zürich switzerland subject case m.10566 – p66 h2 ee jv commiss decis pursuant articl b council regul ec no 139/20041 articl agreement european econom area2 dear sir madam on februari european commiss receiv notif propos concentr pursuant articl merger regul undertak phillip compani “ p66 ” unit state h2 energi europ ag “ h2 ee ” switzerland acquir within mean articl b merger regul joint control whole undertak newco austria way purchas shares.3 the busi activ concern undertak \uf02d p66 energi compani amongst other run fuel station \uf02d h2 ee activ hydrogen technolog develop ecosystem renew hydrogen oj l 29.1.2004 p. ‘ me

### Coreference resolution

In [2]:
# installing neuralcoref from source
#!git clone https://github.com/huggingface/neuralcoref.git
#!cd "D:\Desktop\Thesis\predicting-merger-decision-outcomes\src\python\notebook\neuralcoref"
# !pip install -r requirements.txt
# !pip install -e .
# !pip install spacy
# !pip install -U neuralcoref

In [13]:
# import neuralcoref
# import spacy

In [15]:
# nlp = spacy.load('en_core_web_lg') 
# neuralcoref.add_to_pipe(nlp)

In [None]:
import spacy
nlp = spacy.load('en')

In [None]:
def coref_res(texts):
    doc = nlp(texts)
    clean = doc._.coref_resolved
    return clean

df['text_clean'] = [coref_res(text) for text in df['text_clean']]

### Language detection

In [None]:
for index, row in df.iterrows():
    df.at[index, 'language'] = detect(df.at[index, 'text_clean'])

df['language'].value_counts()

In [None]:
df = df[df['language']=="en"]