## Software Requirements

In [None]:
!conda env create -f environment.yml

In [None]:
!conda activate thesis

In [66]:
# Data manipulation
import pandas as pd
import numpy as np
import csv
from zipfile import ZipFile

In [67]:
# Webscraping
import glob
import requests
from bs4 import BeautifulSoup
import time
import datetime
from pandas.core.common import flatten
import os
from itertools import chain
from tqdm import tqdm


In [68]:
# Parsing and pre-processing
import glob, re, os, sys, random
from random import shuffle

from pdfminer.high_level import extract_text
import pdfplumber
from langdetect import detect, DetectorFactory

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from string import punctuation
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [69]:
# Vector representations and embeddings
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import gensim

In [70]:
# Logistic, XGBOOST, SVM
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, precision_score, recall_score
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support
from xgboost import XGBClassifier
import pickle

In [71]:
# LSTM 
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader

import torch
import torch.nn as nn
from torch.nn import functional as F
import torch.optim as optim
import gc

In [72]:
# BERT models
from torch.utils.data import TensorDataset, RandomSampler, SequentialSampler
import transformers
from transformers import AutoModel, BertTokenizerFast

In [73]:
# specify GPU
device = "cuda" if torch.cuda.is_available() else "cpu"

## Data Pre-processing

In [74]:
df = pd.read_json(r"../../../data/processed/df_eng_clean_filtered_2023_03_09.json")

### Removal of stopwords, punctuations, numeric characters

In [16]:
# nltk.download("stopwords")
# from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
# nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# TODO: retain digits, drop repeating terms based on TFIDF

In [76]:
def preprocess_corpus(texts):
    eng_stopwords = set(stopwords.words("english"))
    def remove_stops_digits(tokens):
        token_list =  [token.lower() for token in tokens if token not in eng_stopwords and token not in punctuation and token.isdigit() == False]
        processed_text = ' '.join(token_list)
        return processed_text
    return [remove_stops_digits(word_tokenize(text)) for text in texts]

In [77]:
df['text_clean'] = preprocess_corpus(df['sec_text'])

In [79]:
df['text_clean'][0]

"\uf02d sl04 private equity firm indirectly controlled l catterton whose portfolio companies operate retail restaurant business food beverage business consumer service consumer product businesses including production sale cosmetics fragrance products \uf02d ambienta private equity firm whose portfolio includes companies operating renewable power biofuels energy efficiency pollution mitigation waste water resource management sectors oj l 29.1.2004 p. 'merger regulation with effect december treaty functioning european union 'tfeu introduced certain changes replacement 'community 'union 'common market 'internal market the terminology tfeu used throughout decision oj l 3.1.1994 p. 'eea agreement publication official journal european union no c 26.7.2018 p."

### Stemming and lemmatization

In [80]:
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

In [81]:
def stem_lemmatize(text):
    stemmed = [stemmer.stem(token) for token in word_tokenize(text)]
    lemmatized = [lemmatizer.lemmatize(token) for token in stemmed]
    processed_text = ' '.join(lemmatized)
    return processed_text

In [82]:
df['text_clean'] = [stem_lemmatize(text) for text in df['text_clean']]

In [83]:
df['text_clean'][0]

"\uf02d sl04 privat equiti firm indirectli control l catterton whose portfolio compani oper retail restaur busi food beverag busi consum servic consum product busi includ product sale cosmet fragranc product \uf02d ambienta privat equiti firm whose portfolio includ compani oper renew power biofuel energi effici pollut mitig wast water resourc manag sector oj l 29.1.2004 p. 'merger regul with effect decemb treati function european union 'tfeu introduc certain chang replac 'commun 'union 'common market 'intern market the terminolog tfeu use throughout decis oj l 3.1.1994 p. 'eea agreement public offici journal european union no c 26.7.2018 p ."

In [86]:
# save json file name
date = datetime.date.today().strftime('%Y_%m_%d')

file_name = f"../../../data/processed/pre-processed_{date}.json"
if os.path.exists(file_name):
    os.remove(file_name)

# save file as json
df.to_json(file_name)

### Coreference resolution

In [None]:
df = pd.read_json(r"../../../data/processed/pre-processed_2023_03_09.json")

In [89]:
# #installing neuralcoref from source
# !git clone https://github.com/huggingface/neuralcoref.git
# !cd "D:\Desktop\Thesis\predicting-merger-decision-outcomes\src\python\notebook\neuralcoref"
# !pip install -r requirements.txt
# !pip install -e .
# !pip install spacy
# !pip install -U neuralcoref

In [95]:
import neuralcoref

In [91]:
import spacy

In [92]:
nlp = spacy.load('en_core_web_lg') 

In [None]:
neuralcoref.add_to_pipe(nlp)

In [93]:
# import spacy
# nlp = spacy.load('en')

In [None]:
def coref_res(texts):
    doc = nlp(texts)
    clean = doc._.coref_resolved
    return clean

df['text_clean'] = [coref_res(text) for text in df['text_clean']]

### Drop based on TFIDF

In [None]:
#TODO: remove highly repeating words