In [66]:
# Data manipulation
import pandas as pd
import numpy as np

In [68]:
# Pre-processing
import glob, re, os, sys, random

from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation, strip_numeric, strip_multiple_whitespaces, strip_non_alphanum, stem_text, remove_stopwords

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from string import punctuation
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

## Data Pre-processing

In [168]:
# Load the processed input frame; the path is relative to the notebook's location.
input_path = r"../../../data/processed/df_eng_clean_filtered_2023_03_11.json"
df = pd.read_json(input_path)

In [169]:
# Number of whitespace-delimited tokens in each raw section text.
raw_tokens = df['sec_text'].str.split()
df['count'] = raw_tokens.apply(len)

Lowercase the text and remove non-alphanumeric characters, digits, punctuation, and stopwords

In [170]:
# Ordered gensim filters applied to every document: lowercase first, then strip
# HTML tags, punctuation, non-alphanumeric characters, repeated whitespace,
# digits and stopwords. Stemming (stem_text) is intentionally left out.
CUSTOM_FILTERS = [lambda x: x.lower(), strip_tags, strip_punctuation, strip_non_alphanum, strip_multiple_whitespaces, strip_numeric, remove_stopwords] #stem_text
# The list used to be bound only to `preprocess_function` while the call below
# read the undefined `CUSTOM_FILTERS` (NameError on a fresh kernel). Keep the old
# name as an alias so anything still referring to it continues to work.
preprocess_function = CUSTOM_FILTERS

# preprocess_string returns a list of tokens; join them back into one string per row.
df['text_clean'] = df['sec_text'].apply(lambda x: " ".join(preprocess_string(str(x), CUSTOM_FILTERS)))

Lemmatize text, no stemming

In [171]:
lemmatizer = WordNetLemmatizer()
# stemmer = PorterStemmer()

def stem_lemmatize(text):
    """Tokenize ``text`` with NLTK and return its lemmatized tokens joined by single spaces.

    Despite the name, no stemming is applied: the stemmer is commented out and
    only ``lemmatizer.lemmatize`` (default arguments) is run on each token.
    """
    tokens = word_tokenize(text)
    return ' '.join(map(lemmatizer.lemmatize, tokens))

In [172]:
# Lemmatize every cleaned text; the result overwrites `text_clean` row by row (positional).
df['text_clean'] = list(map(stem_lemmatize, df['text_clean']))

Get length of text_clean

In [173]:
# Number of whitespace-delimited tokens left in each text after cleaning.
clean_tokens = df['text_clean'].str.split()
df['count_text'] = clean_tokens.apply(len)

In [188]:
# Shortest and longest cleaned text, measured in whitespace-delimited tokens.
token_counts = df['count_text']
print(f"min: {min(token_counts)} \nmax: {max(token_counts)}")

min: 9 
max: 138766


In [209]:
# Which article types do the very short texts (20 tokens or fewer after cleaning) belong to?
df.loc[df['count_text'] <= 20, 'article_new'].value_counts()

simplified        311
article6(1)(b)      3
article6(2)         1
Name: article_new, dtype: int64

In [219]:
# Number of distinct cases (unique case_num) per article_new, ordered by article label.
cases_per_article = df.groupby('article_new')['case_num'].nunique()
cases_per_article.sort_index()

article_new
article6(1)(b)    1284
article6(2)        201
article8(1)         32
article8(2)         57
article8(3)          9
referral            59
simplified        3194
Name: case_num, dtype: int64

Add labels

In [226]:
# Phase 1 vs. Phase 2
# 0 = article6(1)(b) / article6(2); 1 = every other article.
# 'referral' and 'simplified' rows keep their article name as the label.
is_phase1 = df['article_new'].isin(['article6(1)(b)', 'article6(2)'])
phase_keeps_name = df['article_new'].isin(['referral', 'simplified'])

df['phase2'] = np.where(is_phase1, 0, 1)
df['phase2'] = np.where(phase_keeps_name, df['article_new'], df['phase2'])

# Distinct cases per label.
df.groupby('phase2')['case_num'].nunique()

phase2
0             1485
1               98
referral        59
simplified    3194
Name: case_num, dtype: int64

In [227]:
# Row-level counts per phase2 label (the groupby in the previous cell counts unique case_num instead).
df['phase2'].value_counts()

0             5079
simplified    3194
1              292
referral       204
Name: phase2, dtype: int64

In [216]:
# Commented-out draft of the binary 'wc' label (1 = article6(2) / article8(2), 0 = everything else).
# df['wc'] =""
# df['wc'] = np.where((df['article_new'].isin(['article6(2)', 'article8(2)'])), 1, 0)
# df.groupby('wc')['case_num'].nunique()
# df[(df['wc']==1)]['article_new'].value_counts()
# df[(df['wc']==1)].groupby('article_new')['case_num'].nunique() # is there a duplicate by case_num but different filename?

wc
0    4548
1     257
Name: case_num, dtype: int64

In [228]:
# With conditions vs. Without conditions
# 1 = article6(2) / article8(2); 0 = every other article.
# 'referral', 'simplified' and 'article8(3)' rows keep their article name as the label.
has_conditions = df['article_new'].isin(['article6(2)', 'article8(2)'])
wc_keeps_name = df['article_new'].isin(['referral', 'simplified', 'article8(3)'])

df['wc'] = np.where(has_conditions, 1, 0)
df['wc'] = np.where(wc_keeps_name, df['article_new'], df['wc'])

# Distinct cases per label.
df.groupby('wc')['case_num'].nunique()


wc
0              1316
1               257
article8(3)       9
referral         59
simplified     3194
Name: case_num, dtype: int64

In [229]:
# Row-level counts per wc label (the groupby in the previous cell counts unique case_num instead).
df['wc'].value_counts()

0              4509
simplified     3194
1               834
referral        204
article8(3)      28
Name: wc, dtype: int64

In [230]:
# Potentially anticompetitive vs. No harm to competition
# 0 = article6(2) / article8(2) / article8(3); 1 = every other article.
# 'referral' and 'simplified' rows keep their article name as the label.
is_flagged_article = df['article_new'].isin(['article6(2)', 'article8(2)', 'article8(3)'])
competition_keeps_name = df['article_new'].isin(['referral', 'simplified'])

df['competition'] = np.where(is_flagged_article, 0, 1)
df['competition'] = np.where(competition_keeps_name, df['article_new'], df['competition'])

# Distinct cases per label.
df.groupby('competition')['case_num'].nunique()

competition
0              266
1             1316
referral        59
simplified    3194
Name: case_num, dtype: int64

In [231]:
# Row-level counts per competition label (the groupby in the previous cell counts unique case_num instead).
df['competition'].value_counts()

1             4509
simplified    3194
0              862
referral       204
Name: competition, dtype: int64

In [232]:
# Final schema check before saving: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8769 entries, 0 to 11041
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   year         8769 non-null   int64 
 1   article_new  8769 non-null   object
 2   case_num     8769 non-null   object
 3   file         8769 non-null   object
 4   section_fin  8769 non-null   object
 5   len_pdf      8769 non-null   int64 
 6   sec_text     8769 non-null   object
 7   count        8769 non-null   int64 
 8   text_clean   8769 non-null   object
 9   count_text   8769 non-null   int64 
 10  phase2       8769 non-null   object
 11  wc           8769 non-null   object
 12  competition  8769 non-null   object
dtypes: int64(4), object(9)
memory usage: 1.2+ MB


In [233]:
# `datetime` is used below but is not imported in any of the visible import cells,
# so this cell raised NameError on a fresh kernel; import it here.
import datetime

# save json file name (stamped with today's date, e.g. pre-processed_2023_03_11.json)
date = datetime.date.today().strftime('%Y_%m_%d')

file_name = f"../../../data/processed/pre-processed_{date}.json"
# Remove an export written earlier on the same date so the new file replaces it.
if os.path.exists(file_name):
    os.remove(file_name)

# save file as json
df.to_json(file_name)

Pre-processing with NLTK (backup approach)

In [103]:
# nltk.download("stopwords")
# from nltk.corpus import stopwords

In [104]:
# nltk.download('punkt')

In [105]:
## Removal of stopwords, punctuations, numeric characters
# def preprocess_corpus(texts):
#     eng_stopwords = set(stopwords.words("english"))
#     def remove_stops_digits(tokens):
#         token_list =  [token.lower() for token in tokens if token not in eng_stopwords and token not in punctuation and token.isdigit() == False]
#         processed_text = ' '.join(token_list)
#         return processed_text
#     return [remove_stops_digits(word_tokenize(text)) for text in texts]

Coreference resolution

In [None]:
# df = pd.read_json(r"../../../data/processed/pre-processed_2023_03_11.json")

In [89]:
# #installing neuralcoref from source
# !git clone https://github.com/huggingface/neuralcoref.git
# !cd "D:\Desktop\Thesis\predicting-merger-decision-outcomes\src\python\notebook\neuralcoref"
# !pip install -r requirements.txt
# !pip install -e .
# !pip install spacy
# !pip install -U neuralcoref

In [95]:
# import neuralcoref

In [91]:
# import spacy

In [92]:
# nlp = spacy.load('en_core_web_lg') 

In [None]:
# neuralcoref.add_to_pipe(nlp)

In [93]:
# import spacy
# nlp = spacy.load('en')

In [None]:
# def coref_res(texts):
#     doc = nlp(texts)
#     clean = doc._.coref_resolved
#     return clean

# df['text_clean'] = [coref_res(text) for text in df['text_clean']]