In [1]:
########################################
# packages used in this script         
########################################

import numpy as np
import sys
import pandas as pd
import csv
csv.field_size_limit(sys.maxsize)
import time
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import random
import pickle

from bs4 import BeautifulSoup as bs

In [11]:
###############################################################################################################################
# Function: read_data()
# Param:
#     - path (str): the path to the M&A news data file(s).
# Return:
#     - data (nparray): the input data read from the data path, one row per CSV record.
# Notes:
#     - Malformed CSV lines (wrong number of fields) are skipped with a warning instead of aborting the read.
###############################################################################################################################

def read_data(path):
    try:
        # pandas >= 1.3 spelling; "warn" matches the old error_bad_lines=False
        # (+ default warn_bad_lines=True) behaviour: skip the line, report it.
        frame = pd.read_csv(path, on_bad_lines="warn")
    except TypeError:
        # pandas < 1.3 does not know `on_bad_lines`; fall back to the legacy keyword,
        # which in turn no longer exists in pandas >= 2.0.
        frame = pd.read_csv(path, error_bad_lines=False)

    return frame.to_numpy()

In [12]:
################################################################################################################################
# Function: remove_inner_scheme()
# Param:
#     - data (nparray): the input data returned from the read_data() function; the markup is read from column index 3.
# Return:
#     - str_text (list): a list of cleaned news texts (which only contains text between <bodytext>)
# Notes:
#     - Rows whose column 3 is not text (e.g. NaN for a missing cell) are skipped.
#     - Rows whose markup has no <bodytext> element are skipped instead of raising IndexError.
################################################################################################################################

def remove_inner_scheme(data):
    str_text = []

    for row in data:
        raw_markup = row[3]

        # Only real text can be parsed. This also filters NaN in every flavour
        # (float, np.float64) and None, which the parser cannot handle.
        if not isinstance(raw_markup, (str, bytes)):
            continue

        # find() returns the first <bodytext> element, or None when there is none.
        body = bs(raw_markup, "html.parser").find('bodytext')
        if body is None:
            continue

        str_text.append(body.text)

    return str_text

In [13]:
################################################################################################################################
# Function: train_LDA()
# Param:
#     - str_text (list): the list of cleaned news texts returned from the remove_inner_scheme() function
#     - n_components (int): number of LDA topics to fit (default 10).
#     - random_state (int or None): seed for the document shuffle and for LDA; None keeps the run unseeded.
# Return (in this order):
#     - cv (model): the trained CountVectorizer model with pure M&A news.
#     - lda (model): the trained LDA model with pure M&A news.
# Raises:
#     - ValueError: if str_text is empty.
# The fitting time is printed in the stdout window
################################################################################################################################

def train_LDA(str_text, n_components=10, random_state=None):
    if len(str_text) == 0:
        raise ValueError("train_LDA() needs at least one document to fit on")

    # For now all the M&A news are training data (no test split is needed), so the
    # only effect of sampling is to shuffle the documents. A private generator is
    # used when a seed is given so the shuffle is reproducible as well.
    shuffler = random if random_state is None else random.Random(random_state)
    training_data = shuffler.sample(str_text, len(str_text))

    # main part for LDA
    start_time = time.time()

    cv = CountVectorizer(max_df = 0.95, min_df = 2, stop_words = 'english')
    df_train = cv.fit_transform(training_data)

    lda = LatentDirichletAllocation(n_components=n_components, random_state=random_state)
    lda.fit(df_train)

    print("running time: " + str(time.time()-start_time))
    return cv, lda

In [15]:
################################################################################################################################
# Function: print_topics()
# Param:
#     - lda (model): the trained LDA model returned from train_LDA() function.
#     - cv (model): the trained CountVectorizer model returned from train_LDA() function.
#     - n_top_words (int): how many of the highest-weighted words to print per topic (default 15).
# Return:
#     - None
# Raises:
#     - ValueError: if n_top_words is smaller than 1.
# Topics are printed in the stdout window; within a topic, words are listed from lowest to highest weight.
################################################################################################################################

def print_topics(lda, cv, n_top_words=15):
    if n_top_words < 1:
        raise ValueError("n_top_words must be at least 1")

    # Resolve the vocabulary once instead of once per printed word.
    # get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
    # and only fall back to the legacy method on old installations.
    if hasattr(cv, "get_feature_names_out"):
        vocabulary = cv.get_feature_names_out()
    else:
        vocabulary = cv.get_feature_names()
    # Plain str keeps the printed list free of numpy scalar reprs.
    feature_names = [str(term) for term in vocabulary]

    print("printing topics and words...")
    print("")
    for index, topic in enumerate(lda.components_):
        print(f'Top {n_top_words} words for Topic #{index}')
        # argsort() is ascending, so the tail holds the heaviest words.
        print([feature_names[i] for i in topic.argsort()[-n_top_words:]])
        print('\n')

In [17]:
# main function
# Runs the whole pipeline: read the CSV, extract the <bodytext> contents, train the
# CountVectorizer + LDA models, print the topics and pickle both models to disk.
# Param:
#     - path (str): path to the M&A news CSV file.
#     - lda_path (str): output file for the pickled LDA model.
#     - cv_path (str): output file for the pickled CountVectorizer model.
def main(path="data collection/M&A_news.csv", lda_path='lda.sav', cv_path='cv.sav'):
    print("Code starting...")
    # The default path assumes this jupyter notebook is in the same level with
    # folder "data collection" on the local machine.

    # read data.
    data = read_data(path)
    print("Finish reading data...")

    # clean inner scheme
    str_text = remove_inner_scheme(data)
    print("Finish cleaning inner schemes...")

    # train LDA with pure M&A news
    cv, lda = train_LDA(str_text)
    print("Finish training LDA model...")

    # print topics
    print_topics(lda, cv)

    # save LDA model to "lda.sav" and CountVectorizer model to "cv.sav" (by default).
    # `with` guarantees both files are flushed and closed even if pickling fails.
    with open(lda_path, 'wb') as lda_file:
        pickle.dump(lda, lda_file)
    with open(cv_path, 'wb') as cv_file:
        pickle.dump(cv, cv_file)
    print("Models saved...")

    print("Code finishing...")
    
# Run the pipeline only when this code is executed directly (script or notebook cell),
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Code starting...
Finish reading data...
Finish cleaning inner schemes...
running time: 23.26687979698181
Finish training LDA model...
printing topics and words...

Top 15 words for Topic #0
['real', 'based', 'ventures', 'rationale', 'estate', 'group', 'statuscompleted', 'venture', 'participantstarget', 'million', 'acquirer', 'management', 'investment', 'partners', 'capital']


Top 15 words for Topic #1
['managers', 'gross', 'intends', 'use', 'senior', 'acted', '15', 'public', 'capital', 'llc', 'notes', 'proceeds', 'securities', 'million', 'offering']


Top 15 words for Topic #2
['billion', '2018', 'merger', 'offering', 'capital', 'transaction', 'investment', 'said', 'intends', 'aramco', '2019', 'shares', 'ipo', 'saudi', 'update']


Top 15 words for Topic #3
['llp', 'legal', 'announced', 'capital', 'advisor', 'group', '000', 'acting', 'price', 'share', 'stock', 'offering', 'million', 'common', 'shares']


Top 15 words for Topic #4
['typeacquisitionsub', 'acquisition', 'llc', 'value', 'p