<b> Apply Topic Modelling (using LDA, NMF, TFIDF) on scraped SEC reports, to find topics discussed among all the reports </b>

In [2]:
import pandas as pd
import numpy as np
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from bs4 import BeautifulSoup
import os
# Plotting tools
#import pyLDAvis
#import pyLDAvis.gensim  
import matplotlib.pyplot as plt
import pyarrow
import pickle
from tqdm import tqdm, tqdm_notebook
from tqdm import notebook
import joblib
%matplotlib inline

In [7]:
#os.chdir("sec_10k_new2")
#.chdir("sec_10k_new")
files_req = [x for x in os.listdir()]

In [31]:
files_req[0].split("_")[0]

'carnival'

In [9]:
def clean_html(html_text):
    soup = BeautifulSoup(html_text)
    for script in soup(["script"]): 
        script.extract()
    fin_text = soup.get_text()
    fin_text = fin_text.replace("\n","").replace("\t","").replace("\xa0","").replace("\ufeff","")
    return(fin_text)

In [10]:
doc_df = pd.DataFrame()
i = 0
for file in notebook.tqdm(files_req):
    html_file = open(file, 'r', encoding='utf-8',errors='ignore')
    html_body = html_file.read() 
    clean_text = clean_html(html_body)
    doc_df = doc_df.append(pd.Series(["doc_"+str(i),clean_text]),ignore_index=True)
    i +=1

HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))




In [11]:
doc_df.columns = ["doc_name","text"]
joblib.dump(doc_df,"doc_df.pkl")

['doc_df.pkl']

### Text Pre-processing

Download required packages

In [12]:
import nltk
from nltk.corpus import stopwords as sw
from nltk.stem import WordNetLemmatizer 
from nltk.stem import SnowballStemmer
import gensim
from gensim.utils import simple_preprocess

In [13]:
def clean_text(text_blob):
    """
    This function aims to perform a series of text processing tasks - 
    For each paragraph
        a)basic pre-processing and removal of punctuations
        b)perform POS tagiing
        c)remove stopwords, non alphabetical character sequences and proper nouns
    
    Input : Simple raw text corpus
    Output : list of processed text
    """
    stopwords = sw.words("english")
    ps = SnowballStemmer("english")
    #ps = PorterStemmer()
    para = ' '.join(simple_preprocess(text_blob,deacc = True)) #Basic preprocessing and removal of punctuations
    tagged_list = nltk.tag.pos_tag(para.split())
    fin_text_blob = []
    for word,tag in tagged_list:
        if((word.lower() not in stopwords) and (word.isalpha()) and (len(word) >= 3) and (tag != "NNP" and tag != "NNPS")):
            fin_text_blob.extend([ps.stem(word.lower())])
    return(fin_text_blob) #This function hence returns the original text blob, processed 

In [14]:
text_list = []
for i in notebook.tqdm(range(doc_df.shape[0])):
    text_blob = clean_text(doc_df.text[i])
    text_list.extend([text_blob])

HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))




Create a bigram module

In [15]:
import gensim
bigram = gensim.models.Phrases(text_list, min_count=10, threshold=50) #Generating bigrams
bigram_mod = gensim.models.phrases.Phraser(bigram)

In [16]:
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

In [17]:
data_words_bigrams = make_bigrams(text_list) #add bigrams to paragraph word lists

In [134]:
# Create Dictionary
id2word = corpora.Dictionary(data_words_bigrams)

# Create Corpus
texts = data_words_bigrams

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
id2word.filter_extremes(no_below=4, no_above=0.5)

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

### Document Similarity using gensim tfidf

In [81]:
#corpus
from gensim.models import TfidfModel
model = TfidfModel(corpus)
vector = model[corpus]

In [20]:
from gensim.test.utils import common_corpus, common_dictionary
from gensim.similarities import MatrixSimilarity

In [21]:
index = MatrixSimilarity(vector)

In [22]:
### Example
index[model[corpus[4]]]

array([1.0000001, 1.0000001, 0.       , 0.       , 1.0000001, 0.       ,
       0.       , 1.0000001, 0.       ], dtype=float32)

In [23]:
### Load test doc
doc_df2 = pd.DataFrame()
html_file = open(r"C:\Users\arnab\Desktop\Berkeley\MFE\Sessions\mfe_nlp\royal_caribbean_10k.html", 'r', encoding='utf-8',errors='ignore')
html_body = html_file.read() 
clean_text2 = clean_html(html_body)
doc_df2 = doc_df2.append(pd.Series(["doc_"+str(i),clean_text2]),ignore_index=True)
doc_df2.columns = ["doc_name","text"]

text_list2 = []
for i in notebook.tqdm(range(doc_df2.shape[0])):
    text_blob2 = clean_text(doc_df2.text[i])
    text_list2.extend([text_blob])

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [24]:
textArr2 = np.array(text_list2).ravel()

In [25]:
text2 = " ".join(textArr2)

In [26]:
corpus2 = [id2word.doc2bow(text) for text in text_list2]

In [27]:
print(index[model[corpus2]])

[[0.         0.         0.97311014 0.96655214 0.         0.97335756
  0.9701577  0.         0.97659695]]


### Document Similarity using sklearn tfidf

In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_df=.65, min_df=4, stop_words='english')
transformed_documents = vectorizer.fit_transform(doc_df.text.values.astype('U'))

In [54]:
transformed_documents_as_array = transformed_documents.toarray()
# use this line of code to verify that the numpy array represents the same number of documents that we have in the file list
len(transformed_documents_as_array)

9

In [55]:
from sklearn.metrics.pairwise import cosine_similarity
def get_tf_idf_query_similarity(vectorizer, docs_tfidf, query):
    """
    vectorizer: TfIdfVectorizer model
    docs_tfidf: tfidf vectors for all docs
    query: query doc

    return: cosine similarity between query and all docs
    """
    query_tfidf = vectorizer.transform([query])
    cosineSimilarities = cosine_similarity(query_tfidf, docs_tfidf).flatten()
    return cosineSimilarities

In [56]:
get_tf_idf_query_similarity(vectorizer, transformed_documents, text2)

array([0.12082512, 0.12082512, 0.11491558, 0.11661261, 0.12082512,
       0.11451854, 0.11330393, 0.12082512, 0.11310596])

### LDA Modeling

We now deploy LDA model on the corpus, to arrive at a list of an optimum number of "topics" in the paragraphs. We use a multicore version of LDA to arrive at convergence faster

In [109]:
lda_model = gensim.models.ldamulticore.LdaMulticore(corpus=vector,
                                           id2word=id2word,
                                           num_topics=3, 
                                           random_state=100,
                                           chunksize=500,
                                           passes=20,
                                           alpha=0.01,
                                           eta='auto',
                                           minimum_probability = 0.3,
                                           minimum_phi_value=0.3,         
                                           decay=0.5,
                                           per_word_topics=True,workers=15)

In [107]:
lda_model.print_topics(num_words=8)

[(0,
  '0.003*"standardon" + 0.003*"specifyth" + 0.003*"taggedfact" + 0.003*"svg" + 0.003*"subtop" + 0.003*"stretch" + 0.003*"statementsbi" + 0.003*"startedth"'),
 (1,
  '0.014*"search" + 0.013*"contentsitem" + 0.012*"swap" + 0.011*"fund" + 0.011*"pipelin_pipelin" + 0.011*"period_period" + 0.011*"instal_instal" + 0.010*"fraction_fraction"'),
 (2,
  '0.003*"contentsitem" + 0.003*"search" + 0.003*"partner_partner" + 0.003*"period_period" + 0.003*"pipelin_pipelin" + 0.003*"origin_origin" + 0.003*"engin_engin" + 0.003*"background_imag"'),
 (3,
  '0.003*"fund" + 0.003*"contentsitem" + 0.003*"pipelin_pipelin" + 0.003*"loan_loan" + 0.003*"instal_instal" + 0.003*"fraction_fraction" + 0.003*"translat_translat" + 0.003*"engin_engin"'),
 (4,
  '0.003*"search" + 0.003*"background_imag" + 0.003*"popup" + 0.003*"webkit_box" + 0.003*"taxonomi_true" + 0.003*"deg" + 0.003*"set" + 0.003*"rgba"')]

<b> Coherence scores </b> are a very common metric for scoring the relevance of an LDA model. These scores are based on the idea of co-occurence of topic words in sliding windows on the raw text

In [97]:
# Compute   Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_words_bigrams, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.7140088108352263


In [100]:
# def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
#     coherence_values = []
#     model_list = []
#     for num_topics in range(start, limit, step):
#         model = gensim.models.ldamulticore.LdaMulticore(corpus=corpus, num_topics=num_topics, id2word=id2word,random_state=100)
#         model_list.append(model)
#         coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
#         coherence_values.append(coherencemodel.get_coherence())

#     return model_list, coherence_values
# # Can take a long time to run.
# model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_words_bigrams, start=2, limit=40, step=6)
# # Show graph
# limit=40; start=2; step=6;
# x = range(start, limit, step)
# plt.plot(x, coherence_values)
# plt.xlabel("Num Topics")
# plt.ylabel("Coherence score")
# plt.legend(("coherence_values"), loc='best')
# plt.show()

# # Print the coherence scores
# for m, cv in zip(x, coherence_values):
#     print("Num Topics =", m, " has Coherence Value of", round(cv, 4))
    
# # Select the model and print the topics
# optimal_model = model_list[3]
# model_topics = optimal_model.show_topics(formatted=False)


### Paragraph Level Mapping

Now we create a function to map the the top 3 most dominant topics for every paragraph

In [139]:
# def format_topics_paragraphs(ldamodel=lda_model, corpus=corpus, texts=texts):
#     """
#     This function takes the lda model and the corpus as an input, so as to map the top topics and their percentages
#     Input: lda_model,corpus, raw_text
#     Output : Dataframe with top 3 topics and their percentages
#     """
#     # Init output
#     sent_topics_df = pd.DataFrame()

#     # Get main topic in each document
#     for i, row_list in tqdm(enumerate(ldamodel[corpus])):
#         row = row_list[0] if ldamodel.per_word_topics else row_list            
#         # print(row)
#         row = sorted(row, key=lambda x: (x[1]), reverse=True)
#         # Get the Dominant topic, Perc Contribution and Keywords for each document
#         for j, (topic_num, prop_topic) in enumerate(row):
#             if j == 0:  # => dominant topic
#                 wp = ldamodel.show_topic(topic_num)
#                 topic_keywords = ", ".join([word for word, prop in wp])
#                 ser1 = pd.Series([int(topic_num), round(prop_topic,4)])
#                # print(ser1)
#             if j == 1:
#                 ser1[2] = int(topic_num)
#                 ser1[3] = round(prop_topic,4)
#                 #print(ser1)
#             if j == 2:
#                 ser1[4] = int(topic_num)
#                 ser1[5] = round(prop_topic,4)
#                 #print(ser1)
#         sent_topics_df = sent_topics_df.append(ser1,ignore_index = True)
#         print(sent_topics_df)
#     sent_topics_df.columns = ['topic1', 'contrib1','topic2', 'contrib2','topic3', 'contrib3']
#     # Add original text to the end of the output
#     contents = pd.Series(texts)
#     sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
#     return(sent_topics_df)


# df_topic_sents_keywords = format_topics_paragraphs(ldamodel=lda_model, corpus=corpus, texts=texts)

# # Format
# df_dominant_topic = df_topic_sents_keywords.reset_index()
# df_dominant_topic.columns = ['Document_No','topic1', 'contrib1','topic2', 'contrib2','topic3', 'contrib3', 'Text']
# df_dominant_topic.head(10)

#### TF-IDF vectorization entire text corpus

In [169]:
doc_df['Preprocessed'] = texts
doc_df['Preprocessed'] = doc_df.apply(lambda x: " ".join(x['Preprocessed']),axis=1)

In [170]:
doc_df.head()

Unnamed: 0,doc_name,text,Preprocessed
0,doc_0,.picker_wrapper.no_alpha .picker_alpha{display...,display none posit_absolut index opac display ...
1,doc_1,.picker_wrapper.no_alpha .picker_alpha{display...,display none posit_absolut index opac display ...
2,doc_2,",term,score0,subsea,236.454126129562071,fmc,17...",term score subsea fmc surfac crude backlog sch...
3,doc_3,",term,score0,merchandise,138.547339529040271,k...",term score merchandis kohl store retail conten...
4,doc_4,.picker_wrapper.no_alpha .picker_alpha{display...,display none posit_absolut index opac display ...


In [171]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000,max_df=.65, min_df=5, stop_words='english')
tfidf_vect.fit(doc_df.Preprocessed.values)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.65, max_features=5000,
                min_df=5, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='\\w{1,}', tokenizer=None,
                use_idf=True, vocabulary=None)

In [172]:
doc_term_matrix = tfidf_vect.transform(doc_df.Preprocessed.values)

In [181]:
from sklearn.decomposition import LatentDirichletAllocation
LDA = LatentDirichletAllocation(n_components=3, random_state=42,max_iter=500)
LDA.fit(doc_term_matrix)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=500,
                          mean_change_tol=0.001, n_components=3, n_jobs=None,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [182]:
for i,topic in enumerate(LDA.components_):
    print(f'Top 10 words for topic #{i}:')
    print([tfidf_vect.get_feature_names()[i] for i in topic.argsort()[-10:]])
    print('\n')

Top 10 words for topic #0:
['liabil', 'impair', 'servic', 'fuel', 'percent', 'depreci', 'accru', 'stock', 'tax', 'equiti']


Top 10 words for topic #1:
['adjust', 'defer', 'year', 'cash', 'amend', 'cost', 'incom', 'tabl', 'net', 'total']


Top 10 words for topic #2:
['liabil', 'impair', 'fuel', 'percent', 'servic', 'tax', 'accru', 'depreci', 'stock', 'equiti']




### Non Negative Matrix Factorization for Topic Modelling

In [175]:
#probability matrix that contains probabilities of all the words in the vocabulary for all the topics
from sklearn.decomposition import NMF

nmf = NMF(n_components=6, random_state=42,max_iter=500)
nmf.fit(doc_term_matrix )

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=500,
    n_components=6, random_state=42, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

In [176]:
for i,topic in enumerate(nmf.components_):
    print(f'Top 10 words for topic #{i}:')
    print([tfidf_vect.get_feature_names()[i] for i in topic.argsort()[-10:]])
    print('\n')

Top 10 words for topic #0:
['dilut', 'defer', 'year', 'cash', 'amend', 'cost', 'incom', 'tabl', 'net', 'total']


Top 10 words for topic #1:
['plan', 'secur', 'total', 'asset', 'debt', 'liabil', 'decemb', 'revenu', 'tax', 'expens']


Top 10 words for topic #2:
['expens_expens', 'earn', 'expens', 'plan', 'revenu', 'debt', 'loss', 'decemb', 'tax', 'liabil']


Top 10 words for topic #3:
['revenu', 'liabil', 'secur', 'loss', 'plan', 'decemb', 'debt', 'expens_expens', 'asset', 'tax']


Top 10 words for topic #4:
['payabl', 'earn', 'plan', 'secur', 'decemb', 'asset', 'liabil', 'expens_expens', 'debt', 'tax']


Top 10 words for topic #5:
['differencesmay', 'diesel', 'diebold', 'dictat', 'diann', 'dhl', 'dform', 'dfe', 'differ', 'zone']


