In [1]:
# This script generates the topic model for the corpus.
# It finds the topic collections of documents and the documents' probability distributions over topics.
# Run it on the longleaf server

In [9]:
import logging
# Configure root logging at INFO level with "time : level : message" records,
# so INFO-level progress messages (such as the training log captured in this
# notebook's output) are shown.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from gensim import corpora,models
import gensim
import nltk
import re
import json
import os

In [3]:
def load_corpus(folder_path):
    '''
    Load every ".json" news file found directly inside a folder.

    Each file is parsed as JSON and its "id" and "content" entries are read.

    Args:
        folder_path: path of the folder holding the JSON files; a trailing
            path separator is optional.
    Returns:
        (news_id_list, news_content_list): two parallel lists, where
        news_content_list[i] is the "content" of the document whose "id"
        is news_id_list[i]. Documents are ordered by file name.
    Raises:
        KeyError: if a JSON file has no "id" or no "content" entry.
    '''

    news_id_list = []
    news_content_list = []

    # os.listdir returns names in arbitrary order; sorting makes the document
    # order (and everything derived from it) the same on every run.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith(".json"):
            continue

        # os.path.join works whether or not folder_path ends with a separator,
        # unlike plain string concatenation.
        file_path = os.path.join(folder_path, file_name)
        with open(file_path, 'r', encoding="utf-8") as f:
            news = json.load(f)

        news_content_list.append(news["content"])
        news_id_list.append(news["id"])

    return news_id_list, news_content_list

In [4]:
def clean_text(doc_set):
    '''
    Lower-case, strip punctuation from, tokenize and stop-word-filter documents.

    Args: a list of documents (strings)
    Returns: a list of cleaned text, i.e. one list of tokens per input document
    '''

    # Nothing to clean: return early so an empty corpus does not need the
    # NLTK stop-word data at all.
    if not doc_set:
        return []

    # Load the stop words once (not once per document) and keep them in a set
    # so each membership test is a hash lookup instead of a list scan.
    stop_words = set(stopwords.words("english"))

    cleaned_texts = []

    for doc in doc_set:

        # Lower-case, then drop every character that is neither a word
        # character nor whitespace.
        # NOTE(review): this also removes apostrophes, so "don't" becomes
        # "dont" before the stop-word filter runs; such tokens presumably no
        # longer match the stop-word list -- confirm this is intended.
        raw_text = re.sub(r"[^\w\s]", "", doc.lower())
        tokens = nltk.word_tokenize(raw_text)

        # keep only the tokens that are not stop words
        cleaned_texts.append([token for token in tokens if token not in stop_words])

    return cleaned_texts

In [21]:
def training_lda(cleaned_texts, n_topics, passes=10, random_state=None):
    '''
    Build a dictionary and bag-of-words corpus from tokenized documents and
    train an LDA model on them.

    Args:
        cleaned_texts: a list of cleaned text (one list of tokens per document)
        n_topics: number of topics the model is trained with
        passes: number of training passes over the corpus (default 10, the
            value that used to be hard-coded)
        random_state: seed forwarded to gensim's LdaModel. The default None
            leaves training unseeded, as before; pass an int to make the
            trained topics reproducible between runs.
    Returns:
        (ldamodel, corpus, dictionary): the trained model, the bag-of-words
        corpus it was trained on, and the token dictionary.
    '''

    dictionary = corpora.Dictionary(cleaned_texts)
    corpus = [dictionary.doc2bow(text) for text in cleaned_texts]

    # need to tune parameter here
    ldamodel = gensim.models.ldamodel.LdaModel(
        corpus,
        num_topics=n_topics,
        id2word=dictionary,
        passes=passes,
        alpha="auto",   # document-topic prior chosen by gensim
        eta="auto",     # topic-word prior chosen by gensim
        eval_every=2,
        random_state=random_state,
    )

    return ldamodel, corpus, dictionary

In [22]:
def save_results(ldamodel, corpus, news_id_list, n_topics,
                 topic_path="/Users/jiamingqu/Desktop/test.json",
                 doc_dir="/Users/jiamingqu/Desktop/test2/"):

    '''
    Export LDA results to files (topic collection and topic distributions of docs)

    Args:
        ldamodel: lda model; ldamodel[bow] is expected to give the document's
            (topic id, probability) pairs
        corpus: list of bag-of-words documents, parallel to news_id_list
        news_id_list: the list of news IDs
        n_topics: number of topics; every topic id in range(n_topics) gets an
            entry in the topic collection, even when no document falls in it
        topic_path: JSON file receiving {topic id: [news id, ...]}
        doc_dir: folder receiving one "<news id>.txt" file per document, with
            one "(topic id, probability)" pair per line; created if missing
    Returns: none
    Raises:
        IndexError: if news_id_list is shorter than corpus

    '''

    # 1. get topic collections of documents
    # dictionary of <topicid, [docid, docid, ..]>
    topic_dictionary = {topic_id: [] for topic_id in range(n_topics)}

    # Infer each document's topic distribution exactly once and reuse it for
    # both exports, so the two outputs always agree with each other.
    doc_prob_distribution = []  # [(news id, [(topic id, prob), ...]), ...]
    for i, bow in enumerate(corpus):
        news_id = news_id_list[i]
        doc_topics = list(ldamodel[bow])
        doc_prob_distribution.append((news_id, doc_topics))

        if not doc_topics:
            # no topic reported for this document: nothing to assign
            continue

        # Pick the pair with the highest probability. Taking the first pair
        # would only return whichever topic happens to be listed first.
        most_probable_topic = max(doc_topics, key=lambda pair: pair[1])[0]
        topic_dictionary.setdefault(most_probable_topic, []).append(news_id)

    # save to a json file
    with open(topic_path, 'w') as f:
        json.dump(topic_dictionary, f)

    # 2. get document distribution probability of topics
    # save to one text file per document
    os.makedirs(doc_dir, exist_ok=True)
    for news_id, doc_topics in doc_prob_distribution:
        # str() so that non-string ids (e.g. integers) also work
        file_name = os.path.join(doc_dir, str(news_id) + ".txt")
        with open(file_name, 'w') as f:
            for item in doc_topics:
                f.write(str(item))
                f.write("\n")

In [23]:
def main(corpus_dir="/Users/jiamingqu/Desktop/test/", n_topics=5):
    '''
    Run the whole pipeline: load the corpus, clean it, train the LDA model
    and export the results.

    Args:
        corpus_dir: folder holding the news JSON files (defaults to the
            location that used to be hard-coded)
        n_topics: number of topics to train the model with (default 5)
    Returns: none
    '''
    news_id_list, news_content_list = load_corpus(corpus_dir)
    cleaned_text = clean_text(news_content_list)
    # the token dictionary returned by training_lda is not needed here
    ldamodel, corpus, _ = training_lda(cleaned_text, n_topics)
    save_results(ldamodel, corpus, news_id_list, n_topics)

In [24]:
# Run the whole pipeline only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

2020-03-24 11:49:05,209 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-03-24 11:49:05,221 : INFO : built Dictionary(4172 unique tokens: ['alaska', 'appreciate', 'asking', 'based', 'book']...) from 26 documents (total 10472 corpus positions)
2020-03-24 11:49:05,283 : INFO : using autotuned alpha, starting with [0.2, 0.2, 0.2, 0.2, 0.2]
2020-03-24 11:49:05,285 : INFO : using serial LDA version on this node
2020-03-24 11:49:05,288 : INFO : running online (multi-pass) LDA training, 5 topics, 10 passes over the supplied corpus of 26 documents, updating model once every 26 documents, evaluating perplexity every 26 documents, iterating 50x with a convergence threshold of 0.001000
2020-03-24 11:49:05,348 : INFO : -10.183 per-word bound, 1162.5 perplexity estimate based on a held-out corpus of 26 documents with 10472 words
2020-03-24 11:49:05,349 : INFO : PROGRESS: pass 0, at document #26/26
2020-03-24 11:49:05,379 : INFO : optimized alpha [0.35917443, 0.34348047, 0.3353486

2020-03-24 11:49:05,727 : INFO : optimized alpha [0.049883414, 0.035024986, 0.032138966, 0.04721444, 0.038752023]
2020-03-24 11:49:05,731 : INFO : topic #0 (0.050): 0.007*"said" + 0.006*"women" + 0.006*"clinton" + 0.005*"va" + 0.005*"black" + 0.005*"batman" + 0.005*"also" + 0.004*"would" + 0.004*"sanders" + 0.004*"new"
2020-03-24 11:49:05,731 : INFO : topic #1 (0.035): 0.015*"trump" + 0.013*"june" + 0.012*"block" + 0.009*"stolen" + 0.008*"pm" + 0.005*"media" + 0.005*"one" + 0.005*"man" + 0.005*"8" + 0.004*"primary"
2020-03-24 11:49:05,732 : INFO : topic #2 (0.032): 0.014*"would" + 0.011*"budget" + 0.010*"said" + 0.009*"trump" + 0.006*"new" + 0.006*"programs" + 0.004*"also" + 0.004*"proposed" + 0.004*"funding" + 0.004*"police"
2020-03-24 11:49:05,733 : INFO : topic #3 (0.047): 0.009*"government" + 0.007*"speech" + 0.005*"money" + 0.005*"citizenship" + 0.004*"people" + 0.004*"political" + 0.004*"support" + 0.004*"ethnic" + 0.004*"pay" + 0.003*"said"
2020-03-24 11:49:05,733 : INFO : topic