# Prep for Topic Modeling - Gensim

In [None]:
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence
from gensim.models.word2vec import Word2Vec

import os
import codecs
import itertools as it
import spacy



In [None]:
# Load the spaCy English pipeline through the 'en' shortcut name.
# The 'en_core_web_lg' model is kept below, commented out, as an alternative.
#nlp = spacy.load('en_core_web_lg')
nlp = spacy.load('en')

In [None]:
def punct_space(token):
    """
    predicate used to drop tokens: reports whether the
    token is flagged as punctuation or as whitespace
    """

    punct = token.is_punct
    if punct:
        return punct
    return token.is_space

def small_word(token):
    """
    predicate used to drop tokens: reports whether
    len(token) is below three
    """

    min_length = 3
    return not len(token) >= min_length

def line_article(filename):
    """
    generator function that walks the UTF-8 file line by line
    and yields each line with every escaped line break
    (backslash followed by n) turned back into a real newline
    """

    with codecs.open(filename, encoding='utf_8') as source:
        for raw_line in source:
            restored = raw_line.replace('\\n', '\n')
            yield restored
            
def lemmatized_sentence_corpus(filename):
    """
    generator function that runs every article of the file
    through the spaCy pipeline and yields each sentence as one
    space-joined string of lemmas, leaving out the tokens
    rejected by punct_space or small_word
    """

    articles = line_article(filename)

    for doc in nlp.pipe(articles, batch_size=10000, n_threads=3):
        for sentence in doc.sents:
            lemmas = [tok.lemma_ for tok in sentence
                      if not punct_space(tok) and not small_word(tok)]
            yield u' '.join(lemmas)

In [None]:
# Directory used for the sentence, model, dictionary and corpus files built below.
# NOTE(review): nothing visible here creates 'staging2002' - confirm it
# exists before running the cells that write into it.
data_directory = 'staging2002'
# Destination of the lemmatized sentences, written one sentence per line.
unigram_sentences_filepath = os.path.join(data_directory,'unigram_sentences_all.txt')

In [None]:
import glob

# Combined file that receives the contents of every file matching articles/*.txt.
article_txt_filepath = 'full_articles.txt'

def file_len(fname):
    """
    count the lines of the text file at fname

    returns the number of lines as an int; an empty file gives 0
    (the loop-index version raised UnboundLocalError in that case).
    a last line without a trailing newline still counts as a line.
    """
    with open(fname) as f:
        # iterate lazily so the file is never held in memory as a whole
        return sum(1 for _ in f)

# Concatenate every file matching articles/*.txt into one combined file.
# Bytes are copied unchanged; a newline is appended after any non-empty
# file that does not end with one, so its last line cannot fuse with the
# first line of the next file (the later steps read one article per line).
read_files = glob.glob("articles/*.txt")
with open(article_txt_filepath, "wb") as outfile:
    for f in read_files:
        with open(f, "rb") as infile:
            print(f, " lines: ", file_len(f))
            data = infile.read()
        outfile.write(data)
        if data and not data.endswith(b"\n"):
            outfile.write(b"\n")
# no explicit close is needed: the with-statement has already closed outfile


In [None]:
%%time

# Lemmatize the combined article file and write the result one sentence
# per line.  The condition is hard-coded to True, so this runs on every
# execution; set it to False to skip it and reuse an existing file.
if True:

    with codecs.open(unigram_sentences_filepath, 'w', encoding='utf_8') as f:
        for sentence in lemmatized_sentence_corpus(article_txt_filepath):
            f.write(sentence + '\n')
    

In [None]:
# Wrap the unigram sentence file in gensim's LineSentence so it can be
# iterated one sentence (one line of the file) at a time.
unigram_sentences = LineSentence(unigram_sentences_filepath)


In [None]:
# Preview: print the sentences at positions 230 up to (not including) 240
# of the unigram file, each followed by a blank line.
for unigram_sentence in it.islice(unigram_sentences, 230, 240):
    print(u' '.join(unigram_sentence) + u'\n')

In [None]:
# Where the bigram Phrases model is saved and later loaded from.
bigram_model_filepath = os.path.join(data_directory,'bigram_model_all')

In [None]:
%%time

# Train a gensim Phrases model on the unigram sentences and save it.
# The condition is hard-coded to True, so training runs on every execution;
# set it to False to skip training and only load a previously saved model.
if True:

    bigram_model = Phrases(unigram_sentences)

    bigram_model.save(bigram_model_filepath)
    
# load the finished model from disk (this runs whether or not the
# branch above was taken, so the saved file must exist)
bigram_model = Phrases.load(bigram_model_filepath)

In [None]:
# Destination of the sentences after the bigram model has been applied.
bigram_sentences_filepath = os.path.join(data_directory,'bigram_sentences_all.txt')

In [None]:
%%time

# Apply the bigram model to every unigram sentence and write the result
# one sentence per line.  The condition is hard-coded to True, so this runs
# on every execution; set it to False to reuse an existing file.
if True:

    with codecs.open(bigram_sentences_filepath, 'w', encoding='utf_8') as f:
        
        for unigram_sentence in unigram_sentences:
            
            # indexing the Phrases model with a token list applies the model
            # to that sentence; the resulting tokens are joined with spaces
            bigram_sentence = u' '.join(bigram_model[unigram_sentence])
            
            f.write(bigram_sentence + '\n')

In [None]:
# Wrap the bigram sentence file in gensim's LineSentence so it can be
# iterated one sentence (one line of the file) at a time.
bigram_sentences = LineSentence(bigram_sentences_filepath)

In [None]:
# Preview: print the sentences at positions 230 up to (not including) 240
# of the bigram file, each followed by a blank line.
for bigram_sentence in it.islice(bigram_sentences, 230, 240):
    print(u' '.join(bigram_sentence) + u'\n')

In [None]:
# Where the trigram Phrases model is saved and later loaded from.
trigram_model_filepath = os.path.join(data_directory,'trigram_model_all')

In [None]:
%%time

# Train a second gensim Phrases model, this time on the bigram sentences,
# and save it.  The condition is hard-coded to True, so training runs on
# every execution; set it to False to only load a previously saved model.
if True:

    trigram_model = Phrases(bigram_sentences)

    trigram_model.save(trigram_model_filepath)
    
# load the finished model from disk (this runs whether or not the
# branch above was taken, so the saved file must exist)
trigram_model = Phrases.load(trigram_model_filepath)

In [None]:
# Destination of the sentences after the trigram model has been applied.
trigram_sentences_filepath = os.path.join(data_directory,'trigram_sentences_all.txt')

In [None]:
%%time

# Apply the trigram model to every bigram sentence and write the result
# one sentence per line.  The condition is hard-coded to True, so this runs
# on every execution; set it to False to reuse an existing file.
if True:

    with codecs.open(trigram_sentences_filepath, 'w', encoding='utf_8') as f:
        
        for bigram_sentence in bigram_sentences:
            
            # indexing the Phrases model with a token list applies the model
            # to that sentence; the resulting tokens are joined with spaces
            trigram_sentence = u' '.join(trigram_model[bigram_sentence])
            
            f.write(trigram_sentence + '\n')

In [None]:
# Wrap the trigram sentence file in gensim's LineSentence so it can be
# iterated one sentence (one line of the file) at a time.
trigram_sentences = LineSentence(trigram_sentences_filepath)

In [None]:
# Preview: print the sentences at positions 230 up to (not including) 240
# of the trigram file, each followed by a blank line.
for trigram_sentence in it.islice(trigram_sentences, 230, 240):
    print(u' '.join(trigram_sentence) + u'\n')

In [None]:
# Destination of the fully transformed articles, written one article per line.
trigram_articles_filepath = os.path.join(data_directory,'trigram_transformed_articles_all.txt')

In [None]:
#from spacy.lang.en import STOP_WORDS
#print(STOP_WORDS)

# STOP_WORDS is bound to the very object stored in nlp.Defaults.stop_words
# (no copy is made), so the add() below also changes that shared collection.
# "-PRON-" is added so that this term is removed together with the other
# stop words when the articles are transformed
# (presumably spaCy's placeholder lemma for pronouns - confirm).
STOP_WORDS = nlp.Defaults.stop_words
STOP_WORDS.add("-PRON-")
#nlp.Defaults.stop_words

In [None]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if True:

    with codecs.open(trigram_articles_filepath, 'w', encoding='utf_8') as f:
        
        for parsed_article in nlp.pipe(line_article(article_txt_filepath),
                                      batch_size=100000, n_threads=3):
            
            # lemmatize the text, removing punctuation and whitespace
            unigram_article = [token.lemma_ for token in parsed_article
                              if not (punct_space(token) | small_word(token))]
            
            # apply the first-order and second-order phrase models
            bigram_article = bigram_model[unigram_article]
            trigram_article = trigram_model[bigram_article]
            
            # remove any remaining stopwords
            trigram_article = [term for term in trigram_article
                              if term not in STOP_WORDS ]
            
            # write the transformed article as a line in the new file
            trigram_article = u' '.join(trigram_article)
            f.write(trigram_article + '\n')

In [None]:
# Side-by-side check: print the first 240 entries yielded by line_article
# for the combined article file, then the first 240 lines of the
# transformed article file.
print(u'Original:' + u'\n')

for article in it.islice(line_article(article_txt_filepath), 240):
    print(article)

print(u'----' + u'\n')
print(u'Transformed:' + u'\n')

# lines are printed as read, so each keeps its own trailing newline
with codecs.open(trigram_articles_filepath, encoding='utf_8') as f:
    for article in it.islice(f, 240):
        print(article)

In [None]:
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore

import pyLDAvis
import pyLDAvis.gensim
import warnings
import pickle

In [None]:
# Where the gensim Dictionary is saved and later loaded from.
trigram_dictionary_filepath = os.path.join(data_directory,'trigram_dict_all.dict')


In [None]:
%%time

# Build, filter and save the gensim Dictionary of the transformed articles.
# The condition is hard-coded to True, so the dictionary is rebuilt on every
# execution; set it to False to only load a previously saved dictionary.
# NOTE: trigram_articles is assigned only inside this branch.
if True:

    trigram_articles = LineSentence(trigram_articles_filepath)
    
    # learn the dictionary by iterating over all of the articles
    trigram_dictionary = Dictionary(trigram_articles)
       
    # filter tokens that are very rare or too common from
    # the dictionary (filter_extremes) and reassign integer ids (compactify)
    # the active call uses no_below=10, no_above=0.9; the commented-out
    # variant directly below used no_above=0.4 instead
    #trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)
    trigram_dictionary.filter_extremes(no_below=10, no_above=0.9)
    trigram_dictionary.compactify()
    
    # THIS WORKS
    #trigram_dictionary.filter_extremes(keep_n=25)
    #trigram_dictionary.compactify()
    #trigram_dictionary.filter_extremes(no_above=0.9)

    trigram_dictionary.save(trigram_dictionary_filepath)
    
# load the finished dictionary from disk (this runs whether or not the
# branch above was taken, so the saved file must exist)
trigram_dictionary = Dictionary.load(trigram_dictionary_filepath)
print(trigram_dictionary) 
    

In [None]:

# Destination of the serialized bag-of-words corpus.
trigram_bow_filepath = os.path.join(data_directory,'trigram_bow_corpus_all.mm')

In [None]:
def trigram_bow_generator(filepath):
    """
    generator function that streams the articles stored in
    filepath and yields, for each one, the bag-of-words
    produced by trigram_dictionary.doc2bow
    """

    articles = LineSentence(filepath)
    for tokens in articles:
        bag_of_words = trigram_dictionary.doc2bow(tokens)
        yield bag_of_words

In [None]:
%%time

# Serialize the bag-of-words form of every transformed article to disk.
# The condition is hard-coded to True, so the corpus is rewritten on every
# execution; set it to False to keep an existing corpus file.
if True:

    # generate bag-of-words representations for
    # all articles and save them as a matrix
    MmCorpus.serialize(trigram_bow_filepath,
                       trigram_bow_generator(trigram_articles_filepath))
    
# load the finished bag-of-words corpus from disk
# (currently disabled: the load below is commented out)
#trigram_bow_corpus = MmCorpus(trigram_bow_filepath)


In [None]:
# Show the max_sentence_length attribute of the LineSentence reader.
# NOTE: trigram_articles is only assigned inside the `if True:` branch of the
# dictionary-building cell, so that branch must have run first.
trigram_articles.max_sentence_length