# Prep for Topic Modeling - Gensim

In [188]:
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence
from gensim.models.word2vec import Word2Vec

import os
import codecs
import itertools as it
import spacy



In [189]:
#nlp = spacy.load('en_core_web_lg')
nlp = spacy.load('en')

In [190]:
def punct_space(token):
    """
    helper function to eliminate tokens
    that are pure punctuation or whitespace
    """
    
    return token.is_punct or token.is_space

def small_word(token):
    return len(token) < 3

def line_article(filename):
    """
    generator function to read in articles from the file
    and un-escape the original line breaks in the text
    """
    
    with codecs.open(filename, encoding='utf_8') as f:
        for article in f:
            yield article.replace('\\n', '\n')
            
def lemmatized_sentence_corpus(filename):
    """
    generator function to use spaCy to parse articles,
    lemmatize the text, and yield sentences
    """
    
    for parsed_article in nlp.pipe(line_article(filename),
                                  batch_size=10000, n_threads=3):
        
        for sent in parsed_article.sents:
            yield u' '.join([token.lemma_ for token in sent
                             if not (punct_space(token) | small_word(token))])

In [191]:
data_directory = 'staging2002'
unigram_sentences_filepath = os.path.join(data_directory,'unigram_sentences_all.txt')

In [239]:
import glob

article_txt_filepath = 'full_articles.txt'

def file_len(fname):
    with open(fname) as f:
        for i, l in enumerate(f):
            pass
    return i + 1

read_files = glob.glob("articles/*.txt")
with open(article_txt_filepath, "wb") as outfile:
    for f in read_files:
        with open(f, "rb") as infile:
            print(f, " lines: ", file_len(f))
            outfile.write(infile.read())
outfile.close()  


articles/articles2001.txt  lines:  89
articles/articles2002.txt  lines:  102


In [247]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if True:

    with codecs.open(unigram_sentences_filepath, 'w', encoding='utf_8') as f:
        for sentence in lemmatized_sentence_corpus(article_txt_filepath):
            f.write(sentence + '\n')
    

CPU times: user 1min 24s, sys: 3.03 s, total: 1min 27s
Wall time: 1min 29s


In [248]:
unigram_sentences = LineSentence(unigram_sentences_filepath)


In [249]:
for unigram_sentence in it.islice(unigram_sentences,230,240):
    print(u' '.join(unigram_sentence))
    #print(unigram_sentence)
    print(u'')

caregiver may need more time practice and use environmental strategy before beneficial outcome be measurable

the 3-month post test may have be too close the intervention for adequately evaluate treatment effect

few caregiver intervention study have show delayed intervention effect such that caregiver report reduce burden and less depression but only over extended period time mittelman

1995

study environmental intervention with other population have also report delayed positive effect year mann ottenbacher fraas tomita and granger 1999

future research should consider evaluate the impact home environmental strategy over long time period

third may that environmental approach for caregiver require high dose and level intensity than that test this study

case presentation and anecdotal comment the interventionist support this point

interventionist report that some caregiver appear need more time than the protocol allow practice and incorporate the recommend environmental strategy

al

In [250]:
bigram_model_filepath = os.path.join(data_directory,'bigram_model_all')

In [251]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute modeling yourself.
if True:

    bigram_model = Phrases(unigram_sentences)

    bigram_model.save(bigram_model_filepath)
    
# load the finished model from disk
bigram_model = Phrases.load(bigram_model_filepath)

CPU times: user 3.31 s, sys: 180 ms, total: 3.49 s
Wall time: 3.53 s


In [252]:
bigram_sentences_filepath = os.path.join(data_directory,'bigram_sentences_all.txt')

In [253]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if True:

    with codecs.open(bigram_sentences_filepath, 'w', encoding='utf_8') as f:
        
        for unigram_sentence in unigram_sentences:
            
            bigram_sentence = u' '.join(bigram_model[unigram_sentence])
            
            f.write(bigram_sentence + '\n')



CPU times: user 5.48 s, sys: 25.1 ms, total: 5.5 s
Wall time: 5.53 s


In [254]:
bigram_sentences = LineSentence(bigram_sentences_filepath)

In [255]:
for bigram_sentence in it.islice(bigram_sentences, 230, 240):
    print(u' '.join(bigram_sentence))
    print(u'')

caregiver may need more time practice and use environmental_strategy before beneficial outcome be measurable

the 3-month post_test may have be too close the intervention for adequately evaluate treatment effect

few caregiver intervention study have show delayed intervention effect such that caregiver report reduce burden and less depression but only over extended_period time mittelman

1995

study environmental intervention with other population have also report delayed positive_effect year mann ottenbacher fraas tomita and granger 1999

future_research should_consider evaluate the impact home environmental_strategy over long time_period

third may that environmental approach for caregiver require high_dose and level intensity than that test this_study

case presentation and anecdotal comment the interventionist support this point

interventionist report that some caregiver appear need more time than the protocol allow practice and incorporate the recommend environmental_strategy

al

In [256]:
trigram_model_filepath = os.path.join(data_directory,'trigram_model_all')

In [257]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute modeling yourself.
if True:

    trigram_model = Phrases(bigram_sentences)

    trigram_model.save(trigram_model_filepath)
    
# load the finished model from disk
trigram_model = Phrases.load(trigram_model_filepath)

CPU times: user 3.44 s, sys: 157 ms, total: 3.59 s
Wall time: 3.64 s


In [258]:
trigram_sentences_filepath = os.path.join(data_directory,'trigram_sentences_all.txt')

In [259]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if True:

    with codecs.open(trigram_sentences_filepath, 'w', encoding='utf_8') as f:
        
        for bigram_sentence in bigram_sentences:
            
            trigram_sentence = u' '.join(trigram_model[bigram_sentence])
            
            f.write(trigram_sentence + '\n')



CPU times: user 5.83 s, sys: 41.2 ms, total: 5.87 s
Wall time: 6.13 s


In [260]:
trigram_sentences = LineSentence(trigram_sentences_filepath)

In [261]:
for trigram_sentence in it.islice(trigram_sentences, 230, 240):
    print(u' '.join(trigram_sentence))
    print(u'')

caregiver may need more time practice and use environmental_strategy before beneficial outcome be measurable

the 3-month post_test may have be too close the intervention for adequately evaluate treatment effect

few caregiver intervention study have show delayed intervention effect such that caregiver report reduce burden and less depression but only over extended_period time mittelman

1995

study environmental intervention with other population have also report delayed positive_effect year mann ottenbacher fraas tomita and granger 1999

future_research should_consider evaluate the impact home environmental_strategy over long time_period

third may that environmental approach for caregiver require high_dose and level intensity than that test this_study

case presentation and anecdotal comment the interventionist support this point

interventionist report that some caregiver appear need more time than the protocol allow practice and incorporate the recommend environmental_strategy

al

In [262]:
trigram_articles_filepath = os.path.join(data_directory,'trigram_transformed_articles_all.txt')

In [263]:
#from spacy.lang.en import STOP_WORDS
#print(STOP_WORDS)

STOP_WORDS = nlp.Defaults.stop_words
STOP_WORDS.add("-PRON-")
#nlp.Defaults.stop_words

In [264]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if True:

    with codecs.open(trigram_articles_filepath, 'w', encoding='utf_8') as f:
        
        for parsed_article in nlp.pipe(line_article(article_txt_filepath),
                                      batch_size=100000, n_threads=3):
            
            # lemmatize the text, removing punctuation and whitespace
            unigram_article = [token.lemma_ for token in parsed_article
                              if not (punct_space(token) | small_word(token))]
            
            # apply the first-order and second-order phrase models
            bigram_article = bigram_model[unigram_article]
            trigram_article = trigram_model[bigram_article]
            
            # remove any remaining stopwords
            trigram_article = [term for term in trigram_article
                              if term not in STOP_WORDS ]
            
            # write the transformed article as a line in the new file
            trigram_article = u' '.join(trigram_article)
            f.write(trigram_article + '\n')



CPU times: user 1min 33s, sys: 2.64 s, total: 1min 35s
Wall time: 1min 37s


In [265]:
print(u'Original:' + u'\n')

for article in it.islice(line_article(article_txt_filepath), 240):
    print(article)

print(u'----' + u'\n')
print(u'Transformed:' + u'\n')

with codecs.open(trigram_articles_filepath, encoding='utf_8') as f:
    for article in it.islice(f, 240):
        print(article)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



abstract_purpose purpose_this_study develop empirically base psychometrically sound instrument assessment grief caregiver person alzheimer_disease design method total 184 statement address personal grief_reaction obtain_from adult_child spouse_caregiver focus_group represent early middle late postdeath stage present second sample adult_child spouse_caregiver rat accord current experience administer beck_depression_inventory anticipatory_grief scale caregiver_strain index caregiver well_being scale basic needs perceived social_support family questionnaire result factor analysis result 50-item scale contain factor personal_sacrifice_burden heartfelt_sadness longing worry felt isolation cronbach_alpha score_range_from .90 .96 indicate high_internal_consistency reliability factor combine total_correlation factor measure provide evidence validity especially_when assess context caregiver_grief model implication result suggest_that caregiver_grief unitary static construct scale appropriate us

In [266]:
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore

import pyLDAvis
import pyLDAvis.gensim
import warnings
import pickle

In [267]:
trigram_dictionary_filepath = os.path.join(data_directory,'trigram_dict_all.dict')


In [268]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to learn the dictionary yourself.
if True:

    trigram_articles = LineSentence(trigram_articles_filepath)
    
    # learn the dictionary by iterating over all of the articles
    trigram_dictionary = Dictionary(trigram_articles)
       
    # filter tokens that are very rare or too common from
    # the dictionary (filter_extremes) and reassign integer ids (compactify)
    #trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)
    trigram_dictionary.filter_extremes(no_below=10, no_above=0.9)
    trigram_dictionary.compactify()
    
    # THIS WORKS
    #trigram_dictionary.filter_extremes(keep_n=25)
    #trigram_dictionary.compactify()
    #trigram_dictionary.filter_extremes(no_above=0.9)

    trigram_dictionary.save(trigram_dictionary_filepath)
    
# load the finished dictionary from disk
trigram_dictionary = Dictionary.load(trigram_dictionary_filepath)
print(trigram_dictionary) 
    

Dictionary(3446 unique tokens: ['.001', '.03', '.05', '.08', '.73']...)
CPU times: user 704 ms, sys: 75.5 ms, total: 779 ms
Wall time: 812 ms


In [269]:

trigram_bow_filepath = os.path.join(data_directory,'trigram_bow_corpus_all.mm')

In [270]:
def trigram_bow_generator(filepath):
    """
    generator function to read articles from a file
    and yield a bag-of-words representation
    """
    
    for article in LineSentence(filepath):
        yield trigram_dictionary.doc2bow(article)

In [271]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to build the bag-of-words corpus yourself.
if True:

    # generate bag-of-words representations for
    # all articles and save them as a matrix
    MmCorpus.serialize(trigram_bow_filepath,
                       trigram_bow_generator(trigram_articles_filepath))
    
# load the finished bag-of-words corpus from disk
#trigram_bow_corpus = MmCorpus(trigram_bow_filepath)


CPU times: user 557 ms, sys: 14 ms, total: 571 ms
Wall time: 576 ms


In [272]:
trigram_articles.max_sentence_length

10000