# Prep for Topic Modeling - Gensim

In [1]:
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence
from gensim.models.word2vec import Word2Vec

import os
import codecs
import itertools as it
import spacy





In [2]:
# Shared spaCy pipeline used by every parsing cell in this notebook.
# Alternative model kept for reference:
#nlp = spacy.load('en_core_web_lg')
nlp = spacy.load('en')

In [3]:
def punct_space(token):
    """
    tell whether a token carries no lexical content

    token: any object exposing ``is_punct`` and ``is_space`` attributes
        (spaCy tokens in this notebook)

    returns: a truthy value when the token is punctuation or whitespace,
        a falsy value otherwise
    """

    is_noise = token.is_punct or token.is_space
    return is_noise

def line_article(filename):
    """
    generator function to read in articles from the file
    and un-escape the original line breaks in the text

    filename: path to a UTF-8 text file holding one article per line,
        where line breaks inside an article are stored as the two
        characters backslash + n

    yields: each line of the file as a string (line terminator kept),
        with every escaped line break turned back into a real newline
    """
    import io

    # io.open is used instead of codecs.open on purpose: iterating a codecs
    # reader splits on every Unicode line boundary (form feed, U+0085,
    # U+2028, ...), which would cut a single article into several pieces.
    # newline='' leaves the terminators untranslated, so ordinary LF / CRLF
    # endings come through exactly as they did before.
    with io.open(filename, encoding='utf_8', newline='') as f:
        for article in f:
            yield article.replace('\\n', '\n')
            
def lemmatized_sentence_corpus(filename):
    """
    generator function that runs every article of the file through
    the spaCy pipeline and yields one string per detected sentence

    filename: path handed to line_article()

    yields: the lemmas of a sentence joined by single spaces, with
        punctuation and whitespace tokens left out
    """

    parsed_articles = nlp.pipe(line_article(filename),
                               batch_size=10000, n_threads=3)

    for parsed_article in parsed_articles:
        for sent in parsed_article.sents:
            lemmas = [token.lemma_ for token in sent
                      if not punct_space(token)]
            yield u' '.join(lemmas)

In [4]:
# Every intermediate artifact of this notebook (sentence files, phrase
# models, dictionary, bag-of-words corpus) is written beneath this directory.
data_directory = 'staging'

# Create the directory up front: the cells below open files inside it for
# writing, which raises FileNotFoundError when the directory is missing
# (e.g. on a fresh checkout). exist_ok makes re-running this cell harmless.
os.makedirs(data_directory, exist_ok=True)

# One lemmatized sentence per line.
unigram_sentences_filepath = os.path.join(data_directory, 'unigram_sentences_all.txt')

In [5]:
%%time
# Raw corpus: a text file that line_article() reads one line at a time.
article_txt_filepath = 'articles.txt'

# Parsing every article with spaCy is slow; set the guard to False to
# reuse the sentence file written by a previous run.
if True:

    with codecs.open(unigram_sentences_filepath, 'w', encoding='utf_8') as f:
        for sentence in lemmatized_sentence_corpus(article_txt_filepath):
            # one lemmatized sentence per output line
            f.write(sentence)
            f.write('\n')
    

Wall time: 1min 44s


In [6]:
# Iterable over the unigram sentence file; each item it yields is a list of
# tokens (they are re-joined with spaces when previewed).
unigram_sentences = LineSentence(unigram_sentences_filepath)


In [7]:
# Preview the sentences at positions 230-239, each followed by a blank line.
for unigram_sentence in it.islice(unigram_sentences, 230, 240):
    print(u' '.join(unigram_sentence) + u'\n')

some research have suggest that caregiver tend to report great functional dependence in person with dementia skurla rogers and sunderland 1988

nevertheless one recent study have show that score derive from caregiver self report of function of a person with dementia use the fim significantly correlate with fim score derive from direct observation of performance by a train professional cotter burgio stephens roth gitlin in press

thus caregiver rating of function in -PRON- study may reflect objective iadl performance

there be no statistically significant difference however in the other eight outcome measure include adl dependence and behavior and caregiver self efficacy and upset score between the experimental and control group

the analysis show a trend toward improvement in all area for the experimental group but these minimal effect be not statistically significant for several possible reason

first one reason -PRON- do not see main effect be that -PRON- do find interaction effect s

In [8]:
# Where the first-order (bigram) phrase model is persisted.
bigram_model_filepath = os.path.join(data_directory, 'bigram_model_all')

In [9]:
%%time

# Fitting the phrase model is the slow part of this cell; set the guard to
# False to skip it and reuse the model already saved on disk.
if True:

    # fit on the unigram sentences and persist the result in one step
    Phrases(unigram_sentences).save(bigram_model_filepath)

# always continue with the copy that lives on disk
bigram_model = Phrases.load(bigram_model_filepath)

Wall time: 989 ms


In [10]:
# Sentence file after the bigram phrase model has been applied.
bigram_sentences_filepath = os.path.join(data_directory, 'bigram_sentences_all.txt')

In [11]:
%%time

# Rewriting the whole corpus takes a moment; set the guard to False to
# keep the bigram sentence file from an earlier run.
if True:

    with codecs.open(bigram_sentences_filepath, 'w', encoding='utf_8') as f:

        for unigram_sentence in unigram_sentences:

            # indexing the phrase model with a token list returns the
            # sentence with detected phrases merged into single tokens
            bigram_sentence = u' '.join(bigram_model[unigram_sentence])

            f.write(bigram_sentence)
            f.write('\n')



Wall time: 1.74 s


In [12]:
# Iterable over the bigram sentence file; each item it yields is a list of
# tokens (they are re-joined with spaces when previewed).
bigram_sentences = LineSentence(bigram_sentences_filepath)

In [13]:
# Same preview window as for the unigram sentences (positions 230-239),
# now showing the phrase-merged tokens; a blank line follows each sentence.
for bigram_sentence in it.islice(bigram_sentences, 230, 240):
    print(u' '.join(bigram_sentence) + u'\n')

some research have suggest_that caregiver tend_to report great functional dependence in person with_dementia skurla rogers and sunderland 1988

nevertheless one recent study have show that score derive_from caregiver self_report of function of a person with_dementia use the fim significantly_correlate with fim score derive_from direct_observation of performance by a train professional cotter burgio stephens roth gitlin in press

thus caregiver rating of function in -PRON- study may reflect objective iadl performance

there be no statistically_significant difference however in the other eight outcome_measure include adl_dependence and behavior and caregiver self_efficacy and upset score between the experimental and control_group

the analysis show a trend_toward improvement in all area for the experimental group but these minimal effect be not statistically_significant for several possible reason

first one reason -PRON- do_not see main_effect be that -PRON- do find interaction_effect s

In [14]:
# Where the second-order (trigram) phrase model is persisted.
trigram_model_filepath = os.path.join(data_directory, 'trigram_model_all')

In [15]:
%%time

# Fitting the phrase model is the slow part of this cell; set the guard to
# False to skip it and reuse the model already saved on disk.
if True:

    # fit on the bigram sentences, so phrases can grow by one more token,
    # and persist the result in one step
    Phrases(bigram_sentences).save(trigram_model_filepath)

# always continue with the copy that lives on disk
trigram_model = Phrases.load(trigram_model_filepath)

Wall time: 967 ms


In [16]:
# Sentence file after the trigram phrase model has been applied.
trigram_sentences_filepath = os.path.join(data_directory, 'trigram_sentences_all.txt')

In [17]:
%%time

# Rewriting the whole corpus takes a moment; set the guard to False to
# keep the trigram sentence file from an earlier run.
if True:

    with codecs.open(trigram_sentences_filepath, 'w', encoding='utf_8') as f:

        for bigram_sentence in bigram_sentences:

            # apply the second-order phrase model to the bigram tokens
            trigram_sentence = u' '.join(trigram_model[bigram_sentence])

            f.write(trigram_sentence)
            f.write('\n')



Wall time: 1.82 s


In [18]:
# Iterable over the trigram sentence file; each item it yields is a list of
# tokens (they are re-joined with spaces when previewed).
trigram_sentences = LineSentence(trigram_sentences_filepath)

In [19]:
# Same preview window once more (positions 230-239), after the trigram
# model; a blank line follows each sentence.
for trigram_sentence in it.islice(trigram_sentences, 230, 240):
    print(u' '.join(trigram_sentence) + u'\n')

some research have suggest_that caregiver tend_to report great functional dependence in person_with_dementia skurla rogers and sunderland 1988

nevertheless one recent_study have show_that score derive_from caregiver self_report of function of a person_with_dementia use the fim significantly_correlate_with fim score derive_from direct_observation of performance by a train professional cotter burgio stephens roth gitlin in press

thus caregiver rating of function in -PRON- study may reflect objective iadl performance

there be no statistically_significant difference however in the other eight outcome_measure include adl_dependence and behavior and caregiver self_efficacy and upset score between the experimental and control_group

the analysis show a trend_toward improvement in all area for the experimental_group but these minimal effect be not_statistically_significant for several possible reason

first one reason -PRON- do_not see main_effect be that -PRON- do find interaction_effect s

In [20]:
# Fully transformed corpus: one article per line.
trigram_articles_filepath = os.path.join(data_directory, 'trigram_transformed_articles_all.txt')

In [21]:
# Stop-word collection taken from the defaults of the loaded pipeline; it is
# used further down to drop stop words from the transformed articles.
# Alternative source kept for reference:
#from spacy.lang.en import STOP_WORDS
#print(STOP_WORDS)

STOP_WORDS = nlp.Defaults.stop_words

In [22]:
%%time

# Re-parsing the full corpus with spaCy is slow; set the guard to False to
# keep the transformed-articles file from an earlier run.
if True:

    with codecs.open(trigram_articles_filepath, 'w', encoding='utf_8') as f:

        for parsed_article in nlp.pipe(line_article(article_txt_filepath),
                                       batch_size=10000, n_threads=3):

            # lemmas of every token that is neither punctuation nor whitespace
            unigram_article = [token.lemma_ for token in parsed_article
                               if not punct_space(token)]

            # first-order phrase model, then the second-order one on its output
            bigram_article = bigram_model[unigram_article]
            trigram_article = trigram_model[bigram_article]

            # drop the terms found in STOP_WORDS and collapse the article
            # into a single space-separated line
            trigram_article = u' '.join(term for term in trigram_article
                                        if term not in STOP_WORDS)

            f.write(trigram_article)
            f.write('\n')



Wall time: 1min 35s


In [23]:
# Side-by-side sanity check: the first raw article, then the first line of
# the transformed-articles file.
print(u'Original:\n')

for article in it.islice(line_article(article_txt_filepath), 1):
    print(article)

print(u'----\n')
print(u'Transformed:\n')

with codecs.open(trigram_articles_filepath, encoding='utf_8') as f:
    for article in it.islice(f, 1):
        print(article)

Original:

     Abstract Purpose of Study: The authors determined short-term effects of a home environmental intervention on self-efficacy and upset in caregivers and daily function of dementia patients. They also determined if treatment effect varied by caregiver gender, race, and relationship to patient. Design and Methods: Families (N = 171) of dementia patients were randomized to intervention or usual care control group. The intervention involved 5 90-min home visits by occupational therapists who provided education and physical and social environmental modifi-cations. Results: Compared with controls, intervention caregivers reported fewer declines in patients' instrumental activities of daily living (p = .030) and less decline in self-care and fewer behavior problems in patients at 3 months post-test. Also, intervention spouses reported reduced upset (p = .049), women reported enhanced self-efficacy in managing behaviors (p = .038), and women (p = .049) and minorities (p = .037) r


----

Transformed:

abstract_purpose study author determine short term effect home environmental_intervention self_efficacy upset caregiver daily function dementia_patient -PRON- determine_if treatment_effect vary_by caregiver gender_race relationship patient design methods family n_= 171 dementia_patient randomize intervention usual care control_group intervention involve 5 90-min home_visit occupational_therapist provide education physical social environmental modifi cation result compare_with control intervention caregiver report decline patient instrumental_activity daily_living p_= .030 decline self_care behavior_problem patient 3_month post_test intervention spouse report reduce_upset p_= .049 woman report enhanced self_efficacy manage behavior p_= .038 woman p_= .049 minority p_= .037 report enhanced self_efficacy manage functional_dependency implication environmental program appear modest effect_on dementia_patient iadl_dependence certain subgroup caregiver program improve sel




In [24]:
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore

import pyLDAvis
import pyLDAvis.gensim
import warnings
import pickle

In [25]:
# Where the dictionary built from the transformed articles is persisted.
trigram_dictionary_filepath = os.path.join(data_directory, 'trigram_dict_all.dict')


In [26]:
%%time

# Building the vocabulary needs a full pass over the transformed articles;
# set the guard to False to reuse the dictionary already saved on disk.
if True:

    trigram_articles = LineSentence(trigram_articles_filepath)

    # learn the vocabulary from every article in the file
    trigram_dictionary = Dictionary(trigram_articles)

    # Prune the vocabulary with no_above=0.9; every other filter_extremes
    # argument is left at its default.
    # Settings tried earlier and left disabled:
    #   filter_extremes(no_below=10, no_above=0.4) followed by compactify()
    #   filter_extremes(keep_n=25) followed by compactify()
    trigram_dictionary.filter_extremes(no_above=0.9)

    trigram_dictionary.save(trigram_dictionary_filepath)

# always continue with the copy that lives on disk, and show its summary
trigram_dictionary = Dictionary.load(trigram_dictionary_filepath)
print(trigram_dictionary)
    

Dictionary(3953 unique tokens: ['$', "'s_own", '.05', '.73', '.83']...)
Wall time: 201 ms


In [27]:

# Destination of the serialized bag-of-words corpus.
trigram_bow_filepath = os.path.join(data_directory, 'trigram_bow_corpus_all.mm')

In [28]:
def trigram_bow_generator(filepath):
    """
    generator function that turns each article of a file into
    its bag-of-words form

    filepath: path of a file readable by LineSentence

    yields: the result of trigram_dictionary.doc2bow() for each article,
        in file order
    """

    articles = LineSentence(filepath)

    for tokens in articles:
        yield trigram_dictionary.doc2bow(tokens)

In [29]:
%%time

# Serializing walks the whole transformed corpus; set the guard to False
# to keep the corpus file from an earlier run.
if True:

    # stream the bag-of-words form of every article straight to disk
    MmCorpus.serialize(
        trigram_bow_filepath,
        trigram_bow_generator(trigram_articles_filepath),
    )

# Loading the serialized corpus back is currently disabled:
#trigram_bow_corpus = MmCorpus(trigram_bow_filepath)


Wall time: 190 ms
