In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    One line is printed per row of ``model.components_`` in the form
    ``Topic #<index>: term, term, ...`` with terms ordered from the largest
    weight to the smallest, followed by a single blank line.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight vectors that support ``argsort()``.
        feature_names: Sequence mapping a column index to its term.
        n_top_words: Maximum number of terms to show for each topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort() is ascending, so walk backwards from the end to take the
        # n_top_words largest weights in descending order.
        ranked_columns = weights.argsort()[:-n_top_words - 1:-1]
        top_terms = [feature_names[col] for col in ranked_columns]
        header = "Topic #%d: " % topic_idx
        print(header + ", ".join(top_terms))
    print()

if __name__ == '__main__':
    # Script entry point: fit a TF-IDF + NMF topic model on a text file and
    # print the top terms of each topic.
    filename = 'articles.txt'
    n_topics = 20
    n_top_words = 15

    # Each line of the input file is treated as one document.
    with open(filename, 'r') as f:
        text_data = f.readlines()

    # Unigrams and bigrams, English stop words removed; terms appearing in
    # more than 95% of the documents are dropped.
    tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 2),
                            max_df=0.95, min_df=1)
    doc_term_mat = tfidf.fit_transform(text_data)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer its replacement and fall back only on older installs.
    if hasattr(tfidf, 'get_feature_names_out'):
        tfidf_feature_names = tfidf.get_feature_names_out()
    else:
        tfidf_feature_names = tfidf.get_feature_names()

    print('\nTFIDF-NMF topics: \n')
    # random_state is fixed so repeated runs produce the same factorization.
    nmf = NMF(n_components=n_topics, random_state=0).fit(doc_term_mat)
    print_top_words(nmf, tfidf_feature_names, n_top_words)


TFIDF-NMF topics: 

Topic #0: caregivers, caregiving, caregiver, care, burden, nhp, grief, family, dementia, study, american, cdr, grandparents, adjustment, elder
Topic #1: ageism, stereotypes, palmore, age, implicit, ageist, implicit ageism, levy, older, old, people, survey, age stereotypes, workers, racism
Topic #2: care, home care, home, sufficiency, unmet, quality, sufficiency care, informal, claimants, needs, formal, hha, recipients, functional, quality circumstances
Topic #3: mds, nh, pain, residents, gds short, gds, nhppt, vas, staff, nh residents, performance, qi, site, scale, interview
Topic #4: depression, emotions, gds, ces, wav, subthreshold, minor, symptoms, subthreshold depression, minor depression, depressive, nondepressed, scale, depressives, ces scale
Topic #5: residents, resident, care, edna, nursing, quality, ltc, contact, palliative, palliative care, staff, decision making, facilities, facility, satisfaction
Topic #6: widowers, widows, efficacy, self efficacy, wido