In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    One line is printed per row of ``model.components_`` in the form
    ``Topic #<index>: term, term, ...`` with terms ordered by descending
    weight; a single blank line is printed after the last topic.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight rows; each row must provide ``argsort()``.
        feature_names: Indexable that maps a column index to its term.
        n_top_words: Number of terms to list for each topic.
    """
    for topic_no, weights in enumerate(model.components_):
        # argsort() is ascending, so step backwards from the end to pick
        # the n largest weights, largest first.
        ranked_columns = weights.argsort()[:-n_top_words - 1:-1]
        top_terms = [feature_names[col] for col in ranked_columns]
        print("Topic #%d: " % topic_no + ", ".join(top_terms))
    print()

if __name__ == '__main__':
    # Script entry point: fit a TF-IDF + NMF topic model on a text file in
    # which every line is treated as one document, then print the top terms
    # of each topic.
    filename = 'articles.txt'
    n_topics = 5
    n_top_words = 10

    # Explicit encoding so decoding does not depend on the platform locale.
    with open(filename, 'r', encoding='utf-8') as f:
        text_data = f.readlines()

    # Unigrams and bigrams, English stop words removed; terms appearing in
    # more than 95% of the documents are dropped.
    tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 2),
                            max_df=0.95, min_df=1)
    doc_term_mat = tfidf.fit_transform(text_data)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer its replacement and fall back only on older releases.
    if hasattr(tfidf, 'get_feature_names_out'):
        tfidf_feature_names = tfidf.get_feature_names_out()
    else:
        tfidf_feature_names = tfidf.get_feature_names()

    print('\nTFIDF-NMF topics: \n')
    # Fixed random_state keeps the factorization reproducible across runs.
    nmf = NMF(n_components=n_topics, random_state=0).fit(doc_term_mat)
    print_top_words(nmf, tfidf_feature_names, n_top_words)


TFIDF-NMF topics: 

Topic #0: caregivers, caregiver, caregiving, care, burden, dementia, nhp, family, intervention, grief
Topic #1: ageism, stereotypes, palmore, age, implicit, ageist, implicit ageism, levy, older, people
Topic #2: care, ltc, home, services, home care, term care, medicaid, long term, policy, nursing
Topic #3: residents, mds, resident, nursing, care, staff, nh, quality, facilities, pain
Topic #4: older, depression, health, religious, age, study, respondents, sample, widows, table

