# Topic Modeling Assessment Project

In [2]:
import pandas as pd

In [3]:
# Load the dataset from a CSV file (path is relative to the working directory) into a DataFrame.
df = pd.read_csv('ds_cat_head_descr.csv')

In [4]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,category,head_descr
0,CRIME,"there were 2 mass shootings in teas last week,..."
1,ENTERTAINMENT,hugh grant marries for the first time at age 5...
2,ENTERTAINMENT,jim carrey blasts 'castrato' adam schiff and d...
3,ENTERTAINMENT,julianna margulies uses donald trump poop bags...
4,ENTERTAINMENT,morgan freeman 'devastated' that seual harassm...


# Preprocessing

#### Task: Use TF-IDF Vectorization to create a vectorized document term matrix. You may want to explore the max_df and min_df parameters.

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
# max_df=0.95 drops terms found in more than 95% of the documents, min_df=1
# keeps any term that occurs in at least one document, and the built-in
# English stop-word list is removed from the vocabulary.
tfidf = TfidfVectorizer(
    max_df=0.95,
    min_df=1,
    stop_words='english',
)

In [7]:
# Learn the vocabulary and IDF weights from the 'head_descr' column and build the
# sparse document-term matrix (one row per document, one column per term).
dtm = tfidf.fit_transform(df['head_descr'])

In [8]:
# Show the sparse-matrix summary: shape, dtype and number of stored elements.
dtm

<148982x76968 sparse matrix of type '<class 'numpy.float64'>'
	with 2393412 stored elements in Compressed Sparse Row format>

In [9]:
# Vocabulary size, i.e. the number of columns in the document-term matrix.
# The fitted `vocabulary_` mapping (term -> column index) is used instead of
# `get_feature_names()`, which was deprecated in scikit-learn 1.0 and removed
# in 1.2; `vocabulary_` is available on every release and avoids building the
# full list of names just to count it.
len(tfidf.vocabulary_)

76968

In [10]:
# `get_feature_names()` was removed in scikit-learn 1.2. Prefer its replacement
# `get_feature_names_out()` and fall back to the old name on older releases, so
# the cell runs regardless of the installed version.
_feature_names_fn = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
print(_feature_names_fn()[0])

00


# Non-negative Matrix Factorization

#### TASK: Using Scikit-Learn, create an instance of NMF with 41 expected components (use random_state=42).

In [11]:
from sklearn.decomposition import NMF

In [12]:
# One component per expected topic; the fixed seed keeps the factorization
# reproducible between runs.
nmf_model = NMF(
    n_components=41,
    random_state=42,
)

In [13]:
# Fit the NMF model on the TF-IDF document-term matrix; as the last expression
# of the cell, the fitted estimator's repr is displayed.
nmf_model.fit(dtm)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
    n_components=41, random_state=42, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

#### TASK: Print out the top 15 most common words for each of the 41 topics.

In [14]:
# Resolve the vocabulary once, outside the loop: the original rebuilt the full
# feature-name list for every topic. `get_feature_names()` was also removed in
# scikit-learn 1.2, so prefer `get_feature_names_out()` and fall back to the
# old name on older releases.
_feature_names_fn = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
feature_names = _feature_names_fn()

# Each row of `components_` holds one topic's weight for every vocabulary term.
for index, topic in enumerate(nmf_model.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    # argsort is ascending, so the last 15 indices are the highest-weighted
    # terms (the strongest term is printed last).
    print([feature_names[i] for i in topic.argsort()[-15:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['tips', 'sleep', 'maintaining', 'prayers', 'meditations', 'spiritual', 'mindful', 'awareness', 'personal', 'meditation', 'hope', 'practice', 'daily', 'help', 'need']


THE TOP 15 WORDS FOR TOPIC #1
['election', 'presidency', 'doesn', 'america', 'supporters', 'news', 'russia', 'stephen', 'ban', 'colbert', 'administration', 'president', 'campaign', 'donald', 'trump']


THE TOP 15 WORDS FOR TOPIC #2
['album', 'shows', 'trailer', 'book', 'report', 'resolutions', 'hampshire', 'orleans', 'fashion', 'jersey', 'study', 'times', 'city', 'york', 'new']


THE TOP 15 WORDS FOR TOPIC #3
['look', 'beauty', 'photo', 'say', 'stylelist', 'huffpoststyle', 'instagram', 'sure', 'style', 'tumblr', 'huffpost', 'pinterest', 'check', 'facebook', 'twitter']


THE TOP 15 WORDS FOR TOPIC #4
['ladies', 'assault', 'abortion', '20', 'young', 'violence', 'gender', 'rights', 'female', 'march', 'girls', 'seual', 'woman', 'men', 'women']


THE TOP 15 WORDS FOR TOPIC #5
['honor', 'national

#### TASK: Add a new column to the original dataframe that labels each document (headline and description) with one of the 41 topic categories.

In [15]:
# Look at the frame again before the topic column is added.
df.head()

Unnamed: 0,category,head_descr
0,CRIME,"there were 2 mass shootings in teas last week,..."
1,ENTERTAINMENT,hugh grant marries for the first time at age 5...
2,ENTERTAINMENT,jim carrey blasts 'castrato' adam schiff and d...
3,ENTERTAINMENT,julianna margulies uses donald trump poop bags...
4,ENTERTAINMENT,morgan freeman 'devastated' that seual harassm...


In [16]:
# Project every document onto the fitted topics: one row per document and one
# column per topic weight (expected shape n_documents x 41 — confirm with .shape).
topic_results = nmf_model.transform(dtm)

In [17]:
# Column index of the largest topic weight in each row, i.e. the dominant
# topic of every document. Computed once and reused for the new column.
dominant_topic = topic_results.argmax(axis=1)
dominant_topic

df['Topic'] = dominant_topic

df.head(10)

Unnamed: 0,category,head_descr,Topic
0,CRIME,"there were 2 mass shootings in teas last week,...",17
1,ENTERTAINMENT,hugh grant marries for the first time at age 5...,15
2,ENTERTAINMENT,jim carrey blasts 'castrato' adam schiff and d...,1
3,ENTERTAINMENT,julianna margulies uses donald trump poop bags...,38
4,ENTERTAINMENT,morgan freeman 'devastated' that seual harassm...,40
5,ENTERTAINMENT,donald trump is lovin' new mcdonald's jingle i...,1
6,ENTERTAINMENT,what to watch on amazon prime that’s new this ...,26
7,ENTERTAINMENT,mike myers reveals he'd 'like to' do a fourth ...,2
8,ENTERTAINMENT,what to watch on hulu that’s new this week you...,26
9,ENTERTAINMENT,justin timberlake visits teas school shooting ...,32


In [20]:
# First ten CRIME rows, to see which topics that category was assigned to.
is_crime = df['category'] == 'CRIME'
df[is_crime].head(10)

Unnamed: 0,category,head_descr,Topic
0,CRIME,"there were 2 mass shootings in teas last week,...",17
28,CRIME,rachel dolezal faces felony charges for welfar...,28
36,CRIME,"man faces charges after pulling knife, stun gu...",40
38,CRIME,2 people injured in indiana school shooting a ...,32
179,CRIME,maryland police charge 3 church leaders with p...,40
219,CRIME,florida police report 2 dead after standoff at...,40
304,CRIME,"'this isn’t pakistan, bitch': video captures d...",16
349,CRIME,these are the victims of the santa fe high sch...,32
396,CRIME,hospice overdosed patients to 'hasten their de...,7
398,CRIME,former wwf wrestler severely beaten outside ca...,38


In [21]:
# First ten rows assigned to topic 19, to see which categories it mixes.
in_topic_19 = df['Topic'] == 19
df[in_topic_19].head(10)

Unnamed: 0,category,head_descr,Topic
33,COMEDY,trump's new 'maga'-themed swimwear sinks on tw...,19
48,ENTERTAINMENT,hollywood doesn't need 'difficult' men to make...,19
61,POLITICS,trump's new eecutive orders make it easier to ...,19
112,POLITICS,cynthia nion vows to keep fighting after (pred...,19
273,POLITICS,judge orders teas to make voter registration e...,19
388,POLITICS,sen. dean heller’s campaign paid his social me...,19
417,POLITICS,business groups might be quietly killing a bil...,19
423,POLITICS,"facing farm bill vote problems, gop leaders ma...",19
440,IMPACT,the battle to save our dying soil this camp in...,19
464,ENTERTAINMENT,gq epertly spoofs vanity fair with their annua...,19
