# Topic Modeling with scikit-learn (Testing)

- Brian Kalinowski 11/3/2019

In [41]:
import os
import string
# NOTE: keep these two wildcard imports in this order; typing re-exports names
# (e.g. Counter, OrderedDict) that shadow the collections ones.
from collections import *
from typing import *

import numpy as np
import pandas as pd
import pyLDAvis.sklearn
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [42]:
# Location of the fake-news CSV (article text plus its tokenized columns).
# The FAKE_NEWS_CSV environment variable overrides the location so the notebook
# can run on another machine; when it is unset the original local path is used.
FAKE_NEWS_CSV_PATH = os.environ.get(
    'FAKE_NEWS_CSV',
    '/Users/briankalinowski/Desktop/Data/fake_news_nlp_content.csv',
)
fake_news_content = pd.read_csv(FAKE_NEWS_CSV_PATH)

# Preview the first rows to confirm the expected columns were loaded.
fake_news_content.head()

Unnamed: 0,title,text,type,tokenized_headline,tokenized_content
0,Muslims BUSTED They Stole Millions In Govt Ben...,Print They should pay all the back all the mon...,bias,muslims,somalis
1,Re Why Did Attorney General Loretta Lynch Plea...,Why Did Attorney General Loretta Lynch Plead T...,bias,loretta lynch,loretta lynch barracuda brigade iran loretta l...
2,BREAKING Weiner Cooperating With FBI On Hillar...,Red State Fox News Sunday reported this mornin...,bias,weiner fbi hillary email investigation,red state fox news anthony weiner fbi hillary ...
3,PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...,Email Kayla Mueller was a prisoner and torture...,bias,donald j. trump,kayla mueller isis carl mueller donald trump
4,FANTASTIC! TRUMPS 7 POINT PLAN To Reform Healt...,Email HEALTHCARE REFORM TO MAKE AMERICA GREAT ...,bias,notokes,house senate obamacare obamacare democrats oba...


In [43]:
# Bag-of-words term counts over the tokenized article content.
#   min_df=10      -> ignore terms that appear in fewer than 10 documents
#   stop_words     -> drop the 'notokes' token (presumably a placeholder for
#                     rows with no extracted tokens; confirm against the
#                     preprocessing step)
vectorizer = CountVectorizer(
    stop_words=['notokes'],
    min_df=10,
)

# Learn the vocabulary and build the sparse document-term matrix in one pass.
content_vectorized = vectorizer.fit_transform(fake_news_content['tokenized_content'])

In [44]:
# Fit a 10-topic LDA model on the document-term counts using online
# variational Bayes, limited to 10 passes; verbose output logs each iteration.
#
# random_state is pinned because the fit starts from a random initialisation:
# without a seed the learned topics, and any topic assignment derived from
# them, differ on every run.
lda = LatentDirichletAllocation(
    n_components=10,
    max_iter=10,
    learning_method='online',
    random_state=42,
    verbose=True,
)

# Document-topic matrix: one row per document, one column per topic.
fake_news_topics = lda.fit_transform(content_vectorized)

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


In [45]:
# Render pyLDAvis output inline in the notebook.
pyLDAvis.enable_notebook()

# Build the interactive topic dashboard from the fitted LDA model, the
# document-term matrix and the vectorizer, using t-SNE for the topic layout.
# NOTE(review): with sort_topics=True the dashboard's topic numbering may not
# line up with the LDA component indices used elsewhere in this notebook.
# Confirm before cross-referencing the two.
dash = pyLDAvis.sklearn.prepare(
    lda,
    content_vectorized,
    vectorizer,
    mds='tsne',
    sort_topics=True,
)
dash

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [46]:
# Label each document with its dominant topic: the column index of the largest
# weight in that document's row of the document-topic matrix.
#
# argmax(axis=1) computes this for all rows in one vectorized call. The Series
# is built on the frame's own index so the labels stay aligned row-for-row even
# if the frame's index is not 0..n-1; a length mismatch raises instead of
# silently producing NaN.
#
# NOTE(review): a document whose topic weights are all equal is labelled
# topic 0, because argmax returns the first maximum. Consider whether such
# rows should be excluded from per-topic counts.
news_topics = pd.Series(
    fake_news_topics.argmax(axis=1),
    index=fake_news_content.index,
)
fake_news_content['topic_id'] = news_topics

# Preview the frame with the new topic_id column.
fake_news_content.head()

Unnamed: 0,title,text,type,tokenized_headline,tokenized_content,topic_id
0,Muslims BUSTED They Stole Millions In Govt Ben...,Print They should pay all the back all the mon...,bias,muslims,somalis,0
1,Re Why Did Attorney General Loretta Lynch Plea...,Why Did Attorney General Loretta Lynch Plead T...,bias,loretta lynch,loretta lynch barracuda brigade iran loretta l...,3
2,BREAKING Weiner Cooperating With FBI On Hillar...,Red State Fox News Sunday reported this mornin...,bias,weiner fbi hillary email investigation,red state fox news anthony weiner fbi hillary ...,3
3,PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...,Email Kayla Mueller was a prisoner and torture...,bias,donald j. trump,kayla mueller isis carl mueller donald trump,1
4,FANTASTIC! TRUMPS 7 POINT PLAN To Reform Healt...,Email HEALTHCARE REFORM TO MAKE AMERICA GREAT ...,bias,notokes,house senate obamacare obamacare democrats oba...,1


In [100]:
# Raise the display cap so tables of up to 100 rows are shown.
pd.options.display.max_rows = 100

# Count the non-null 'type' entries for every (topic_id, type) pair and keep
# the result as a one-column DataFrame indexed by that pair.
news_clusters = (
    fake_news_content
    .groupby(['topic_id', 'type'])['type']
    .count()
    .to_frame()
)
news_clusters

Unnamed: 0_level_0,Unnamed: 1_level_0,type
topic_id,type,Unnamed: 2_level_1
0,bias,63
0,bs,1551
0,conspiracy,45
0,hate,35
0,junksci,30
0,satire,23
0,state,7
1,bias,211
1,bs,2135
1,conspiracy,77


In [80]:
# Wrap the document-topic matrix in a DataFrame for a tabular preview:
# one row per document, one column per topic.
df = pd.DataFrame(data=fake_news_topics)
df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1
1,0.002941,0.002942,0.366149,0.468045,0.002941,0.145217,0.002941,0.002941,0.002941,0.002941
2,0.004554,0.004547,0.004549,0.679953,0.283663,0.004546,0.004547,0.004547,0.004546,0.004549
3,0.02,0.429468,0.02,0.02,0.221315,0.209217,0.02,0.02,0.02,0.02
4,0.073333,0.653936,0.006667,0.006668,0.006667,0.006667,0.006667,0.112922,0.119806,0.006667


In [82]:
# Inspect the full topic distribution of the second document (row index 1).
fake_news_topics[1, :]

array([0.00294137, 0.00294164, 0.36614877, 0.46804508, 0.00294122,
       0.14521709, 0.00294118, 0.00294125, 0.00294118, 0.00294124])

In [83]:
# Re-display the first five rows of the frame, including topic_id.
fake_news_content.head(n=5)

Unnamed: 0,title,text,type,tokenized_headline,tokenized_content,topic_id
0,Muslims BUSTED They Stole Millions In Govt Ben...,Print They should pay all the back all the mon...,bias,muslims,somalis,0
1,Re Why Did Attorney General Loretta Lynch Plea...,Why Did Attorney General Loretta Lynch Plead T...,bias,loretta lynch,loretta lynch barracuda brigade iran loretta l...,3
2,BREAKING Weiner Cooperating With FBI On Hillar...,Red State Fox News Sunday reported this mornin...,bias,weiner fbi hillary email investigation,red state fox news anthony weiner fbi hillary ...,3
3,PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...,Email Kayla Mueller was a prisoner and torture...,bias,donald j. trump,kayla mueller isis carl mueller donald trump,1
4,FANTASTIC! TRUMPS 7 POINT PLAN To Reform Healt...,Email HEALTHCARE REFORM TO MAKE AMERICA GREAT ...,bias,notokes,house senate obamacare obamacare democrats oba...,1
