In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import pickle as pkl

### Import data

In [34]:
# Directory that holds the dataset files (relative to the notebook)
data_dir_path = './data'

# Load the training set from `train.csv`
train_csv_path = data_dir_path + "/train.csv"
df = pd.read_csv(train_csv_path)

# Model input is the `text` column; the target is the `label` column
X_train = df['text']
y = df['label']

In [35]:
# Preview the first five rows to sanity-check the loaded columns
df.head(n=5)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [36]:
# LDA is a probabilistic model over word counts, so it is fitted on raw term
# counts (CountVectorizer) rather than on TF-IDF weights.
count_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=5000, stop_words='english')

# Fit and transform the training data.
# Missing texts are replaced with empty strings *before* the unicode cast so
# they are not stringified (e.g. to 'nan') and then counted as a real word.
count_train = count_vectorizer.fit_transform(X_train.fillna('').astype('U'))

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# use `get_feature_names_out()` when available and fall back on old versions.
# Wrapped in `list(...)` so `feature_names` stays a plain list either way.
if hasattr(count_vectorizer, 'get_feature_names_out'):
    feature_names = list(count_vectorizer.get_feature_names_out())
else:
    feature_names = list(count_vectorizer.get_feature_names())

### Run LDA

In [37]:
def display_topics(model, feature_names, num_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    For each row of ``model.components_`` a ``Topic <index>:`` header line is
    printed, followed by one line with the top words separated by spaces,
    ordered from highest to lowest weight.

    Parameters
    ----------
    model : object
        Any object exposing ``components_``, an iterable of 1-D arrays that
        support ``argsort()`` (one array of per-word weights per topic).
    feature_names : sequence of str
        Vocabulary; ``feature_names[i]`` is the word for column ``i``.
    num_top_words : int
        How many words to print per topic.
    """
    for topic_number, weights in enumerate(model.components_):
        # argsort is ascending: reverse it, then keep the leading entries.
        best_columns = weights.argsort()[::-1][:num_top_words]
        print("Topic %d:" % topic_number)
        print(" ".join(feature_names[col] for col in best_columns))



# Fit an LDA topic model on the term-count matrix (fixed seed for repeatability)
num_topics = 10
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda.fit(count_train)
# Optional: cache the fitted model with pickle instead of refitting
# pkl.dump(lda, open('lda_10.pkl', 'wb'))
# lda = pkl.load(open('lda_10.pkl', 'rb'))

# Shape of the topic-word matrix: (number of topics, vocabulary size)
print(lda.components_.shape)

# Show the top words of each topic
num_top_words = 10
display_topics(lda, feature_names, num_top_words)

(10, 5000)
Topic 0:
said like just time people new years don life way
Topic 1:
trump president clinton donald said election people party campaign republican
Topic 2:
school students clinton university state million foundation public money education
Topic 3:
people percent world like new years government money economic year
Topic 4:
russia war military syria united russian government states american world
Topic 5:
said law court federal health state new states care immigration
Topic 6:
clinton hillary fbi media news election emails investigation email comey
Topic 7:
said police people city state officers man killed officials according
Topic 8:
mr said ms trump new president officials united company did
Topic 9:
la twitter el 2017 en que obama com 2016 european
