In [4]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

### Import data

In [2]:
# Directory holding the dataset, relative to the notebook
data_dir_path = './data'

# Read `fake_or_real_news.csv` into a DataFrame
df = pd.read_csv(data_dir_path + "/fake_or_real_news.csv")

# Target vector: the `label` column
y = df['label']

# Hold out 33% of the articles as a test set; the seed fixes the split
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    y,
    test_size=0.33,
    random_state=53,
)

In [3]:
# LDA is a probabilistic graphical model over word counts, so it must be fed
# raw term counts (a CountVectorizer) rather than TF-IDF weights.
# Terms appearing in more than 95% of documents or in fewer than 2 documents
# are dropped, along with English stop words.
count_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')

# Learn the vocabulary on the training data only, then reuse it for the test data
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement when available and fall back for older releases.
# `list(...)` keeps `feature_names` a plain list in both cases.
if hasattr(count_vectorizer, "get_feature_names_out"):
    feature_names = list(count_vectorizer.get_feature_names_out())
else:
    feature_names = count_vectorizer.get_feature_names()

In [15]:
# Number of latent topics to extract
num_topics = 5

# Configure the LDA estimator (seeded so the topics are repeatable) ...
lda = LatentDirichletAllocation(
    n_components=num_topics,
    random_state=42,
)
# ... then fit it on the training term counts. `fit` returns the estimator
# itself; assigning it keeps the cell from echoing the estimator repr.
lda = lda.fit(count_train)

In [16]:
def display_topics(model, feature_names, num_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    Parameters
    ----------
    model : object exposing ``components_``
        Iterating ``components_`` must yield one array of per-term weights
        per topic (each array must support ``argsort``).
    feature_names : sequence of str
        Vocabulary terms, indexable by the positions used in ``components_``.
    num_top_words : int
        How many words to print for each topic.

    For each topic, prints a ``Topic <index>:`` header line followed by a
    line with the top words separated by single spaces, heaviest first.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries
        heaviest_first = weights.argsort()[::-1]
        top_positions = heaviest_first[:num_top_words]
        top_words = [feature_names[pos] for pos in top_positions]

        print("Topic %d:" % (topic_idx))
        print(" ".join(top_words))
        
# Show the ten heaviest words for each LDA topic
num_top_words = 10
display_topics(
    model=lda,
    feature_names=feature_names,
    num_top_words=num_top_words,
)

Topic 0:
clinton hillary said obama court fbi president state emails law
Topic 1:
people just like world time life years don com know
Topic 2:
said war russia military police people government syria state world
Topic 3:
said sanders obama clinton president state new states year house
Topic 4:
trump said clinton republican party campaign donald people election new


In [17]:
# Shape of the topic-term matrix: one row per topic (num_topics) and,
# presumably, one column per vocabulary term kept by the vectorizer.
lda.components_.shape

(5, 33179)