In [10]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import pickle as pkl

### Import data

In [2]:
# Directory that holds the input CSV files
data_dir_path = './data'

# Read the labelled news articles from `fake_or_real_news.csv`
df = pd.read_csv(data_dir_path + "/fake_or_real_news.csv")

# Target vector: the `label` column of the frame
y = df['label']

# Hold out 33% of the articles for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    y,
    random_state=53,
    test_size=0.33,
)

In [6]:
# LDA is a probabilistic graphical model over word counts, so it must be fed
# raw term counts (CountVectorizer) rather than TF-IDF weights.
# max_df=0.95 drops terms that occur in more than 95% of the documents,
# min_df=2 drops terms that occur in fewer than 2 documents.
count_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')

# Learn the vocabulary on the training texts only, then reuse it for the test texts
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)

# Vocabulary terms, indexed by feature column.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on old installations.
# `.tolist()` keeps `feature_names` a plain list of str, as before.
if hasattr(count_vectorizer, "get_feature_names_out"):
    feature_names = count_vectorizer.get_feature_names_out().tolist()
else:
    feature_names = count_vectorizer.get_feature_names()

# Run LDA

In [12]:
def display_topics(model, feature_names, num_top_words):
    """Print the highest-weighted words of every topic in ``model``.

    Parameters
    ----------
    model : object
        Anything exposing ``components_``; it is iterated row by row and
        each row is treated as one topic's per-word weights.
    feature_names : sequence of str
        Word for each column index of ``model.components_``.
    num_top_words : int
        How many words to print per topic, heaviest first.

    Returns
    -------
    None
        For each topic, a ``Topic <idx>:`` header line and a line of
        space-separated words are written to stdout.
    """
    for topic_idx, weights in enumerate(model.components_):
        print("Topic %d:" % topic_idx)
        # argsort is ascending, so reverse it to rank the heaviest words first
        ranked = weights.argsort()[::-1]
        top_words = [feature_names[word_idx] for word_idx in ranked[:num_top_words]]
        print(" ".join(top_words))



# Run LDA
num_topics = 10
# Cache file for the fitted model; evaluates to 'lda_10.pkl' for 10 topics
lda_cache_path = 'lda_%d.pkl' % num_topics

# Reuse the cached model when it exists; otherwise fit it once and cache it,
# so the notebook still runs from a fresh checkout.
# NOTE(security): unpickling executes arbitrary code -- only load a cache
# file that was produced by this notebook.
try:
    with open(lda_cache_path, 'rb') as cache_file:
        lda = pkl.load(cache_file)
except FileNotFoundError:
    lda = LatentDirichletAllocation(n_components=num_topics, random_state=42).fit(count_train)
    with open(lda_cache_path, 'wb') as cache_file:
        pkl.dump(lda, cache_file)
print(lda.components_.shape)

# Show the highest-weighted words of each topic
num_top_words = 10
display_topics(lda, feature_names, num_top_words)

(10, 33179)
Topic 0:
clinton hillary fbi said emails president obama state department email
Topic 1:
people just world like new percent time years health year
Topic 2:
war russia military syria said russian government isis state world
Topic 3:
clinton said trump sanders campaign obama republican party democratic new
Topic 4:
bush rubio cruz republican political debate party trump said candidates
Topic 5:
said house court israel state marriage religious boehner congress ryan
Topic 6:
women saudi iran people states drug year veterans government war
Topic 7:
said police people black told just year law day man
Topic 8:
com obama infowars force http brain www text retired new
Topic 9:
trump clinton people hillary donald election like american just president


# load classifier

In [None]:
from sklearn import metrics
import pandas as pd
from sklearn.model_selection import train_test_split
from keras_fake_news_detector.library.classifiers.feedforward_networks import GloveFeedforwardNet
import numpy as np



# Seed NumPy's global random number generator
np.random.seed(42)

# Input/output locations
data_dir_path = './data'
very_large_data_dir_path = './very_large_data'  # passed to classifier.load_glove below
model_dir_path = './models'
config_file_path = model_dir_path + '/' + GloveFeedforwardNet.model_name + '-config.npy'
weight_file_path = model_dir_path + '/' + GloveFeedforwardNet.model_name + '-weights.h5'

print('loading csv file ...')

# Import `fake_or_real_news.csv`
df = pd.read_csv(data_dir_path + "/fake_or_real_news.csv")

# Set `y`: 1 for articles labelled 'REAL', 0 for every other label
Y = [1 if label == 'REAL' else 0 for label in df.label]

# Only the `text` column is used as model input. The previous
# `df.drop("label", axis=1)` discarded its result and therefore had no
# effect, so it has been removed.
X = df['text']

# The config is stored as a pickled object inside a .npy file;
# `.item()` unwraps it from the 0-d array that np.load returns.
config = np.load(config_file_path, allow_pickle=True).item()

# Rebuild the classifier from the saved config and weights
classifier = GloveFeedforwardNet(config)
classifier.load_weights(weight_file_path)
# presumably loads the GloVe embeddings from this directory -- TODO confirm
classifier.load_glove(very_large_data_dir_path)

# Hold out 20% of the articles for evaluation (fixed seed for reproducibility)
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=42)

print('testing size: ', len(Xtest))

print('start predicting ...')
pred = classifier.predict(Xtest)
print(pred)
score = metrics.accuracy_score(Ytest, pred)
print("accuracy:   %0.3f" % score)

loading csv file ...
