In [6]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import pickle as pkl
from sklearn import metrics
from keras_fake_news_detector.library.classifiers.feedforward_networks import GloveFeedforwardNet
import torch
import torch.nn as nn
import torch.nn.functional as F

# Import data

In [7]:
# Directory (relative to the notebook) that holds the dataset.
data_dir_path = './data'

# Read the labelled news articles from `fake_or_real_news.csv`.
df = pd.read_csv(data_dir_path + "/fake_or_real_news.csv")

# Targets are taken straight from the `label` column.
y = df['label']

# Hold out 33% of the articles for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    y,
    test_size=0.33,
    random_state=53,
)

In [8]:
# LDA needs raw term counts (not TF-IDF weights) because it is a
# probabilistic graphical model.
# max_df=0.95 ignores terms that occur in more than 95% of the documents,
# min_df=2 ignores terms that occur in fewer than 2 documents.
count_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')

# Learn the vocabulary on the training texts only, then reuse it for the test
# texts so both matrices share the same columns.
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back for older installations.
# The result is kept as a plain list so downstream indexing is unchanged.
if hasattr(count_vectorizer, "get_feature_names_out"):
    feature_names = list(count_vectorizer.get_feature_names_out())
else:
    feature_names = count_vectorizer.get_feature_names()

# Run LDA

In [None]:
def display_topics(model, feature_names, num_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    Args:
        model: Object exposing ``components_``, an iterable with one row of
            per-word weights for each topic.
        feature_names: Sequence mapping a column index of ``components_`` to
            the corresponding vocabulary word.
        num_top_words: How many words to print per topic, ordered from the
            largest weight downwards.

    Returns:
        None. For each topic a ``Topic <idx>:`` header line is printed,
        followed by one line with the space-separated top words.
    """
    for topic_idx, weights in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        # argsort is ascending, so walk it backwards from the end to get the
        # `num_top_words` largest weights first.
        top_indices = weights.argsort()[:-num_top_words - 1:-1]
        top_words = [feature_names[i] for i in top_indices]
        print(" ".join(top_words))



# Run LDA
num_topics = 10
# The model was fitted once and cached on disk. To regenerate the cache:
# lda = LatentDirichletAllocation(n_components=num_topics, random_state=42).fit(count_train)
# pkl.dump(lda, open('lda_10.pkl', 'wb'))

# Load the cached model. The `with` block guarantees the file handle is
# closed even if unpickling fails.
# NOTE: unpickling can execute arbitrary code -- only load a file that you
# produced yourself.
with open('lda_10.pkl', 'rb') as lda_file:
    lda = pkl.load(lda_file)
print(lda.components_.shape)

# Display the top words of each topic.
num_top_words = 10
display_topics(lda, feature_names, num_top_words)

# Predict with classifier

In [10]:
# Seed NumPy's global RNG so NumPy-based randomness below is repeatable.
np.random.seed(42)

# Locations of the dataset, the data passed to `load_glove`, and the saved
# model artefacts (config + weights), all relative to the notebook.
data_dir_path = './data'
very_large_data_dir_path = './very_large_data'
model_dir_path = './models'
config_file_path = model_dir_path + '/' + GloveFeedforwardNet.model_name + '-config.npy'
weight_file_path = model_dir_path + '/' + GloveFeedforwardNet.model_name + '-weights.h5'

print('loading csv file ...')

# Import `fake_or_real_news.csv`
df = pd.read_csv(data_dir_path + "/fake_or_real_news.csv")

# Set `Y`: 1 for articles labelled 'REAL', 0 for every other label.
Y = [1 if label == 'REAL' else 0 for label in df.label]

# Only the `text` column is used as the feature, so `label` does not need to
# be dropped from `df`. (A bare `df.drop("label", axis=1)` returns a new frame
# and leaves `df` untouched, so it would have no effect here.)
X = df['text']

# The config is a pickled Python object stored in a .npy file;
# `allow_pickle=True` unpickles it, so only load a config file you trust.
config = np.load(config_file_path, allow_pickle=True).item()

# Rebuild the classifier from its saved config, then restore its weights and
# load the GloVe data it needs.
classifier = GloveFeedforwardNet(config)
classifier.load_weights(weight_file_path)
classifier.load_glove(very_large_data_dir_path)

loading csv file ...


In [11]:
# Keep 20% of the articles aside for evaluation; the seed fixes the split.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)
print('testing size: ', len(Xtest))

# Run the pre-trained classifier on the held-out texts.
print('start predicting ...')
pred = classifier.predict(Xtest)

# Fraction of held-out articles whose predicted label matches the true one.
score = metrics.accuracy_score(y_true=Ytest, y_pred=pred)
print("accuracy:   %0.3f" % score)

testing size:  1267
start predicting ...
records:  1267


W0103 07:23:47.364650 139668605216576 deprecation_wrapper.py:119] From /system/linux/anaconda3.7/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:422: The name tf.global_variables is deprecated. Please use tf.compat.v1.global_variables instead.



accuracy:   0.841


In [12]:
# Grab the first training article (a string) as a sample input.
# `Xtrain` keeps the shuffled original row labels, so `Xtrain[0]` would look
# up *label* 0 and raise KeyError whenever that row lands in the test split;
# `.iloc[0]` selects by position and always works.
x = Xtrain.iloc[0]

In [None]:
class FNN(nn.Module):
    """Thin ``nn.Module`` wrapper around a classifier's text preprocessing.

    The module has no layers of its own: calling it simply forwards the input
    to the ``transform_input_text`` callable taken from the wrapped classifier.
    """

    def __init__(self, classifier):
        """Store the classifier's ``transform_input_text`` callable.

        Args:
            classifier: Any object exposing a ``transform_input_text``
                attribute that can be called with the raw input.
        """
        super().__init__()
        self.transform_input_text = classifier.transform_input_text

    def forward(self, x: list):
        """Return ``transform_input_text(x)`` unchanged.

        Args:
            x: Raw input handed to ``transform_input_text``.
                NOTE(review): annotated as ``list`` but the notebook calls the
                module with a single string -- confirm which type the
                classifier's transform actually expects.

        Returns:
            Whatever ``transform_input_text`` returns (presumably the text
            converted to a vector -- verify against the classifier).
        """
        return self.transform_input_text(x)

# Wrap the loaded classifier in the FNN module.
net = FNN(classifier)
# Show the module's repr.
print(net)
# Print the sample text next to the result of passing it through the module.
print(x, net(x))

In [6]:
# Display the model object held by the classifier (a Keras `Sequential`
# according to the recorded output).
classifier.model

<keras.engine.sequential.Sequential at 0x7f0e65676f50>