In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import pickle as pkl

# import data

In [8]:
# Directory that holds the CSV datasets, relative to the notebook.
data_dir_path = './datasets'

# Read `train.csv` from the dataset directory into a DataFrame.
train_csv_path = data_dir_path + "/train.csv"
df = pd.read_csv(train_csv_path)

# `y` is the `label` column; `X_train` is the `text` column.
y = df['label']
X_train = df.text

# Preview the first rows of the loaded frame.
df.head()

In [10]:
# LDA is a probabilistic graphical model over word counts, so it is fed raw
# term counts (CountVectorizer) rather than TF-IDF weights.
count_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=5000, stop_words='english')

# Fit and transform the training data. `astype('U')` coerces the column to
# unicode strings before it is vectorized.
count_train = count_vectorizer.fit_transform(X_train.astype('U'))

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out()` and fall back only on older releases.
# `list(...)` keeps `feature_names` a plain list, as it was before.
if hasattr(count_vectorizer, 'get_feature_names_out'):
    feature_names = list(count_vectorizer.get_feature_names_out())
else:
    feature_names = count_vectorizer.get_feature_names()

# run lda

In [13]:
def display_topics(model, feature_names, num_top_words):
    """Print one line per topic listing its highest-weighted words.

    Args:
        model: object exposing `components_`, an iterable of per-topic
            weight arrays (one weight per vocabulary entry).
        feature_names: sequence mapping a vocabulary index to its word.
        num_top_words: how many words to print for each topic.

    Output format per topic: ``Topic <idx>: word1 word2 ...`` with words
    ordered from highest to lowest weight.
    """
    for topic_idx, weights in enumerate(model.components_):
        print("Topic %d:" % topic_idx, end=' ')
        # argsort is ascending, so walk backwards from the end to take the
        # `num_top_words` largest weights in descending order.
        ranked_ids = weights.argsort()[:-num_top_words - 1:-1]
        print(" ".join(feature_names[word_id] for word_id in ranked_ids))



# Run LDA
num_topics = 10
# Cache file is keyed by the topic count ('lda_10.pkl' for num_topics = 10),
# so changing `num_topics` cannot silently load a model of the wrong size.
lda_cache_path = 'lda_%d.pkl' % num_topics

# Reuse the pickled model when it exists; otherwise fit it and cache it so the
# notebook still runs top-to-bottom on a fresh checkout.
# NOTE: pickle.load can execute arbitrary code. Only load cache files that
# were produced by this notebook.
try:
    with open(lda_cache_path, 'rb') as cache_file:
        lda = pkl.load(cache_file)
except FileNotFoundError:
    lda = LatentDirichletAllocation(n_components=num_topics, random_state=42).fit(count_train)
    with open(lda_cache_path, 'wb') as cache_file:
        pkl.dump(lda, cache_file)
print(lda.components_.shape)

# display
num_top_words = 10
display_topics(lda, feature_names, num_top_words)

(10, 5000)
Topic 0: said like just time people new years don life way
Topic 1: trump president clinton donald said election people party campaign republican
Topic 2: school students clinton university state million foundation public money education
Topic 3: people percent world like new years government money economic year
Topic 4: russia war military syria united russian government states american world
Topic 5: said law court federal health state new states care immigration
Topic 6: clinton hillary fbi media news election emails investigation email comey
Topic 7: said police people city state officers man killed officials according
Topic 8: mr said ms trump new president officials united company did
Topic 9: la twitter el 2017 en que obama com 2016 european


# train and evaluate classifier

In [67]:
from getEmbeddings import getEmbeddings
import matplotlib.pyplot as plt
import numpy as np
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from skorch import NeuralNetClassifier
import torch
import torch.nn as nn
import torch.nn.functional as F


class FNN(nn.Module):
    """Feed-forward classifier: three ReLU hidden layers and a softmax output.

    Args:
        input_dim: size of each input feature vector (default 300).
        hidden_dim: width of the first two hidden layers (default 256).
        bottleneck_dim: width of the third hidden layer (default 80).
        num_classes: number of output classes (default 2).

    With the defaults the layout is the original fixed
    300 -> 256 -> 256 -> 80 -> 2 network, so `FNN()` behaves as before.
    """

    def __init__(self, input_dim=300, hidden_dim=256, bottleneck_dim=80, num_classes=2):
        super().__init__()
        # Layers are created in forward order and keep their original
        # attribute names, so parameter names and init order are unchanged.
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(hidden_dim, bottleneck_dim)
        self.relu3 = nn.ReLU()
        self.fc4 = nn.Linear(bottleneck_dim, num_classes)

    def forward(self, x):
        """Map a (batch, input_dim) tensor to (batch, num_classes) probabilities."""
        x = self.relu1(self.fc1(x))
        x = self.relu2(self.fc2(x))
        x = self.relu3(self.fc3(x))
        # Softmax over the class dimension: the module returns probabilities,
        # not logits.
        return F.softmax(self.fc4(x), dim=1)


# prepare data
# Reuse the cached embedding arrays when all four files exist; otherwise
# compute them from the training CSV and cache them for the next run.
try:
    xtr = np.load('./xtr.npy')
    xte = np.load('./xte.npy')
    ytr = np.load('./ytr.npy')
    yte = np.load('./yte.npy')
except FileNotFoundError:
    print('getting embeddings...')
    xtr, xte, ytr, yte = getEmbeddings("datasets/train.csv")
    np.save('./xtr', xtr)
    np.save('./xte', xte)
    np.save('./ytr', ytr)
    np.save('./yte', yte)

# The network's Linear layers work on float32 inputs.
xtr = np.asarray(xtr, dtype=np.float32)
xte = np.asarray(xte, dtype=np.float32)

x_train, x_test, y_train, y_test = train_test_split(xtr, ytr, test_size=0.2, random_state=42)

# Fit the encoder once, on the training labels only, and reuse that mapping
# for the test labels. Re-fitting on the test labels could assign different
# integer ids to the same class. `transform` raises ValueError if the test
# split contains a label that was never seen in training.
label_encoder = LabelEncoder()
encoded_y = label_encoder.fit_transform(y_train).astype(np.int64)

# The evaluation step takes argmax over axis 1 of `encoded_y_test`, so the
# test labels stay one-hot encoded (int64, one column per training class).
test_label_ids = label_encoder.transform(y_test)
encoded_y_test = np.eye(len(label_encoder.classes_), dtype=np.int64)[test_label_ids]

# fit model
# Seed torch so weight initialisation and per-epoch shuffling are repeatable.
torch.manual_seed(42)
net = NeuralNetClassifier(
    FNN,
    max_epochs=10,
    lr=0.1,
    # Shuffle training data on each epoch
    iterator_train__shuffle=True,
    # No internal validation split: all of x_train is used for training.
    train_split=None,
)
net.fit(x_train, encoded_y)
print("Model Trained!")

# Close the file deterministically so the pickle is fully written before the
# next cell reads it back.
with open('model.pkl', 'wb') as model_file:
    pkl.dump(net, model_file)

  epoch    train_loss     dur
-------  ------------  ------
      1        [36m0.6273[0m  0.7848
      2        [36m0.3111[0m  0.7765
      3        [36m0.2602[0m  0.7738
      4        [36m0.2406[0m  0.7747
      5        [36m0.2307[0m  0.7816
      6        [36m0.2154[0m  0.7750
      7        [36m0.2062[0m  0.7731
      8        [36m0.1935[0m  0.7811
      9        [36m0.1900[0m  0.7511
     10        [36m0.1812[0m  0.7797
Model Trained!


  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


In [68]:
# evaluate model
# NOTE: pickle.load can execute arbitrary code. Only load a `model.pkl` that
# was written by this notebook.
with open('model.pkl', 'rb') as model_file:
    net = pkl.load(model_file)

# Predicted class = index of the highest predicted probability per row.
probabs = net.predict_proba(x_test)
y_pred = np.argmax(probabs, axis=1)

# `encoded_y_test` is one-hot, so argmax recovers the integer class ids.
acc = np.mean(y_pred == np.argmax(encoded_y_test, axis=1))
print(f"acc: {acc:0.2f}")

acc: 0.92
