In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import nltk
from nltk import word_tokenize
from string import punctuation
import torch.optim as optim
from nltk.corpus import stopwords
from torch.utils.data import Dataset, DataLoader, random_split
import torch.nn.functional as F
import string

if torch.cuda.is_available():
    device = 'cuda'
    print('cuda')
else:
    device = 'cpu'
    print('cpu')
    
nltk.download('punkt')
nltk.download('stopwords')

cuda
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
df = pd.read_csv('/content/drive/MyDrive/fake-news/train.csv')
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [5]:
df.count()

id        20800
title     20242
author    18843
text      20761
label     20800
dtype: int64

In [6]:
df.dropna(inplace=True)
df.count()

id        18285
title     18285
author    18285
text      18285
label     18285
dtype: int64

In [7]:
print('real news count')
df[df['label']==0].count()

real news count


id        10361
title     10361
author    10361
text      10361
label     10361
dtype: int64

In [8]:
print('real news count')
df[df['label']==0].count()

real news count


id        10361
title     10361
author    10361
text      10361
label     10361
dtype: int64

In [9]:
stopWords = stopwords.words('english')

def preprocessing(raw_text):
    texts = [''.join([c for c in text.lower() if c not in punctuation]) for text in raw_text]
    texts = ''.join(texts)
    texts = [''.join([c for c in text.lower() if c not in '’']) for text in texts]
    texts = ''.join(texts)
    texts = [word for word in word_tokenize(texts) if word not in stopWords]
    texts = ' '.join(texts)
    return texts

df['title'] = df['title'].apply(preprocessing)
df['title'] = df['title'].astype(str)
df['title']

0        house dem aide didnt even see comeys letter ja...
1         flynn hillary clinton big woman campus breitbart
2                                    truth might get fired
3        15 civilians killed single us airstrike identi...
4        iranian woman jailed fictional unpublished sto...
                               ...                        
20795         rapper ti trump poster child white supremacy
20796    nfl playoffs schedule matchups odds new york t...
20797    macys said receive takeover approach hudsons b...
20798          nato russia hold parallel exercises balkans
20799                                      keeps f35 alive
Name: title, Length: 18285, dtype: object

In [10]:
count = [len(df['title'][i]) for i in np.array(df.index)]
pd.DataFrame(count).describe()

Unnamed: 0,0
count,18285.0
mean,60.250369
std,19.168149
min,0.0
25%,49.0
50%,61.0
75%,72.0
max,361.0


In [11]:
class FakeNewsDataset(Dataset):
    def __init__(self):
        self.titles = df["title"].values
        self.labels = df["label"].values
        self.word2id = self.build_vocab()
        self.max_len = 72
  
    def build_vocab(self):
        # added preprocessing: changing to lowercase and removing punctuations
        title_tokens = [word_tokenize(title.lower()) for title in self.titles]
        title_tokens = [token for title_token in title_tokens for token in title_token]
        # removed duplicate tokens so that length of dictionary and the max index of a word remain the same
        tokens = list(set([token for token in title_tokens if token.isalpha()]))
        word2id = {token: i+1 for i, token in enumerate(tokens)}
        return word2id

    def __getitem__(self, idx):
        # load data and labels
        title = self.titles[idx]
        label = self.labels[idx]
        title_vector, label_vector = self.transform(title, label)
        return title_vector, label_vector
  
    def transform(self, title, label):
        tokens = word_tokenize(title.lower())
        tokens = [token.lower() for token in tokens if token.isalpha()]
        title_vector = [self.word2id[token] for token in tokens]
        if self.max_len > len(title_vector):
            diff = self.max_len - len(title_vector)
            pad = [0 for i in range(diff)]
            title_vector = title_vector + pad
        if len(title_vector) > self.max_len:
              title_vector = title_vector[:self.max_len]
        title_vector = torch.Tensor(title_vector).to(torch.int64)
        label_vector = torch.Tensor([label]).to(torch.int64)
        return title_vector, label_vector
  
    def __len__(self):
        return len(self.titles)

In [12]:
fnd = FakeNewsDataset()
train_data_loader = DataLoader(fnd, shuffle=True, batch_size=512)
vocab_length = len(fnd.build_vocab())+1
print(vocab_length)

21555


In [13]:
class PredictFake(torch.nn.Module):
    def __init__(self, vocab_length, embedding_dim, num_layers, num_hidden):
        super(PredictFake, self).__init__()
        self.embedding = nn.Embedding(vocab_length, embedding_dim, max_norm=True)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=num_hidden, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(72*num_hidden, 2)
        self.sig = nn.Sigmoid()
    
    def forward(self, x):
        # Performing a forward pass of our model
        batch_size = x.size(0)
        x = self.embedding(x)
        x, temp = self.lstm(x)
        x = torch.reshape(x, (x.size(0),-1,))
#         x = x.contiguous().view(-1, 256)
#         x = self.dropout(x)
#         x = self.fc(x)
#         x = self.sig(x)
#         x = x.view(batch_size, -1)
#         x = x[:, -1] 

        return F.log_softmax(x,dim=-1)

In [21]:
batch_size = 512
model = PredictFake(vocab_length, embedding_dim=128, num_layers=1, num_hidden=256)
# model.to(device)

In [22]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.01)

num_train = round(len(train_data_loader)*0.8)
num_test = round(len(train_data_loader)*0.2)
training_data, testing_data = random_split(train_data_loader, [num_train, num_test])
datasets = {"Training":training_data.dataset, "Validation":testing_data.dataset}

In [23]:
n_epochs = 10
weights_path = '/content/drive/MyDrive/fake-news/weights_path'
val_losses = [np.inf]
no_improvement = 0

for epoch in range(n_epochs):
    for dataset_type in datasets:
        if dataset_type == "Training":
            model.train(True)
        else:
            model.train(False)

        dataset = datasets[dataset_type]
        total_points = 0
        run_loss = 0
        run_accuracy = 0
        for i, sample in enumerate(dataset):
            data, labels = sample
            labels = torch.LongTensor([l for label in labels for l in label])
#             data, labels = data.to(device), labels.to(device) 
            optimizer.zero_grad()
            out = model(data)
            _, pred = torch.max(out, 1)
            n_correct = (pred == labels).sum()
            loss = criterion(out,labels)

            if dataset_type == "Training":
                loss.backward()
                optimizer.step()
            run_loss += loss.item()
            run_accuracy  += n_correct.data.item()
            total_points += len(sample[0])

        print("Epoch {}, {} Loss: {}, Accuracy: {}".format(epoch + 1, dataset_type, run_loss / i, run_accuracy / total_points * 100))
        if dataset_type == "Validation":
            val_loss = run_loss / i
            if all(val_losses < np.array(val_loss)):
#                 torch.save(model.state_dict(), weights_path)
                no_improvement = 0
            else:
                no_improvement += 1
            val_losses.append(val_loss)
            if no_improvement == 3:
                break

Epoch 1, Training Loss: 9.377237129211426, Accuracy: 36.292042657916326
Epoch 1, Validation Loss: 8.981673104422432, Accuracy: 64.61033634126333
Epoch 2, Training Loss: 8.942330850873674, Accuracy: 73.0817610062893
Epoch 2, Validation Loss: 8.924029268537248, Accuracy: 81.0117582718075
Epoch 3, Training Loss: 8.92157393864223, Accuracy: 81.0664479081214
Epoch 3, Validation Loss: 8.919704627990722, Accuracy: 81.57506152584085
Epoch 4, Training Loss: 8.918554142543249, Accuracy: 81.33442712605962
Epoch 4, Validation Loss: 8.91727019718715, Accuracy: 81.6133442712606
Epoch 5, Training Loss: 8.915604591369629, Accuracy: 81.23598578069455
Epoch 5, Validation Loss: 8.911989838736398, Accuracy: 81.59693738036641
Epoch 6, Training Loss: 8.911101613725934, Accuracy: 81.29614438063987
Epoch 6, Validation Loss: 8.910231481279645, Accuracy: 81.59146841673503
Epoch 7, Training Loss: 8.909695380074638, Accuracy: 81.23051681706318
Epoch 7, Validation Loss: 8.909160723005023, Accuracy: 81.585999453103