# 1. Setup

## 1.1. Import Libraries

In [None]:
import os
import re
from tqdm import tqdm
import numpy as np
import pandas as pd
import nltk
# Only `word_tokenize` is used in the visible notebook, and it needs just the
# Punkt sentence-tokenizer models. Downloading "all" fetches every NLTK corpus.
# "punkt_tab" is the resource name looked up by newer NLTK releases; requesting
# both keeps the notebook working across versions.
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)
import matplotlib.pyplot as plt
import torch

%matplotlib inline

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/biocreative_ppi.zip.
[nltk_data]    | Downloading package brown to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown.zip.
[nltk_data]    | Downloading package brown_tei to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown_tei.zip.
[nltk_data]    | Downloading package cess_cat to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_cat.zip.
[nltk_data]    | Downloading package cess_esp to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_esp.zip.
[nltk_data]    | Downloading package chat80 to /root/nltk_data...
[nltk_data]    |   Unzipp

## 1.3. Download fastText Word Vectors

In [None]:
%%time
# Download and unpack the pretrained fastText word vectors (crawl-300d-2M, per
# the archive name) unless a previous run already created the target directory.
URL = "https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip"
FILE = "fastText"  # despite the name, this is the target *directory*

# NOTE(review): only the directory's existence is checked, so an interrupted
# download/unzip leaves the directory behind and later runs will skip the fetch.
if os.path.isdir(FILE):
    print("fastText exists.")
else:
    # `wget -P` stores the archive inside FILE; `unzip -d` extracts it there too.
    !wget -P $FILE $URL
    !unzip $FILE/crawl-300d-2M.vec.zip -d $FILE

--2021-11-22 21:28:18--  https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.75.142, 104.22.74.142, 172.67.9.4, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.75.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1523785255 (1.4G) [application/zip]
Saving to: ‘fastText/crawl-300d-2M.vec.zip’


2021-11-22 21:29:07 (30.4 MB/s) - ‘fastText/crawl-300d-2M.vec.zip’ saved [1523785255/1523785255]

Archive:  fastText/crawl-300d-2M.vec.zip
  inflating: fastText/crawl-300d-2M.vec  
CPU times: user 1.01 s, sys: 310 ms, total: 1.32 s
Wall time: 2min 1s


## 1.4. Use GPU for Training

In [None]:
# Select the compute device once; later cells read the module-level `device`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    # Report what hardware the notebook is going to train on.
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
Device name: Tesla K80


# 2. Data Preparation

## 2.1. Tokenize and keep only the most frequent words

In [None]:
from nltk.tokenize import word_tokenize
from collections import defaultdict
from heapq import nlargest


def tokenize(texts, vocabsize = 1000):
    """Tokenize texts, keep only the most frequent tokens and build a vocabulary.

    Each text is tokenized once with `word_tokenize`. The `vocabsize` most
    frequent tokens of the whole corpus are kept; every other token is dropped
    from the tokenized texts (it is *not* replaced by `<unk>`).

    Args:
        texts (Iterable[str]): Text data, one document per element
        vocabsize (int): Maximum number of distinct corpus tokens to keep
            (the special `<pad>` and `<unk>` entries come on top of that)

    Returns:
        tokenized_texts (List[List[str]]): Per-document lists of kept tokens
        word2idx (Dict[str, int]): Vocabulary; `<pad>` is 0, `<unk>` is 1 and
            corpus tokens are numbered from 2 in order of first appearance
        max_len (int): Length of the longest filtered document (0 if empty)
    """

    # Tokenize every document exactly once; the lists are filtered in place
    # further down so the unfiltered tokens do not stay in memory.
    tokenized_texts = [word_tokenize(sent) for sent in texts]

    # Frequency of every token in the corpus
    wordcnt = defaultdict(int)
    for tokenized_sent in tokenized_texts:
        for token in tokenized_sent:
            wordcnt[token] += 1

    # A set gives O(1) membership tests in the filtering pass below
    freqwords = set(nlargest(vocabsize, wordcnt, key=wordcnt.get))

    # Reserve the two special entries, then number tokens as they first appear
    word2idx = {'<pad>': 0, '<unk>': 1}
    max_len = 0

    for i, tokenized_sent in enumerate(tokenized_texts):
        kept = [token for token in tokenized_sent if token in freqwords]
        tokenized_texts[i] = kept

        for token in kept:
            if token not in word2idx:
                word2idx[token] = len(word2idx)

        max_len = max(max_len, len(kept))

    return tokenized_texts, word2idx, max_len

def encode(tokenized_texts, word2idx, max_len):
    """Pad or truncate each sentence to `max_len` and map tokens to vocabulary ids.

    The input lists are left untouched; padding is applied to a copy.

    Args:
        tokenized_texts (List[List[str]]): Per-document token lists
        word2idx (Dict[str, int]): Vocabulary containing at least `<pad>`
        max_len (int): Target length of every encoded sentence

    Returns:
        input_ids (np.array): Array of token indexes in the vocabulary with
            shape (N, max_len). It will be the input of our CNN model.
    """

    # Tokens missing from the vocabulary fall back to the `<unk>` id instead
    # of `None`, which would silently turn the result into an object array.
    unk_id = word2idx.get('<unk>')

    input_ids = []
    for tokenized_sent in tokenized_texts:
        # Slicing copies, so the caller's list is never extended in place
        padded_sent = list(tokenized_sent[0:max_len])
        padded_sent += ['<pad>'] * (max_len - len(padded_sent))

        # Encode tokens to input_ids
        input_ids.append([word2idx.get(token, unk_id) for token in padded_sent])

    return np.array(input_ids)

## 2.2 Load Pretrained Vectors

In [None]:
from tqdm import tqdm_notebook

def load_pretrained_vectors(word2idx, fname):
    """Load pretrained vectors and build an embedding matrix for the vocabulary.

    Rows of words found in `fname` are set to their pretrained vector, the
    `<pad>` row is all zeros, and every other row keeps a random value drawn
    uniformly from [-0.25, 0.25).

    Args:
        word2idx (Dict): Vocabulary built from the corpus; must contain `<pad>`
        fname (str): Path to a text vector file whose first line is
            "<number of vectors> <dimension>" and whose remaining lines are
            "<word> <v1> <v2> ..." separated by single spaces

    Returns:
        embeddings (np.array): Embedding matrix with shape (N, d) where N is
            the size of word2idx and d is embedding dimension
    """

    print("Loading pretrained vectors...")

    # The context manager closes the (very large) file even if parsing fails
    with open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        n, d = map(int, fin.readline().split())

        # Initialize random embeddings
        embeddings = np.random.uniform(-0.25, 0.25, (len(word2idx), d))
        embeddings[word2idx['<pad>']] = np.zeros((d,))

        # Load pretrained vectors. `tqdm` is the progress bar imported at the
        # top of the notebook (the `tqdm_notebook` alias is deprecated); the
        # header's vector count lets it show real progress.
        count = 0
        for line in tqdm(fin, total=n):
            # Split off only the word; the vector is parsed for vocabulary
            # words only, which avoids splitting every line of the file.
            word, _, vector = line.rstrip().partition(' ')
            if word in word2idx:
                count += 1
                embeddings[word2idx[word]] = np.array(vector.split(' '),
                                                      dtype=np.float32)

    print(f"There are {count} / {len(word2idx)} pretrained vectors found.")

    return embeddings

## 2.3. Create PyTorch DataLoader

In [None]:
from torch.utils.data import (TensorDataset, DataLoader, RandomSampler,
                              SequentialSampler)

def data_loader(train_inputs, val_inputs, train_labels, val_labels,
                batch_size=50):
    """Convert train and validation sets to torch.Tensors and load them to
    DataLoader.

    Args:
        train_inputs, val_inputs: Array-likes of encoded inputs
        train_labels, val_labels: Array-likes of labels, aligned with the inputs
        batch_size (int): Batch size used by both loaders. Default: 50

    Returns:
        train_dataloader (DataLoader): Yields (inputs, labels) batches in
            random order
        val_dataloader (DataLoader): Yields (inputs, labels) batches in the
            original order
    """

    # Convert data type to torch.Tensor
    train_inputs, val_inputs, train_labels, val_labels =\
    tuple(torch.tensor(data) for data in
          [train_inputs, val_inputs, train_labels, val_labels])

    # Create DataLoader for training data (shuffled every epoch)
    train_data = TensorDataset(train_inputs, train_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

    # Create DataLoader for validation data (fixed order)
    val_data = TensorDataset(val_inputs, val_labels)
    val_sampler = SequentialSampler(val_data)
    val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

    return train_dataloader, val_dataloader

# 3. Model

## 3.1. Create CNN Model

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CNN_NLP(nn.Module):
    """A 1D Convolutional Neural Network for Sentence Classification."""
    def __init__(self,
                 pretrained_embedding=None,
                 freeze_embedding=False,
                 vocab_size=None,
                 embed_dim=300,
                 filter_sizes=[3, 4, 5],
                 num_filters=[100, 100, 100],
                 num_classes=2,
                 dropout=0.5):
        """
        The constructor for CNN_NLP class.

        Args:
            pretrained_embedding (torch.Tensor): Pretrained embeddings with
                shape (vocab_size, embed_dim)
            freeze_embedding (bool): Set to False to fine-tune pretrained
                vectors. Default: False
            vocab_size (int): Needs to be specified when pretrained word
                embeddings are not used.
            embed_dim (int): Dimension of word vectors. Needs to be specified
                when pretrained word embeddings are not used. Default: 300
            filter_sizes (List[int]): List of filter sizes. Default: [3, 4, 5]
            num_filters (List[int]): List of number of filters, has the same
                length as `filter_sizes`. Default: [100, 100, 100]
            num_classes (int): Number of classes. Default: 2
            dropout (float): Dropout rate. Default: 0.5

        Raises:
            ValueError: If `filter_sizes` and `num_filters` differ in length,
                or if neither `pretrained_embedding` nor `vocab_size` is given.
        """

        super(CNN_NLP, self).__init__()

        # A length mismatch would otherwise only surface as a shape error in
        # the fully-connected layer during the first forward pass.
        if len(filter_sizes) != len(num_filters):
            raise ValueError("filter_sizes and num_filters need to be of the "
                             "same length.")

        # Embedding layer
        if pretrained_embedding is not None:
            self.vocab_size, self.embed_dim = pretrained_embedding.shape
            # NOTE(review): unlike the randomly initialised branch below, no
            # padding_idx is set here, so the padding row is updated during
            # training whenever freeze_embedding is False.
            self.embedding = nn.Embedding.from_pretrained(pretrained_embedding,
                                                          freeze=freeze_embedding)
        else:
            if vocab_size is None:
                raise ValueError("vocab_size must be specified when "
                                 "pretrained_embedding is not given.")
            self.vocab_size = vocab_size
            self.embed_dim = embed_dim
            self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                          embedding_dim=self.embed_dim,
                                          padding_idx=0,
                                          max_norm=5.0)
        # Conv Network: one Conv1d per (filter size, number of filters) pair
        self.conv1d_list = nn.ModuleList([
            nn.Conv1d(in_channels=self.embed_dim,
                      out_channels=n_out,
                      kernel_size=size)
            for size, n_out in zip(filter_sizes, num_filters)
        ])
        # Fully-connected layer and Dropout. The built-in sum keeps the layer
        # size a plain Python int.
        self.fc = nn.Linear(int(sum(num_filters)), num_classes)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input_ids):
        """Perform a forward pass through the network.

        Args:
            input_ids (torch.Tensor): A tensor of token ids with shape
                (batch_size, max_sent_length). max_sent_length must be at
                least the largest filter size.

        Returns:
            logits (torch.Tensor): Output logits with shape (batch_size,
                num_classes)
        """

        # Get embeddings from `input_ids`. Output shape: (b, max_len, embed_dim)
        # `.float()` matters when the pretrained matrix was built in float64.
        x_embed = self.embedding(input_ids).float()

        # Permute `x_embed` to match input shape requirement of `nn.Conv1d`.
        # Output shape: (b, embed_dim, max_len)
        x_reshaped = x_embed.permute(0, 2, 1)

        # Apply CNN and ReLU. Output shape: (b, num_filters[i], L_out)
        x_conv_list = [F.relu(conv1d(x_reshaped)) for conv1d in self.conv1d_list]

        # Max pooling over the whole sequence. Output shape: (b, num_filters[i], 1)
        x_pool_list = [F.max_pool1d(x_conv, kernel_size=x_conv.shape[2])
            for x_conv in x_conv_list]

        # Concatenate x_pool_list to feed the fully connected layer.
        # Output shape: (b, sum(num_filters))
        x_fc = torch.cat([x_pool.squeeze(dim=2) for x_pool in x_pool_list],
                         dim=1)

        # Compute logits. Output shape: (b, num_classes)
        logits = self.fc(self.dropout(x_fc))

        return logits

## 3.2. Optimizer

In [None]:
import torch.optim as optim

def initilize_model(pretrained_embedding=None,
                    freeze_embedding=False,
                    vocab_size=None,
                    embed_dim=300,
                    filter_sizes=[3, 4, 5],
                    num_filters=[100, 100, 100],
                    num_classes=2,
                    dropout=0.5,
                    learning_rate=0.01):
    """Instantiate a CNN model and an optimizer.

    All model-related arguments are forwarded unchanged to `CNN_NLP`; the
    model is moved to the notebook-level `device`.

    Args:
        pretrained_embedding, freeze_embedding, vocab_size, embed_dim,
        filter_sizes, num_filters, num_classes, dropout: See `CNN_NLP`.
        learning_rate (float): Learning rate of the Adadelta optimizer.
            Default: 0.01

    Returns:
        cnn_model (CNN_NLP): The model, already on `device`
        optimizer (optim.Adadelta): Optimizer over the model parameters
            (rho=0.95)
    """

    assert len(filter_sizes) == len(num_filters), (
        "filter_sizes and num_filters need to be of the same length.")

    # Instantiate CNN model. `num_classes` and `dropout` are passed through
    # rather than being fixed to 2 and 0.5.
    cnn_model = CNN_NLP(pretrained_embedding=pretrained_embedding,
                        freeze_embedding=freeze_embedding,
                        vocab_size=vocab_size,
                        embed_dim=embed_dim,
                        filter_sizes=filter_sizes,
                        num_filters=num_filters,
                        num_classes=num_classes,
                        dropout=dropout)

    # Send model to `device` (GPU/CPU)
    cnn_model.to(device)

    # Instantiate Adadelta optimizer
    optimizer = optim.Adadelta(cnn_model.parameters(),
                               lr=learning_rate,
                               rho=0.95)

    return cnn_model, optimizer

## 3.3. Training Loop

In [None]:
import random
import time

# Loss function shared by train() and evaluate(): cross-entropy computed on
# the raw logits, with the default nn.CrossEntropyLoss settings.
loss_fn = nn.CrossEntropyLoss()

def set_seed(seed_value=42):
    """Seed Python's `random`, NumPy and PyTorch (CPU and all CUDA devices)
    with the same value so that runs are repeatable.
    """

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_rng in seeders:
        seed_rng(seed_value)

def train(model, optimizer, train_dataloader, val_dataloader=None, epochs=10):
    """Train the CNN model and print a per-epoch progress table.

    Uses the notebook-level `device` and `loss_fn`. When `val_dataloader` is
    given, the model is evaluated with `evaluate` after every epoch and the
    best validation accuracy is reported at the end; otherwise the validation
    columns of the table show "-".

    Args:
        model (nn.Module): Model to train; called as `model(input_ids)`
        optimizer (torch.optim.Optimizer): Optimizer over the model parameters
        train_dataloader (DataLoader): Yields (input_ids, labels) batches
        val_dataloader (DataLoader, optional): Validation batches
        epochs (int): Number of passes over `train_dataloader`. Default: 10

    Returns:
        None. Progress is printed to stdout.
    """

    has_validation = val_dataloader is not None

    # Tracking best validation accuracy
    best_accuracy = 0

    # Start training loop
    print("Start training...\n")
    # Adjacent f-strings are used for the long lines: a backslash continuation
    # inside a replacement field is not portable across Python versions.
    print(f"{'Epoch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | "
          f"{'Val Acc':^9} | {'Elapsed':^9}")
    print("-"*60)

    for epoch_i in range(epochs):
        # =======================================
        #               Training
        # =======================================

        # Tracking time and loss
        t0_epoch = time.time()
        total_loss = 0

        # Put the model into the training mode (evaluate() switches it to eval)
        model.train()

        for step, batch in enumerate(train_dataloader):
            # Load batch to GPU
            b_input_ids, b_labels = tuple(t.to(device) for t in batch)

            # Zero out any previously calculated gradients
            model.zero_grad()

            # Perform a forward pass. This will return logits.
            logits = model(b_input_ids)

            # Compute loss and accumulate the loss values
            loss = loss_fn(logits, b_labels)
            total_loss += loss.item()

            # Perform a backward pass to calculate gradients
            loss.backward()

            # Update parameters
            optimizer.step()

        # Average of the per-batch losses; guard against an empty dataloader
        avg_train_loss = total_loss / max(len(train_dataloader), 1)

        # =======================================
        #               Evaluation
        # =======================================
        if has_validation:
            # After the completion of each training epoch, measure the model's
            # performance on our validation set.
            val_loss, val_accuracy = evaluate(model, val_dataloader)

            # Track the best accuracy
            if val_accuracy > best_accuracy:
                best_accuracy = val_accuracy

            time_elapsed = time.time() - t0_epoch
            print(f"{epoch_i + 1:^7} | {avg_train_loss:^12.6f} | "
                  f"{val_loss:^10.6f} | {val_accuracy:^9.2f} | "
                  f"{time_elapsed:^9.2f}")
        else:
            # Still report training progress when there is nothing to validate on
            time_elapsed = time.time() - t0_epoch
            print(f"{epoch_i + 1:^7} | {avg_train_loss:^12.6f} | "
                  f"{'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

    print("\n")
    if has_validation:
        print(f"Training complete! Best accuracy: {best_accuracy:.2f}%.")
    else:
        print("Training complete!")

def evaluate(model, val_dataloader):
    """After the completion of each training epoch, measure the model's
    performance on our validation set.

    Loss and accuracy are averaged per *sample*, so a smaller final batch
    does not get the same weight as a full one. Uses the notebook-level
    `device` and `loss_fn`.

    Args:
        model (nn.Module): Model to evaluate; called as `model(input_ids)`
        val_dataloader (DataLoader): Yields (input_ids, labels) batches

    Returns:
        val_loss (float): Mean loss over all validation samples
        val_accuracy (float): Percentage of correctly classified samples
        Both are NaN when the dataloader yields no samples.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled
    # during the test time.
    model.eval()

    # Running totals over the whole validation set
    total_loss = 0.0
    total_correct = 0
    total_samples = 0

    # For each batch in our validation set...
    for batch in val_dataloader:
        # Load batch to GPU
        b_input_ids, b_labels = tuple(t.to(device) for t in batch)

        # Compute logits and loss without tracking gradients
        with torch.no_grad():
            logits = model(b_input_ids)
            loss = loss_fn(logits, b_labels)

        n_batch = b_labels.size(0)

        # `loss_fn` returns the batch mean (nn.CrossEntropyLoss default), so
        # it is weighted by the batch size before being accumulated.
        total_loss += loss.item() * n_batch

        # Get the predictions and count the correct ones
        preds = torch.argmax(logits, dim=1).flatten()
        total_correct += (preds == b_labels).sum().item()
        total_samples += n_batch

    # Nothing to average over
    if total_samples == 0:
        return float('nan'), float('nan')

    val_loss = total_loss / total_samples
    val_accuracy = total_correct / total_samples * 100

    return val_loss, val_accuracy

# 4. Evaluation

## 4.1. Load Dataset

In [None]:
# Install the `download` helper package used below to fetch the data files.
# NOTE(review): the version is not pinned; `%pip install download==<version>`
# would make the notebook reproducible.
!pip install download

from download import download

# Fetch the pickled training set and the four test sets (K1, K2, K3 and
# test_L) from Google Drive into the working directory. `replace=True` is
# passed on every call, so presumably an existing local copy is overwritten
# on each run — confirm against the `download` package documentation.
download("https://drive.google.com/file/d/1Xvq098S6XKObhDX1IzeZJIqdKKQ5Ey7a/view?usp=sharing", "./train_document.p", replace=True)
download("https://drive.google.com/file/d/1I_fuHBbaTM138_-pdZEzkmSK839jrsKh/view?usp=sharing", "./test_K1_document.p", replace=True)
download("https://drive.google.com/file/d/1hluGv3r6sd4kXVjRevzpb-QKdXmIyQNL/view?usp=sharing", "./test_K2_document.p", replace=True)
download("https://drive.google.com/file/d/1xesLnz4eEuXQUJ8UNeqni6-MES9AFt1p/view?usp=sharing", "./test_K3_document.p", replace=True)
download("https://drive.google.com/file/d/1LpbXtewYPnB5nizn13Q7q_xyNiqK_YUV/view?usp=sharing", "./test_L_document.p", replace=True)

Collecting download
  Downloading download-0.3.5-py3-none-any.whl (8.8 kB)
Installing collected packages: download
Successfully installed download-0.3.5
Downloading data from https://doc-08-1g-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/mk9epvbm1nn9b4e17gbo0ke4l04hsum8/1637616675000/08529154005167969207/*/1Xvq098S6XKObhDX1IzeZJIqdKKQ5Ey7a?e=download (60.6 MB)

file_sizes: 100%|███████████████████████████| 63.5M/63.5M [00:00<00:00, 219MB/s]
Successfully downloaded file to ./train_document.p
Downloading data from https://doc-0g-1g-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/mj3qhbenqf16akpospp3sj6kdbivchkf/1637616675000/08529154005167969207/*/1I_fuHBbaTM138_-pdZEzkmSK839jrsKh?e=download (40.7 MB)

file_sizes: 100%|███████████████████████████| 42.7M/42.7M [00:00<00:00, 195MB/s]
Successfully downloaded file to ./test_K1_document.p
Downloading data from https://doc-0g-1g-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffk

'./test_L_document.p'

In [None]:
import pickle

def _load_xy(path):
    """Read the two objects pickled back-to-back in `path` and return them
    in file order (the first is bound to an X_* name below, the second to
    the matching y_* name).
    """
    # NOTE(review): pickle.load can execute arbitrary code stored in the file;
    # only load files from a source you trust.
    with open(path, 'rb') as file:
        first = pickle.load(file)
        second = pickle.load(file)
    return first, second

# One (X, y) pair per downloaded file
X_train, y_train = _load_xy('train_document.p')
X_test_K1, y_test_K1 = _load_xy('test_K1_document.p')
X_test_K2, y_test_K2 = _load_xy('test_K2_document.p')
X_test_K3, y_test_K3 = _load_xy('test_K3_document.p')
X_test_L, y_test_L = _load_xy('test_L_document.p')

## 4.2. CNN model training and testing

In [None]:
def CNN_fakenewsdetect(X_train, y_train, X_test, y_test, max_vocabsize = 1000, max_article_len = 1000, epochs = 10,
                       n_train_per_end = 3000, batch_size = 50):
    """Train the CNN on a sub-sample of the training set and validate on a test set.

    The first and last `n_train_per_end` training rows are used for training
    (the whole training set when it has at most 2 * n_train_per_end rows) and
    all of `X_test` / `y_test` for validation. Progress and the best
    validation accuracy are printed by `train`; nothing is returned.

    Args:
        X_train, y_train: Training texts and labels; pandas objects, since
            they are passed to `pd.concat`
        X_test, y_test: Validation texts and labels, same types
        max_vocabsize (int): Vocabulary size passed to `tokenize`
        max_article_len (int): Upper bound on the encoded article length
        epochs (int): Number of training epochs
        n_train_per_end (int): Rows taken from each end of the training set.
            Default: 3000
        batch_size (int): Batch size of both data loaders. Default: 50
    """

    # Due to memory problem, use only a sub-sample of the training data.
    # The head/tail slices are only taken when they cannot overlap, otherwise
    # rows would be duplicated and the train/validation boundary would be wrong.
    if len(X_train) > 2 * n_train_per_end:
        X_sub = pd.concat([X_train.iloc[:n_train_per_end], X_train.iloc[-n_train_per_end:]])
        y_sub = pd.concat([y_train.iloc[:n_train_per_end], y_train.iloc[-n_train_per_end:]])
    else:
        X_sub, y_sub = X_train, y_train
    n_train = len(X_sub)

    # Texts stay a plain list: a NumPy string array would reserve the length
    # of the longest article for every single row.
    texts = pd.concat([X_sub, X_test]).tolist()
    labels = np.asarray(pd.concat([y_sub, y_test]).tolist())

    # Tokenize, build vocabulary, encode tokens
    # NOTE(review): the vocabulary is built from training and validation texts together.
    tokenized_texts, word2idx, max_len = tokenize(texts, vocabsize=max_vocabsize)
    input_ids = encode(tokenized_texts, word2idx, min(max_len, max_article_len))
    train_inputs = input_ids[:n_train]
    val_inputs = input_ids[n_train:]
    train_labels = labels[:n_train]
    val_labels = labels[n_train:]

    # Load pretrained vectors
    embeddings = load_pretrained_vectors(word2idx, "fastText/crawl-300d-2M.vec")
    embeddings = torch.tensor(embeddings)

    # Load data to PyTorch DataLoader
    train_dataloader, val_dataloader = data_loader(train_inputs, val_inputs, train_labels, val_labels, batch_size=batch_size)

    # CNN-non-static: fastText pretrained word vectors are fine-tuned during training.
    set_seed(42)
    cnn_non_static, optimizer = initilize_model(pretrained_embedding=embeddings,
                                                freeze_embedding=False,
                                                learning_rate=0.25,
                                                dropout=0.5)
    train(cnn_non_static, optimizer, train_dataloader, val_dataloader, epochs=epochs)

## 4.2.1. Testing K1

In [None]:
# Testing K1: train on the sub-sampled training set and validate on the K1
# test set with the default settings (vocabulary 1000, article length 1000,
# 10 epochs).

CNN_fakenewsdetect(X_train, y_train, X_test_K1, y_test_K1)

Loading pretrained vectors...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


0it [00:00, ?it/s]

There are 999 / 1002 pretrained vectors found.
Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   0.221814   |  0.446225  |   77.99   |   38.35  
   2    |   0.067690   |  0.369678  |   82.51   |   38.52  
   3    |   0.045150   |  0.446891  |   81.49   |   39.07  
   4    |   0.031362   |  0.376520  |   83.81   |   39.19  
   5    |   0.024282   |  0.356239  |   84.82   |   39.20  
   6    |   0.019236   |  0.388927  |   84.28   |   39.25  
   7    |   0.015655   |  0.358088  |   85.31   |   39.27  
   8    |   0.013193   |  0.384770  |   84.80   |   39.26  
   9    |   0.012064   |  0.404287  |   84.57   |   39.30  
  10    |   0.008858   |  0.383208  |   85.01   |   39.29  


Training complete! Best accuracy: 85.31%.


In [None]:
# Testing K1 again with a smaller vocabulary (500), shorter articles (at most
# 500 tokens) and more epochs (25).

CNN_fakenewsdetect(X_train, y_train, X_test_K1, y_test_K1, max_vocabsize=500, max_article_len=500, epochs=25)

Loading pretrained vectors...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


0it [00:00, ?it/s]

There are 500 / 502 pretrained vectors found.
Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   0.220724   |  0.417711  |   79.23   |   19.16  
   2    |   0.074402   |  0.351715  |   83.31   |   19.06  
   3    |   0.050919   |  0.421421  |   82.22   |   19.22  
   4    |   0.036007   |  0.361290  |   84.29   |   19.30  
   5    |   0.027641   |  0.333834  |   85.48   |   19.30  
   6    |   0.022959   |  0.359275  |   84.89   |   19.34  
   7    |   0.018553   |  0.339887  |   85.87   |   19.31  
   8    |   0.016309   |  0.346804  |   85.59   |   19.31  
   9    |   0.014359   |  0.379438  |   84.91   |   19.33  
  10    |   0.011175   |  0.378057  |   84.97   |   19.32  
  11    |   0.008436   |  0.373130  |   85.42   |   19.32  
  12    |   0.007374   |  0.399013  |   85.14   |   19.31  
  13    |   0.007307   |  0.370536  |   85.86   |   19.33  
  14    |   0.006323   |  0.359632

In [None]:
# Testing K2: same training data, validated on the K2 test set with the
# default settings.

CNN_fakenewsdetect(X_train, y_train, X_test_K2, y_test_K2)

Loading pretrained vectors...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


0it [00:00, ?it/s]

There are 999 / 1002 pretrained vectors found.
Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   0.220357   |  0.745044  |   55.32   |   29.33  
   2    |   0.065487   |  0.799149  |   52.70   |   29.57  
   3    |   0.043480   |  0.876853  |   49.68   |   29.58  
   4    |   0.030838   |  0.889162  |   50.00   |   29.57  
   5    |   0.022925   |  0.876595  |   52.06   |   29.56  
   6    |   0.018377   |  0.919087  |   50.32   |   29.56  
   7    |   0.014989   |  0.878648  |   53.23   |   29.54  
   8    |   0.012453   |  0.926552  |   50.95   |   29.55  
   9    |   0.011870   |  0.933566  |   50.60   |   29.56  
  10    |   0.008532   |  0.918654  |   52.49   |   29.52  


Training complete! Best accuracy: 55.32%.


In [None]:
# Testing K2 again with articles capped at 500 tokens and 50 epochs
# (vocabulary size stays at 1000).

CNN_fakenewsdetect(X_train, y_train, X_test_K2, y_test_K2, max_vocabsize=1000, max_article_len=500, epochs=50)

Loading pretrained vectors...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


0it [00:00, ?it/s]

There are 999 / 1002 pretrained vectors found.
Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   0.220629   |  0.745173  |   55.64   |   14.28  
   2    |   0.065758   |  0.796225  |   53.33   |   14.14  
   3    |   0.043403   |  0.872270  |   50.19   |   14.24  
   4    |   0.030930   |  0.882015  |   50.65   |   14.26  
   5    |   0.023153   |  0.872841  |   52.45   |   14.31  
   6    |   0.018764   |  0.912180  |   50.96   |   14.28  
   7    |   0.014946   |  0.878401  |   53.45   |   14.35  
   8    |   0.012326   |  0.918107  |   51.65   |   14.31  
   9    |   0.011989   |  0.925251  |   51.34   |   14.30  
  10    |   0.008697   |  0.910709  |   52.89   |   14.32  
  11    |   0.006631   |  0.938197  |   51.78   |   14.33  
  12    |   0.005381   |  0.962991  |   51.09   |   14.34  
  13    |   0.005638   |  0.974268  |   51.03   |   14.34  
  14    |   0.004713   |  0.96758

In [None]:
# Testing K3: same training data, validated on the K3 test set with the
# default settings.

CNN_fakenewsdetect(X_train, y_train, X_test_K3, y_test_K3)

Loading pretrained vectors...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


0it [00:00, ?it/s]

There are 999 / 1002 pretrained vectors found.
Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   0.218497   |  0.709610  |   64.58   |   21.34  
   2    |   0.067251   |  0.678672  |   65.05   |   21.29  
   3    |   0.044830   |  0.747787  |   70.88   |   21.42  
   4    |   0.031591   |  0.716042  |   71.24   |   21.51  
   5    |   0.023531   |  0.718747  |   71.27   |   21.52  
   6    |   0.019732   |  0.757885  |   72.30   |   21.53  
   7    |   0.016078   |  0.722248  |   71.81   |   21.53  
   8    |   0.013069   |  0.789006  |   72.35   |   21.54  
   9    |   0.012481   |  0.828297  |   72.28   |   21.56  
  10    |   0.008636   |  0.789201  |   72.85   |   21.55  


Training complete! Best accuracy: 72.85%.


In [None]:
# Testing Liar: same training data, validated on the set loaded from
# test_L_document.p with the default settings.

CNN_fakenewsdetect(X_train, y_train, X_test_L, y_test_L)

Loading pretrained vectors...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


0it [00:00, ?it/s]

There are 999 / 1002 pretrained vectors found.
Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   0.220088   |  0.740681  |   56.10   |   26.24  
   2    |   0.065184   |  0.787031  |   53.23   |   26.30  
   3    |   0.043259   |  0.859614  |   50.28   |   26.45  
   4    |   0.030440   |  0.869926  |   50.35   |   26.51  
   5    |   0.022202   |  0.861321  |   52.46   |   26.52  
   6    |   0.018208   |  0.896044  |   50.94   |   26.55  
   7    |   0.014916   |  0.862739  |   54.03   |   26.55  
   8    |   0.012107   |  0.905739  |   51.26   |   26.54  
   9    |   0.011611   |  0.914670  |   50.80   |   26.54  
  10    |   0.008496   |  0.895792  |   52.87   |   26.53  


Training complete! Best accuracy: 56.10%.


In [None]:
# CNN-non-static: fastText pretrained word vectors are fine-tuned during training.
# NOTE(review): `embeddings`, `train_dataloader` and `val_dataloader` are not
# defined at top level anywhere in the visible notebook (they only exist as
# locals inside CNN_fakenewsdetect), so this cell appears to rely on state
# from an earlier session and would raise NameError on Restart & Run All.
# Confirm, then either remove the cell or build its inputs explicitly.
set_seed(42)
cnn_non_static, optimizer = initilize_model(pretrained_embedding=embeddings,
                                            freeze_embedding=False,
                                            learning_rate=0.25,
                                            dropout=0.5)
train(cnn_non_static, optimizer, train_dataloader, val_dataloader, epochs=10)

Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   0.218710   |  0.745602  |   55.72   |  508.52  
   2    |   0.063473   |  0.795428  |   53.65   |  506.55  
   3    |   0.042206   |  0.845982  |   51.48   |  504.24  
   4    |   0.030440   |  0.864763  |   51.60   |  503.73  
   5    |   0.023128   |  0.865931  |   52.32   |  503.37  
   6    |   0.017305   |  0.953073  |   48.60   |  504.39  
   7    |   0.016461   |  0.968678  |   48.60   |  503.79  
   8    |   0.013191   |  0.915802  |   51.84   |  503.91  
   9    |   0.010209   |  0.930857  |   51.55   |  503.53  
  10    |   0.007989   |  0.955443  |   50.87   |  503.22  


Training complete! Best accuracy: 55.72%.
