In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import torch 
import torch.nn as nn
import torch.optim as optim

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence
from torch.utils.data import DataLoader

from tqdm import tqdm

In [12]:
(test_ratio, valid_ratio) = (0.80, 0.80)
device = torch.device("cpu")

In [5]:
df_raw = pd.read_csv("../common-data/fake-news/news.csv")

df_raw["label"] = (df_raw["label"] == "FAKE").astype("int")
df_raw["titletext"] = df_raw["title"] + ". " + df_raw["text"]
df_raw = df_raw.reindex(columns=["label", "title", "text", "titletext"])

df_raw.drop(df_raw[df_raw.text.str.len() < 5].index, inplace=True)

def trim_string(x):
  x = x.split(maxsplit=200)
  x = " ".join(x[:200])
  return x

df_raw["text"] = df_raw["text"].apply(trim_string)
df_raw["titletext"] = df_raw["titletext"].apply(trim_string)

df_real = df_raw[df_raw["label"] == 0]
df_fake = df_raw[df_raw["label"] == 1]

df_real_trainvalid, df_real_test = train_test_split(df_real, train_size=test_ratio, random_state=1)
df_fake_trainvalid, df_fake_test = train_test_split(df_fake, train_size=test_ratio, random_state=1)

df_real_train, df_real_valid = train_test_split(df_real_trainvalid, train_size=valid_ratio, random_state=1)
df_fake_train, df_fake_valid = train_test_split(df_fake_trainvalid, train_size=valid_ratio, random_state=1)

df_train = pd.concat([df_real_train, df_fake_train], ignore_index=True, sort=False)
df_valid = pd.concat([df_real_valid, df_fake_valid], ignore_index=True, sort=False)
df_test = pd.concat([df_real_test, df_fake_test], ignore_index=True, sort=False)

df_train.to_csv("../common-data/fake-news/news_train.csv", index=False)
df_valid.to_csv("../common-data/fake-news/news_valid.csv", index=False)
df_test.to_csv("../common-data/fake-news/news_test.csv", index=False)

In [6]:
df_train = pd.read_csv("../common-data/fake-news/news_train.csv")
df_valid = pd.read_csv("../common-data/fake-news/news_valid.csv")
df_test = pd.read_csv("../common-data/fake-news/news_test.csv")


In [7]:
tokenizer = get_tokenizer("basic_english")

def yield_tokens(df_data):
  for text in tqdm(df_data["titletext"]):
    yield tokenizer(text)

vocab = build_vocab_from_iterator(yield_tokens(df_train), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])
len(vocab)

100%|██████████| 4029/4029 [00:01<00:00, 3623.11it/s]


39226

In [8]:
def text_pipeline(text):
  return vocab(tokenizer(text))

In [8]:
df_test = df_train.copy()
# df_test = df_test[["label", "titletext"]]
# df_test["titletext"] = df_test["titletext"].apply(text_pipeline).apply(torch.tensor)
# df_test["label"] = df_test["label"].apply(torch.tensor)
df_test["titletext_len"] = df_test["titletext"].apply(text_pipeline).apply(len)
df_test["titletext_len"].max()
# list(df_test.to_records(index=False))

487

In [9]:
def collate_batch(batch):
  label_list, titletext_list, titletext_len_list = [], [], []

  for _label, _titletext in batch:
    label_list.append(_label)
    _titletext = text_pipeline(_titletext)
    _titletext = torch.tensor(_titletext, dtype=torch.int64)
    titletext_list.append(_titletext)
    titletext_len_list.append(len(_titletext))

  label_list = torch.tensor(label_list, dtype=torch.float32)
  titletext_list = pad_sequence(
      titletext_list, batch_first=True)
  titletext_len_list = torch.tensor(titletext_len_list, dtype=torch.int64)

  return label_list.to(device), titletext_list.to(device), titletext_len_list.to(device)


In [10]:
def df_to_dataset(df):
  df = df.copy()
  df = df[["label", "titletext"]]

  return list(df.to_records(index=False))

In [11]:
BS = 32

train_iter = DataLoader(df_to_dataset(
    df_train), batch_size=BS, shuffle=True, collate_fn=collate_batch)
valid_iter = DataLoader(df_to_dataset(
    df_valid), batch_size=BS, shuffle=True, collate_fn=collate_batch)
test_iter = DataLoader(df_to_dataset(
    df_test), batch_size=BS, shuffle=True, collate_fn=collate_batch)

for labels, titletext, titletext_len in train_iter:
  print(labels, titletext, titletext_len); assert False

AssertionError: Torch not compiled with CUDA enabled

In [12]:
class LSTM(nn.Module): 
  def __init__(self, vocab, hidden_dim=128, embed_dim=300):
    super(LSTM, self).__init__()

    self.hidden_dim = hidden_dim
    
    self.embedding = nn.Embedding(len(vocab), embed_dim)
    self.lstm = nn.LSTM(
      input_size=embed_dim,
      hidden_size=hidden_dim,
      num_layers=1,
      batch_first=True,
      bidirectional=True
    )
    self.drop = nn.Dropout(p=0.5)
    self.fc = nn.Linear(2 * hidden_dim, 1)
  
  def forward(self, text, text_len):
    # print(text.size())
    # print(text_len.size())
    text_emb = self.embedding(text)

    # print(text_emb.size())
    packed_input = pack_padded_sequence(text_emb, text_len, batch_first=True, enforce_sorted=False)
    packed_output, _ = self.lstm(packed_input)
    output, _ = pad_packed_sequence(packed_output, batch_first=True)
    # print(output.size(), "sneed")

    out_forward = output[range(len(output)), text_len - 1, :self.hidden_dim]
    # print(out_forward.size())
    out_reverse = output[:, 0, self.hidden_dim:]
    # print(out_reverse.size())
    out_reduced = torch.cat((out_forward, out_reverse), 1)
    # print(out_reduced.size()); assert False

    text_feats = self.drop(out_reduced)
    text_feats = self.fc(text_feats)
    text_feats = torch.squeeze(text_feats, 1)
    
    preds = torch.sigmoid(text_feats)
    
    return preds

In [17]:
def train(
    model,
    optimizer,
    criterion=nn.BCELoss(),
    train_loader=train_iter,
    valid_loader=valid_iter,
    num_epochs=5,
    eval_every=5,
):
  run_loss = 0.0
  valid_run_loss = 0.0
  global_step = 0
  train_loss_list = []
  valid_loss_list = []
  global_step_list = []
  ave_train_loss = None
  ave_valid_loss = None

  model.train()
  for epoch in range(num_epochs):
    with tqdm(train_loader, unit="batch") as tbatch:
      for labels, titletext, titletext_len in tbatch:
        tbatch.set_description(f"Epoch {epoch}")

        output = model(titletext, titletext_len)
        loss = criterion(output, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        run_loss += loss.item()
        global_step += 1

        tbatch.set_postfix(step=f"{ global_step }/{(num_epochs * len(train_loader)) }",
                           train_loss=ave_train_loss,
                           valid_loss=ave_valid_loss)

        if global_step % eval_every == 0:
          model.eval()
          with torch.no_grad():
            for labels, titletext, titletext_len in valid_loader:
              labels = labels.to(device)
              titletext = titletext.to(device)
              titletext_len = titletext_len.to(device)

              output = model(titletext, titletext_len)
              loss = criterion(output, labels)

              valid_run_loss += loss.item()

          ave_train_loss = run_loss / eval_every
          ave_valid_loss = run_loss / len(valid_loader)
          train_loss_list.append(ave_train_loss)
          valid_loss_list.append(ave_valid_loss)
          global_step_list.append(global_step)

          run_loss, valid_run_loss = 0.0, 0.0

          model.train()
          tbatch.set_postfix(step=f"{ global_step }/{(num_epochs * len(train_loader)) }",
                             train_loss=loss.item(),
                             valid_loss=ave_valid_loss)

  print("Finished training")


model = LSTM(vocab=vocab).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

train(model=model, optimizer=optimizer, num_epochs=10)


Epoch 0:   5%|▍         | 6/126 [01:24<28:11, 14.10s/batch, step=6/1260, train_loss=0.681, valid_loss=0.106]

In [None]:
state_dict = {
  "model_state_dict": model.state_dict(),
  "optimizer_state_dict": optimizer.state_dict(),
}

torch.save(state_dict, "../common-data/fake-news/")