In [1]:
import numpy as np
import pandas as pd
import random
import re
import nltk
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset, random_split
import torch.optim as optim
from nltk.corpus import stopwords
from collections import Counter
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.metrics import precision_score, recall_score, f1_score
import random
import pprint
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
random.seed(184)

In [2]:
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
peek = 20
def present_list_like(name, list_like, peek=peek):
    print(f"{name} peek:")
    print('  ' + '\n  '.join(
        str(v) for v in random.choices(list_like, k=min(peek, len(list_like)))
    ))

In [4]:
columns = [
    'id', 'label', 'claim', 'subject', 'speaker', 'speaker_job_title', 'state_info',
    'party_affiliation', 'barely_true_counts', 'false_counts',
    'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts', 'context'
]
present_list_like(f"Dataset columns({len(columns)} in total)", columns, len(columns))
def load_data(split):
    df = pd.read_csv(f"./data/{split}.tsv", sep='\t', names=columns).dropna()
    print("The training dataset:")
    df.info()
    print("\nData peek:")
    print(df.head(peek))
    print()
    return df


Dataset columns(14 in total) peek:
  subject
  subject
  context
  false_counts
  speaker
  mostly_true_counts
  pants_on_fire_counts
  context
  barely_true_counts
  pants_on_fire_counts
  mostly_true_counts
  barely_true_counts
  id
  party_affiliation


In [5]:
pad_tkn = "<PAD>"

In [6]:
def tokenize_text(input_text, known_vector_size=None, token_to_idx={}):
    def preprocess_text(text)->str:
        #Letter-level cleaning
        text = text.lower()
        valid_asciis = {9, *range(32, 127)}
        text = ''.join(filter(lambda x: ord(x) in valid_asciis, text))

        #Word/sequence-level cleaning
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'http\S+', '', text)
        stop_words = set(stopwords.words('english'))
        text = ' '.join(word for word in text.split() if word not in stop_words)
        return text

    #Preprocess the text
    for i in range(len(input_text)):
        input_text[i] = preprocess_text(input_text[i])


    #Tokenize
    final_tokens = input_tokens = [nltk.word_tokenize(text) for text in input_text]
    total_tokens = sum(len(tkns) for tkns in final_tokens)

    # Make all token sets the same length
    forced_tkn_set_size = (
        known_vector_size if known_vector_size
        else int(np.percentile([len(tkns) for tkns in final_tokens], 80))
    )
    final_tokens = [
        tkns[:forced_tkn_set_size] + [pad_tkn]*(forced_tkn_set_size - len(tkns))
        for tkns in final_tokens
    ]

    # Present results
    present_list_like(f"Tokenized sentences({len(final_tokens)} sentences, {total_tokens} total tokens)", final_tokens)


    #Index the tokens
    # Map each token to its frequency in the dataset
    if not len(token_to_idx):
        flat_tokens = [word for token_set in final_tokens for word in token_set]
        frequencies = Counter(flat_tokens)
        token_to_idx = {}
        for idx, (word, _) in enumerate(frequencies.most_common()):
            if idx >= 10000:
                break
            token_to_idx[word] = idx + 1
        if pad_tkn not in token_to_idx:
            token_to_idx[pad_tkn] = len(token_to_idx) + 1
    vocab_size = len(token_to_idx)
    print()
    print(vocab_size, "unique tokens")
    present_list_like("Unique tokens", list(token_to_idx.keys()))

    # Index the tokens
    freq_indexed = [
        [(token_to_idx[token] if token in token_to_idx else 0) for token in token_set]
        for token_set in final_tokens
    ]

    # Present results
    present_list_like(f"\nFinal Index Sets(Set_Size = {forced_tkn_set_size}, {len(freq_indexed)} index sets)", freq_indexed)

    return freq_indexed, token_to_idx

In [7]:
def get_freq_indexed_and_labels(split, known_vector_size=None, token_to_idx={}):
    df = load_data(split)
    input_text = df["claim"].to_numpy()
    input_labels = df["label"].to_numpy()
    freq_indexed, token_to_idx = tokenize_text(input_text, known_vector_size, token_to_idx)

    return freq_indexed, token_to_idx, input_labels

##Turn the data into tensors

In [8]:
def as_tensors(split, label_encoder=None, known_vector_size=None, token_to_idx={}):
    freq_indexed, token_to_idx, input_labels = get_freq_indexed_and_labels(split, known_vector_size, token_to_idx)
    X = torch.tensor(freq_indexed, dtype=torch.long)
    label_encoder_existed = (type(label_encoder) != type(None))
    label_encoder = (LabelEncoder() if not label_encoder_existed else label_encoder)
    y = (
        label_encoder.fit_transform(input_labels) if not label_encoder_existed
        else label_encoder.transform(input_labels)
    )
    y = torch.tensor(y, dtype=torch.long)
    print(f"{split.upper()} SPLIT:", X.size(0), "overall samples:", X.shape)

    return X, token_to_idx, label_encoder, input_labels, y

In [9]:
X_train, token_to_idx, label_encoder, train_input_labels, y_train = as_tensors("train")
train_vocab_size = len(token_to_idx)
input_vector_size = X_train.shape[1]
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

X_test, token_to_idx_test, label_encoder_test, test_input_labels, y_test = as_tensors("test")
test_vocab_size = len(token_to_idx_test)
input_vector_size_test = X_test.shape[1]
test_dataset = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True)

X_valid, token_to_idx_valid, label_encoder_valid, valid_input_labels, y_valid = as_tensors("valid")
valid_vocab_size = len(token_to_idx_valid)
input_vector_size_valid = X_valid.shape[1]
valid_dataset = TensorDataset(X_valid, y_valid)
valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=True)

The training dataset:
<class 'pandas.core.frame.DataFrame'>
Index: 6721 entries, 0 to 10239
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    6721 non-null   object 
 1   label                 6721 non-null   object 
 2   claim                 6721 non-null   object 
 3   subject               6721 non-null   object 
 4   speaker               6721 non-null   object 
 5   speaker_job_title     6721 non-null   object 
 6   state_info            6721 non-null   object 
 7   party_affiliation     6721 non-null   object 
 8   barely_true_counts    6721 non-null   float64
 9   false_counts          6721 non-null   float64
 10  half_true_counts      6721 non-null   float64
 11  mostly_true_counts    6721 non-null   float64
 12  pants_on_fire_counts  6721 non-null   float64
 13  context               6721 non-null   object 
dtypes: float64(5), object(9)
memory usage: 787.6+ KB

Data

In [10]:
unique_labels = np.unique(train_input_labels)
print(len(unique_labels), "labels")
train_label_counts = pd.DataFrame({"label": unique_labels})["label"].value_counts(normalize=True)
print(train_label_counts)

6 labels
label
barely-true    0.166667
false          0.166667
half-true      0.166667
mostly-true    0.166667
pants-fire     0.166667
true           0.166667
Name: proportion, dtype: float64


In [11]:
class MisInformationDetectionLSTM(nn.Module):

  def __init__(self, input_size, hidden_size, output_size, num_layers, dropout):

    super(MisInformationDetectionLSTM, self).__init__()
    self.embedding = nn.Embedding(vocab_size+1, embed_size, padding_idx=0)
    self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first= True, dropout = dropout)
    self.fc = nn.Linear(hidden_size, output_size)
    self.softmax = nn.Softmax(dim = 1)

  def forward(self, x):
    embedded = self.embedding(x)
    _, (hidden, _) = self.lstm(embedded)
    output = self.fc(hidden[-1])
    return self.softmax(output)

vocab_size = len(token_to_idx)
embed_size = 256
input_size = embed_size
print(vocab_size)

hidden_size = 256
output_size = len(unique_labels)
num_layers = 3
dropout = 0.2
print(output_size)

model = MisInformationDetectionLSTM(input_size, hidden_size, output_size, num_layers, dropout)

'''
counter_data = Counter(input_label)
class_counts = list(counter_data.values())
print(class_counts)


class_counts = train_label_counts
class_weights = 1./ torch.tensor(class_counts, dtype = torch.float)
print(class_weights)
#criterion = nn.CrossEntropyLoss(weight = class_weights)

criterion = nn.CrossEntropyLoss(weight = torch.tensor([1.5, 1.2,1.0]))'''
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 0.0001)

def train_model(model, train_loader, valid_loader, criterion, optimizer, epochs = 10):

  for epoch in range(epochs):

    model.train()
    total_loss = 0

    for inputs, labels in train_loader:

      optimizer.zero_grad()
      outputs = model(inputs)
      loss = criterion(outputs, labels)
      loss.backward()
      optimizer.step()
      total_loss+=loss.item()

    print(f'Epoch {epoch+1}, Training loss : {total_loss/ len(train_loader):.4f}')

    model.eval()

    correct, total = 0,0

    with torch.no_grad():

      for inputs, labels in valid_loader:

        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct+= (predicted == labels).sum().item()
    print(f"Validation Accuracy: {correct / total* 100:.2f}%")

train_model(model, train_loader, valid_loader, criterion, optimizer, epochs = 30)



from sklearn.metrics import classification_report, confusion_matrix

def evaluate_model(model, val_loader):
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for inputs, labels in val_loader:
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            y_true.extend(labels.tolist())
            y_pred.extend(predicted.tolist())

    print(classification_report(y_true, y_pred, target_names = unique_labels))
    print(confusion_matrix(y_true, y_pred))

evaluate_model(model, valid_loader)

10000
6
Epoch 1, Training loss : 1.7755
Validation Accuracy: 21.72%
Epoch 2, Training loss : 1.7669
Validation Accuracy: 22.88%
Epoch 3, Training loss : 1.7638
Validation Accuracy: 21.84%
Epoch 4, Training loss : 1.7560
Validation Accuracy: 21.25%
Epoch 5, Training loss : 1.7372
Validation Accuracy: 21.02%
Epoch 6, Training loss : 1.7149
Validation Accuracy: 22.07%
Epoch 7, Training loss : 1.6840
Validation Accuracy: 20.09%
Epoch 8, Training loss : 1.6563
Validation Accuracy: 21.95%
Epoch 9, Training loss : 1.6289
Validation Accuracy: 19.74%
Epoch 10, Training loss : 1.6055
Validation Accuracy: 21.95%
Epoch 11, Training loss : 1.5855
Validation Accuracy: 20.79%
Epoch 12, Training loss : 1.5606
Validation Accuracy: 20.79%
Epoch 13, Training loss : 1.5493
Validation Accuracy: 19.74%
Epoch 14, Training loss : 1.5455
Validation Accuracy: 19.74%
Epoch 15, Training loss : 1.5293
Validation Accuracy: 19.86%
Epoch 16, Training loss : 1.5150
Validation Accuracy: 18.82%
Epoch 17, Training loss :

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
class MisInformationDetectionBiLSTM(nn.Module):

  def __init__(self, input_size, hidden_size, output_size, num_layers = 1, dropout = 0.2):

    super(MisInformationDetectionBiLSTM, self).__init__()
    self.embedding = nn.Embedding(vocab_size+1, embed_size, padding_idx=0)
    self.bilstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first= True, dropout = dropout, bidirectional = True)
    self.fc = nn.Linear(hidden_size *2, output_size)
    self.softmax = nn.Softmax(dim = 1)

  def forward(self, x):
    embedded = self.embedding(x)
    _, (hidden, _) = self.bilstm(embedded)
    hidden = torch.cat((hidden[-2], hidden[-1]), dim = 1)
    output = self.fc(hidden)
    return self.softmax(output)

model_bilstm = MisInformationDetectionBiLSTM(input_size, hidden_size, output_size, num_layers, dropout)

train_model(model_bilstm, train_loader, valid_loader, criterion, optimizer, epochs = 30)

evaluate_model(model_bilstm, valid_loader)





Epoch 1, Training loss : 1.7920
Validation Accuracy: 20.56%
Epoch 2, Training loss : 1.7920
Validation Accuracy: 20.56%
Epoch 3, Training loss : 1.7920
Validation Accuracy: 20.56%
Epoch 4, Training loss : 1.7920
Validation Accuracy: 20.56%
Epoch 5, Training loss : 1.7920
Validation Accuracy: 20.56%
Epoch 6, Training loss : 1.7920
Validation Accuracy: 20.56%
Epoch 7, Training loss : 1.7920
Validation Accuracy: 20.56%
