In [63]:
import numpy as np
import pandas as pd
import re
import nltk
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset, random_split
import torch.optim as optim
from nltk.corpus import stopwords
from collections import Counter
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
import random
import pprint

In [64]:
# Fetch the NLTK resources used later for stop-word removal and tokenization
for resource in ('stopwords', 'punkt_tab'):
    nltk.download(resource)
# Kept as the cell's final expression so its return value is still displayed
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Read in data

In [65]:
peek = 10
def present_list_like(name, list_like, peek=peek):
    """Print a titled preview of the first `peek` items of a sliceable sequence.

    Args:
        name: Title printed before the preview (followed by " peek:").
        list_like: Any object supporting slice indexing.
        peek: Number of leading items to show. The default is bound to the
            module-level `peek` value at the time this function is defined.
    """
    print(f"{name} peek:")
    # One item per line, each indented by two spaces
    preview_rows = map(str, list_like[:peek])
    print('  ' + '\n  '.join(preview_rows))

In [66]:
# Load the covid_lies.csv dataset into a dataframe and summarise it
df = pd.read_csv('./data/covid_lies.csv')
print("The dataset:")
df.info()
print("\nData peek:")
print(df.head(peek), end="\n\n")

# Separate the text column from the label column
input_text, input_label = (
    df[column].to_numpy() for column in ('misconception', 'label')
)
print("Unique labels:", np.unique(input_label))

# Relative frequency of each label in the original (unbalanced) data
orig_label_counts = df['label'].value_counts(normalize=True)
print("Counts of 'labels':", orig_label_counts)

The dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6591 entries, 0 to 6590
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   misconception_id  6591 non-null   int64 
 1   misconception     6591 non-null   object
 2   tweet_id          6591 non-null   int64 
 3   label             6591 non-null   object
dtypes: int64(2), object(2)
memory usage: 206.1+ KB

Data peek:
   misconception_id                                      misconception  \
0                 3             Coronavirus is genetically engineered.   
1                30  Blowing conch shells destroys coronavirus pote...   
2                57  Swans and dolphins swimming in Venice canals f...   
3                22                         Cocaine cures coronavirus.   
4                32  Observing janata curfew will result in the red...   
5                25  Holy communion cannot be the cause of the spre...   
6                61  Li

In [67]:
# Balance the dataset by oversampling the two smaller classes
# NOTE(review): the oversampling happens before the train/test split performed
# later in this notebook, so copies of the same row can end up on both sides of
# that split. Consider splitting first and resampling only the training part.

from sklearn.utils import resample

# Partition rows by label; 'na' is treated as the majority class
label_column = df['label']
df_majority = df[label_column == 'na']
df_minority_1 = df[label_column == 'pos']
df_minority_2 = df[label_column == 'neg']
target_size = len(df_majority)

# Draw with replacement until each minority class matches the majority size
df_minority_1_upsampled, df_minority_2_upsampled = (
    resample(minority, replace=True, n_samples=target_size, random_state=42)
    for minority in (df_minority_1, df_minority_2)
)

# Stack the majority rows and the two oversampled sets into one frame
df_balanced = pd.concat(
    [df_majority, df_minority_1_upsampled, df_minority_2_upsampled]
)

# Summarise the balanced frame
print("The dataset:")
df_balanced.info()
print("\nData peek:")
print(df_balanced.head(peek), end="\n\n")

# Separate the text column from the label column
input_text, input_label = (
    df_balanced[column].to_numpy() for column in ('misconception', 'label')
)
print("Unique labels:", np.unique(input_label))
print("Counts of 'labels':", df_balanced['label'].value_counts(normalize=True))

The dataset:
<class 'pandas.core.frame.DataFrame'>
Index: 18447 entries, 0 to 2179
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   misconception_id  18447 non-null  int64 
 1   misconception     18447 non-null  object
 2   tweet_id          18447 non-null  int64 
 3   label             18447 non-null  object
dtypes: int64(2), object(2)
memory usage: 720.6+ KB

Data peek:
   misconception_id                                      misconception  \
0                 3             Coronavirus is genetically engineered.   
1                30  Blowing conch shells destroys coronavirus pote...   
2                57  Swans and dolphins swimming in Venice canals f...   
3                22                         Cocaine cures coronavirus.   
4                32  Observing janata curfew will result in the red...   
5                25  Holy communion cannot be the cause of the spre...   
6                61  Lions 

## Preprocess input data

In [68]:
# Character codes kept by preprocess_text: horizontal tab plus printable ASCII
_VALID_ASCII_CODES = frozenset({9, *range(32, 127)})

# Filled on first use so the NLTK corpus is read once instead of once per sentence
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return the English stop-word set, loading it from NLTK on first use only."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def preprocess_text(text)->str:
    """Normalise one sentence for tokenization.

    Steps, in order: lowercase; drop every character that is not a tab or
    printable ASCII; collapse whitespace runs to a single space; remove
    `http...` URLs; drop whitespace-delimited words found in NLTK's English
    stop-word list.

    Args:
        text: The raw sentence (must support `.lower()`).

    Returns:
        The cleaned sentence with words separated by single spaces.
    """
    #Letter-level cleaning
    text = text.lower()
    text = ''.join(ch for ch in text if ord(ch) in _VALID_ASCII_CODES)

    #Word/sequence-level cleaning
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'http\S+', '', text)
    stop_words = _english_stop_words()
    # split()/join also removes the extra space left behind by a stripped URL
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text


In [69]:
# Clean every sentence, writing the result back into the same array slot
for position, raw_sentence in enumerate(input_text):
    input_text[position] = preprocess_text(raw_sentence)

## Tokenize input text data

In [70]:
# Tokenize the text

# First pass: NLTK word tokenization of each cleaned sentence
input_tokens = list(map(nltk.word_tokenize, input_text))

# Second pass: split each token on runs of non-word characters. The capture
# group keeps those runs as tokens of their own; empty pieces are dropped.
non_word_run = re.compile(r"(\W+)")
final_tokens = [
    [piece for tk in token_set for piece in non_word_run.split(tk) if piece]
    for token_set in input_tokens
]
total_tokens = sum(map(len, final_tokens))
present_list_like(f"Tokenized sentences({len(final_tokens)} sentences, {total_tokens} tokens)", final_tokens)

Tokenized sentences(18447 sentences, 133252 tokens) peek:
  ['coronavirus', 'genetically', 'engineered', '.']
  ['blowing', 'conch', 'shells', 'destroys', 'coronavirus', 'potency', '.']
  ['swans', 'dolphins', 'swimming', 'venice', 'canals', 'following', 'covid', '-', '19', 'lockdown', '.']
  ['cocaine', 'cures', 'coronavirus', '.']
  ['observing', 'janata', 'curfew', 'result', 'reduction', 'covid', '-', '19', 'cases', '40', '%', '.']
  ['holy', 'communion', 'can', 'not', 'cause', 'spread', 'coronavirus']
  ['lions', 'freed', 'keep', 'people', 'streets', 'moscow', '.']
  ['coronavirus', 'genetically', 'engineered', '.']
  ['cannabis', 'protects', 'covid', '-', '19', '.']
  ['safe', 'individuals', 'infected', 'covid', '-', '19', 'go', 'work', '.']


## Form embeddings for input data

In [71]:
# Turn tokens into integer ids and pad every sentence to a common length

# Rank each distinct token by how often it occurs (rank 1 = most frequent)
flat_tokens = []
for token_set in final_tokens:
    flat_tokens.extend(token_set)
frequencies = Counter(flat_tokens)
token_to_idx = {
    word: rank
    for rank, (word, _) in enumerate(frequencies.most_common(), start=1)
}
vocab_size = len(token_to_idx)
print(vocab_size, "unique tokens")
present_list_like("Unique tokens", list(token_to_idx.keys()))

# Replace every token with its frequency rank
freq_indexed = [
    [token_to_idx[token] for token in token_set] for token_set in final_tokens
]

# Right-pad with 0 (an id no real token receives) up to the longest sentence
forced_idx_set_size = max(map(len, freq_indexed))
padded_index_sets = []
for idxs in freq_indexed:
    missing = forced_idx_set_size - len(idxs)
    padded_index_sets.append(idxs[:forced_idx_set_size] + [0] * missing)
freq_indexed = padded_index_sets
present_list_like(f"\nFinal Index Sets(Set_Size = {forced_idx_set_size}, {len(freq_indexed)} index sets)", freq_indexed)

266 unique tokens
Unique tokens peek:
  .
  -
  coronavirus
  covid
  19
  deadly
  seasonal
  flu
  survive
  cure

Final Index Sets(Set_Size = 19, 18447 index sets) peek:
  [3, 15, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
  [134, 135, 136, 137, 3, 138, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
  [165, 166, 167, 99, 100, 62, 4, 2, 5, 63, 1, 0, 0, 0, 0, 0, 0, 0, 0]
  [168, 30, 3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
  [241, 242, 243, 244, 245, 4, 2, 5, 246, 247, 248, 1, 0, 0, 0, 0, 0, 0, 0]
  [22, 23, 17, 18, 24, 19, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
  [169, 170, 171, 172, 96, 173, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
  [3, 15, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
  [174, 66, 4, 2, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
  [108, 109, 74, 4, 2, 5, 110, 111, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


## Define and train a GRU model using PyTorch

In [72]:
#Define the model

import torch
import torch.nn as nn

class GRUModel(nn.Module):
    """Sequence classifier: token embedding -> stacked GRU -> linear head.

    `forward` returns raw, unnormalised class scores (logits). That is the
    input `nn.CrossEntropyLoss` expects, because that loss applies log-softmax
    itself; feeding it already-softmaxed values squashes the gradients. Use
    `predict_proba` when class probabilities are needed. The arg-max class is
    the same for logits and probabilities.

    Args:
        input_dim: Number of rows in the embedding table (largest token id + 1).
        embedding_dim: Size of each token embedding vector.
        hidden_dim: Size of the GRU hidden state.
        output_dim: Number of target classes.
        n_layers: Number of stacked GRU layers.
        dropout: Dropout probability applied between GRU layers.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, n_layers, dropout):
        super(GRUModel, self).__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        # Inter-layer dropout has no effect on a single-layer GRU and only
        # triggers a PyTorch warning, so it is disabled in that case.
        self.gru = nn.GRU(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            batch_first=True,
            dropout=dropout if n_layers > 1 else 0.0,
        )
        self.fc = nn.Linear(hidden_dim, output_dim)
        # Parameter-free; kept so existing code touching `.sm` keeps working.
        self.sm = nn.Softmax(dim=-1)

    def forward(self, x):
        """Return class logits of shape (batch, output_dim) for token ids `x`."""
        _, hidden = self.gru(self.embedding(x))
        # hidden[-1] is the final hidden state of the top GRU layer
        return self.fc(hidden[-1])

    def predict_proba(self, x):
        """Return softmax class probabilities of shape (batch, output_dim)."""
        return self.sm(self(x))

In [73]:
#Training fn
def train_model(model, dataloader, optimizer, criterion, device, epochs=10):
    """Train `model` in place and print loss/accuracy after every epoch.

    Args:
        model: Module mapping a batch of inputs to per-class scores of shape
            (batch, num_classes).
        dataloader: Sized iterable yielding `(inputs, labels)` tensor pairs.
        optimizer: Optimizer already bound to `model`'s parameters.
        criterion: Loss called as `criterion(predictions, labels)`.
        device: Device each batch is moved to before the forward pass.
        epochs: Number of passes over `dataloader`.

    Returns:
        None. The reported loss is the mean of the per-batch losses; the
        reported accuracy is correct predictions divided by samples seen.
    """
    model.train()
    for epoch in range(epochs):
        epoch_loss = 0.0
        correct_predictions = 0
        samples_seen = 0

        for inputs, labels in dataloader:
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            # No squeeze(): it would collapse a batch of size 1 to shape
            # (num_classes,), which breaks both the loss and the arg-max below.
            predictions = model(inputs)
            loss = criterion(predictions, labels)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item() #Track total loss
            # Count per sample so a smaller final batch is not over-weighted
            predicted_classes = predictions.argmax(dim=1)
            correct_predictions += (predicted_classes == labels).sum().item()
            samples_seen += labels.size(0)

        print(f"Epoch {epoch+1}/{epochs} | Loss: {epoch_loss/len(dataloader):.4f} | Accuracy: {correct_predictions/samples_seen:.4f}")

In [74]:
#Setup to train

# Model and training structure
INPUT_DIM = vocab_size + 1 # +1 because token ids start at 1; id 0 is the padding value
EMBEDDING_DIM = 100
HIDDEN_DIM = 128
OUTPUT_DIM = 3 #pos, neg, na
N_LAYERS = 2
DROPOUT = 0.5
BATCH_SIZE = 32
EPOCHS = 15
SEED = 42

# Seed torch so weight initialisation and batch shuffling are reproducible
torch.manual_seed(SEED)

# Make the model
gru_model = GRUModel(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, DROPOUT)

# Move model to GPU if possible
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
gru_model = gru_model.to(device)

# Optimization & loss setup
optimizer = torch.optim.Adam(gru_model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()
criterion = criterion.to(device)

# Setup the training dataset
X = torch.tensor(freq_indexed, dtype=torch.long)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(input_label)
y = torch.tensor(y, dtype=torch.long)
print(X.size(0), "overall samples of shape", X.shape)

# Stratify so both parts keep the same class proportions
X_train, X_test, y_train, y_test = train_test_split(
    X.numpy(), y.numpy(), test_size=0.2, random_state=SEED, stratify=y.numpy()
)
X_train, X_test = torch.tensor(X_train, dtype=torch.long), torch.tensor(X_test, dtype=torch.long)
y_train, y_test = torch.tensor(y_train, dtype=torch.long), torch.tensor(y_test, dtype=torch.long)
print(X_train.size(0), "training samples of shape", X_train.shape)
print(y_test.size(0), "validation samples of shape", X_test.shape)
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
# Evaluation does not depend on batch order, so keep it deterministic
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

18447 overall samples of shape torch.Size([18447, 19])
14757 training samples of shape torch.Size([14757, 19])
3690 validation samples of shape torch.Size([3690, 19])


In [None]:
# Fit the GRU on the training batches for the configured number of epochs
train_model(
    model=gru_model,
    dataloader=train_loader,
    optimizer=optimizer,
    criterion=criterion,
    device=device,
    epochs=EPOCHS,
)

## Save Model Weights

In [75]:
# Persist only the learned parameters (the state dict), not the module object
trained_weights = gru_model.state_dict()
torch.save(trained_weights, "gru_model_weights.pth")
print("Model weights saved to 'gru_model_weights.pth'")

Model weights saved to 'gru_model_weights.pth'


## Evaluate Model