In [9]:
import numpy as np
import pandas as pd
import re
import nltk
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset, random_split
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

In [20]:
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Read in data

In [58]:
peek = 10
def present_list_like(name, list_like, peek=peek):
    print(f"{name} peek:")
    print('  ' + '\n  '.join([str(seq) for seq in list_like[0:peek]]))

In [73]:
# read data from covid_lies.csv dataset into dataframe
df = pd.read_csv('./data/covid_lies.csv')
print("The dataset:")
df.info()
print("\nData peek:")
print(df.head(peek))
print()

# seperate out text data and labels
input_text = df['misconception'].to_numpy()
input_label = df['label'].to_numpy()
print("Unique labels:", np.unique(input_label))

The dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6591 entries, 0 to 6590
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   misconception_id  6591 non-null   int64 
 1   misconception     6591 non-null   object
 2   tweet_id          6591 non-null   int64 
 3   label             6591 non-null   object
dtypes: int64(2), object(2)
memory usage: 206.1+ KB

Data peek:
   misconception_id                                      misconception  \
0                 3             Coronavirus is genetically engineered.   
1                30  Blowing conch shells destroys coronavirus pote...   
2                57  Swans and dolphins swimming in Venice canals f...   
3                22                         Cocaine cures coronavirus.   
4                32  Observing janata curfew will result in the red...   
5                25  Holy communion cannot be the cause of the spre...   
6                61  Li

## Preprocess input data

In [74]:
# handle setting all values to our preferred casing style
# removing erroneous punctuation
# handle stop words
# think about standardizing number formats, not fully necessary (careful with 'covid-19')

def preprocess_text(text)->str:
    #Letter-level cleaning
    text = text.lower() #Lowering significantly reduces the number of possible tokens to deal with
    valid_asciis = {9, *range(32, 127)}
    text = ''.join(filter(lambda x: ord(x) in valid_asciis, text)) #Remove irrelevant text(emojis, special characters/symbols, etc)

    #Word/sequence-level cleaning
    text = re.sub(r'\s+', ' ', text) #Remove extra spaces
    text = re.sub(r'http\S+', '', text) #Remove URLs
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text


In [75]:
#Preprocess the text
for i in range(len(input_text)):
    input_text[i] = preprocess_text(input_text[i])

## Tokenize input text data

In [76]:
#Tokenize the text
input_tokens = [nltk.word_tokenize(text) for text in input_text]
present_list_like("Tokenized sentences", input_tokens)

Tokenized sentences peek:
  ['coronavirus', 'genetically', 'engineered', '.']
  ['blowing', 'conch', 'shells', 'destroys', 'coronavirus', 'potency', '.']
  ['swans', 'dolphins', 'swimming', 'venice', 'canals', 'following', 'covid-19', 'lockdown', '.']
  ['cocaine', 'cures', 'coronavirus', '.']
  ['observing', 'janata', 'curfew', 'result', 'reduction', 'covid-19', 'cases', '40', '%', '.']
  ['holy', 'communion', 'can', 'not', 'cause', 'spread', 'coronavirus']
  ['lions', 'freed', 'keep', 'people', 'streets', 'moscow', '.']
  ['coronavirus', 'genetically', 'engineered', '.']
  ['cannabis', 'protects', 'covid-19', '.']
  ['safe', 'individuals', 'infected', 'covid-19', 'go', 'work', '.']


## Form embeddings for input data

In [77]:
#Define a vocab based on the unique tokens in the training set
vocab = set()
for token_set in input_tokens:
    vocab.update(token_set)
vocab = list(vocab)
print(f"Vocabulary size: {len(vocab)}")
present_list_like("Vocabulary", vocab)

Vocabulary size: 260
Vocabulary peek:
  silver-infused
  amounts
  vaccine
  wastes
  protects
  infected
  published
  predicted
  cbd
  solution


In [80]:
vectorizer = CountVectorizer(vocabulary=vocab)
bow_matrix = vectorizer.fit_transform([' '.join(token_set) for token_set in input_tokens])
bow_array = bow_matrix.toarray()
print(
    "Avg Density(mean of the % of non-zero values per embedding):",
    f"{round(np.mean([np.sum(arr > 0)/len(vocab)*100 for arr in bow_array]), 2)}%"
)
present_list_like("Bag-of-Words Matrix", bow_array)

Avg Density(mean of the % of non-zero values per embedding): 1.99%
Bag-of-Words Matrix peek:
  [0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0]
  [0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

## Define model using PyTorch

## Save Model Weights

## Evaluate Model