In [87]:
import numpy as np
import pandas as pd
import random
import re
import nltk
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset, random_split
import torch.optim as optim
from nltk.corpus import stopwords
from collections import Counter
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.metrics import precision_score, recall_score, f1_score
import random
import pprint
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# Seed every random number generator this notebook relies on (stdlib, torch,
# numpy) so that Restart & Run All reproduces the same shuffles and results.
SEED = 184
random.seed(SEED)
torch.manual_seed(SEED);  # trailing ';' keeps the returned Generator out of the cell output
np.random.seed(SEED)

In [88]:
# Fetch the NLTK resources used further down: the stop-word corpus read via
# stopwords.words('english') and the Punkt tokenizer data (presumably what
# nltk.word_tokenize needs; both the 'punkt_tab' and 'punkt' packages are fetched).
# The return value of the last call is what the cell displays as its output.
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Read in data

In [89]:
# Default number of items shown by the preview helpers in this notebook
peek = 20
def present_list_like(name, list_like, peek=peek):
    """Print a titled preview of the first ``peek`` items of a sliceable collection.

    Args:
        name: Label printed in the header line, followed by " peek:".
        list_like: Any object supporting ``[:peek]`` slicing.
        peek: Maximum number of items to print (defaults to the module-level ``peek``).
    """
    header = f"{name} peek:"
    print(header)
    # One item per line, each indented by two spaces
    shown_items = [str(item) for item in list_like[:peek]]
    print('  ' + '\n  '.join(shown_items))

In [90]:
# Column names assigned to the TSV files when they are read (the files are
# loaded with names=columns, i.e. without a header row of their own).
columns = [
    # identification and target
    'id',
    'label',
    # statement and its metadata
    'claim',
    'subject',
    'speaker',
    'speaker_job_title',
    'state_info',
    'party_affiliation',
    # numeric count columns
    'barely_true_counts',
    'false_counts',
    'half_true_counts',
    'mostly_true_counts',
    'pants_on_fire_counts',
    # where the statement was made
    'context',
]
# Show every column name rather than only the default preview length
present_list_like(
    f"Dataset columns({len(columns)} in total)",
    columns,
    peek=len(columns),
)
def load_data(split):
    """Read one split of the dataset from ``./data/{split}.tsv``.

    Rows whose ``claim`` is missing or empty are dropped. A summary
    (``DataFrame.info``) and the first ``peek`` rows are printed.

    Args:
        split: File stem of the split to load (e.g. ``"train"``).

    Returns:
        pandas.DataFrame whose columns are named after the module-level
        ``columns`` list.
    """
    # NOTE(review): pandas' default quote handling can merge rows when a field
    # contains an unbalanced double quote. If the row count looks lower than
    # expected, consider quoting=csv.QUOTE_NONE; confirm against the raw file.
    df = pd.read_csv(f"./data/{split}.tsv", sep='\t', names=columns)

    # pandas represents missing cells as NaN (a float), not None, so calling
    # len() on them would raise; detect them with isna() instead.
    claim = df["claim"]
    unusable = claim.isna() | (claim.astype(str).str.len() == 0)
    df = df.drop(index=df.index[unusable])

    print(f"The {split} dataset:")
    df.info()
    print("\nData peek:")
    print(df.head(peek))
    print()
    return df

Dataset columns(14 in total) peek:
  id
  label
  claim
  subject
  speaker
  speaker_job_title
  state_info
  party_affiliation
  barely_true_counts
  false_counts
  half_true_counts
  mostly_true_counts
  pants_on_fire_counts
  context


## Tokenize the data

In [91]:
# Filler token appended to token lists that are shorter than the fixed
# per-text length chosen in tokenize_text.
pad_tkn = "<PAD>"

In [92]:
def tokenize_text(input_text, known_vector_size=None, token_to_idx=None, max_vocab_size=10000):
    """Clean, tokenize, pad/truncate and index a collection of texts.

    Each text is lower-cased, stripped of characters outside tab and
    printable ASCII, whitespace-normalised, stripped of ``http...`` URLs and
    of English stop words, then tokenized with ``nltk.word_tokenize``. Every
    token list is cut or padded (with ``pad_tkn``) to one common length and
    finally converted to integer indices.

    Args:
        input_text: Sequence of strings. It is not modified.
        known_vector_size: Number of tokens kept per text. When falsy, the
            80th percentile of the token counts in ``input_text`` is used.
        token_to_idx: Existing token -> index mapping to reuse. When ``None``
            or empty, a new mapping is built from ``input_text``.
        max_vocab_size: Upper bound on the number of most-frequent tokens put
            into a newly built mapping (``pad_tkn`` is added on top if it did
            not make the cut). Ignored when ``token_to_idx`` is supplied.

    Returns:
        tuple ``(freq_indexed, token_to_idx)``: the list of equal-length index
        lists (tokens missing from the mapping are encoded as 0) and the
        mapping that was used.
    """
    # Built once per call instead of once per text
    valid_asciis = {9, *range(32, 127)}
    stop_words = set(stopwords.words('english'))

    def preprocess_text(text)->str:
        #Letter-level cleaning
        text = text.lower()
        text = ''.join(ch for ch in text if ord(ch) in valid_asciis)

        #Word/sequence-level cleaning
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'http\S+', '', text)
        return ' '.join(word for word in text.split() if word not in stop_words)

    #Preprocess the text (into a new list, so the caller's data stays untouched)
    cleaned_text = [preprocess_text(text) for text in input_text]

    #Tokenize
    final_tokens = [nltk.word_tokenize(text) for text in cleaned_text]
    total_tokens = sum(len(tkns) for tkns in final_tokens)

    # Make all token sets the same length
    if known_vector_size:
        forced_tkn_set_size = known_vector_size
    elif final_tokens:
        forced_tkn_set_size = int(np.percentile([len(tkns) for tkns in final_tokens], 80))
    else:
        # No texts at all: there is nothing to take a percentile of
        forced_tkn_set_size = 0
    final_tokens = [
        tkns[:forced_tkn_set_size] + [pad_tkn]*(forced_tkn_set_size - len(tkns))
        for tkns in final_tokens
    ]

    # Present results
    present_list_like(f"Tokenized sentences({len(final_tokens)} sentences, {total_tokens} total tokens)", final_tokens)


    #Index the tokens
    # Map each token to its frequency rank (1 = most frequent); 0 is left for
    # tokens that are not part of the mapping.
    if not token_to_idx:
        frequencies = Counter(word for token_set in final_tokens for word in token_set)
        token_to_idx = {
            word: rank
            for rank, (word, _) in enumerate(frequencies.most_common()[:max_vocab_size], start=1)
        }
        if pad_tkn not in token_to_idx:
            token_to_idx[pad_tkn] = len(token_to_idx) + 1
    vocab_size = len(token_to_idx)
    print()
    print(vocab_size, "unique tokens")
    present_list_like("Unique tokens", list(token_to_idx.keys()))

    # Index the tokens
    freq_indexed = [
        [token_to_idx.get(token, 0) for token in token_set]
        for token_set in final_tokens
    ]

    # Present results
    present_list_like(f"\nFinal Index Sets(Set_Size = {forced_tkn_set_size}, {len(freq_indexed)} index sets)", freq_indexed)

    return freq_indexed, token_to_idx

In [93]:
def get_freq_indexed_and_labels(split, known_vector_size=None, token_to_idx=None, fuse_labels=True):
    """Load a split, append its metadata to each claim and tokenize/index the result.

    Args:
        split: Name of the split, passed to ``load_data``.
        known_vector_size: Forwarded to ``tokenize_text``.
        token_to_idx: Token -> index mapping forwarded to ``tokenize_text``;
            ``None`` is treated like an empty mapping.
        fuse_labels: When True (default), labels are collapsed into two
            classes: "true" (from "true", "mostly-true") and "false" (from
            "false", "half-true", "barely-true", "pants-fire"). Any other
            label is kept as is. When False, the raw labels are returned.

    Returns:
        tuple ``(freq_indexed, token_to_idx, input_labels)``: the indexed
        texts and mapping returned by ``tokenize_text`` plus the labels.
    """
    df = load_data(split)

    #Augment input text with the other columns
    # A tuple (not a set) keeps the order of the appended fields identical
    # from run to run; the order matters because texts are truncated later.
    other_cols = (
        "context",
        "subject",
        "speaker",
        "speaker_job_title",
        "state_info",
        "party_affiliation",
    )
    extra_values = {col: df[col].to_numpy() for col in other_cols}

    input_text = []
    for i, claim in enumerate(df["claim"].to_numpy()):
        # Missing cells are NaN, which is truthy; skip them explicitly so that
        # no "column: nan" fragments end up in the text.
        extra_data = [
            f"{col}: {extra_values[col][i]}"
            for col in other_cols
            if pd.notna(extra_values[col][i]) and extra_values[col][i]
        ]
        input_text.append(" | \n".join([claim, *extra_data]))

    input_labels = df["label"].to_numpy()
    if fuse_labels:
        #Fuse some labels
        fused = {
            "false": "false",
            "half-true": "false",
            "barely-true": "false",
            "pants-fire": "false",
            "true": "true",
            "mostly-true": "true",
        }
        input_labels = np.array([fused.get(x, x) for x in input_labels])

    freq_indexed, token_to_idx = tokenize_text(input_text, known_vector_size, token_to_idx or {})

    return freq_indexed, token_to_idx, input_labels

## Turn the data into tensors

In [94]:
def as_tensors(split, label_encoder=None, known_vector_size=None, token_to_idx={}):
    """Load and index a split, returning its inputs and labels as long tensors.

    Args:
        split: Name of the split, forwarded to ``get_freq_indexed_and_labels``.
        label_encoder: Already fitted encoder to reuse. When ``None`` a new
            ``LabelEncoder`` is created and fitted on this split's labels.
        known_vector_size: Forwarded to ``get_freq_indexed_and_labels``.
        token_to_idx: Token -> index mapping, forwarded as well.

    Returns:
        tuple ``(X, token_to_idx, label_encoder, input_labels, y)``: the index
        tensor, the mapping that was used, the encoder, the string labels and
        the encoded label tensor.
    """
    indexed_rows, token_to_idx, input_labels = get_freq_indexed_and_labels(
        split, known_vector_size, token_to_idx
    )
    X = torch.tensor(indexed_rows, dtype=torch.long)

    # Fit a fresh encoder only when the caller did not hand one in
    if label_encoder is None:
        label_encoder = LabelEncoder()
        encoded_labels = label_encoder.fit_transform(input_labels)
    else:
        encoded_labels = label_encoder.transform(input_labels)
    y = torch.tensor(encoded_labels, dtype=torch.long)

    print(f"{split.upper()} SPLIT:", X.size(0), "overall samples:", X.shape)

    return X, token_to_idx, label_encoder, input_labels, y

## Training

In [95]:
# Number of samples per mini-batch handed out by the DataLoader
BATCH_SIZE = 32

In [96]:
# Build the training tensors. No encoder or vocabulary is passed in, so both
# the label encoder and the token mapping are fitted on the training split.
(
    X_train,
    token_to_idx,
    label_encoder,
    train_input_labels,
    y_train,
) = as_tensors("train")

# Class name -> encoded integer, in the encoder's own class order
label_to_idx = {
    class_name: position
    for position, class_name in enumerate(label_encoder.classes_)
}

# Sizes derived from the training split
input_vector_size = X_train.shape[1]
train_vocab_size = len(token_to_idx)

# Shuffled mini-batches over the training tensors
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

The training dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10240 entries, 0 to 10239
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    10240 non-null  object 
 1   label                 10240 non-null  object 
 2   claim                 10240 non-null  object 
 3   subject               10238 non-null  object 
 4   speaker               10238 non-null  object 
 5   speaker_job_title     7342 non-null   object 
 6   state_info            8030 non-null   object 
 7   party_affiliation     10238 non-null  object 
 8   barely_true_counts    10238 non-null  float64
 9   false_counts          10238 non-null  float64
 10  half_true_counts      10238 non-null  float64
 11  mostly_true_counts    10238 non-null  float64
 12  pants_on_fire_counts  10238 non-null  float64
 13  context               10138 non-null  object 
dtypes: float64(5), object(9)
memory usage: 1.1+ MB



In [97]:
# Relative frequency of each label in the training split
train_label_counts = pd.DataFrame({"label": train_input_labels})["label"].value_counts(normalize=True)
print(train_label_counts.shape[0], "labels\n")
print(train_label_counts)

# Optional oversampling of the training split with SMOTE. It is switched off;
# set the flag to True to enable it.
BALANCE_TRAIN_SPLIT = False
if BALANCE_TRAIN_SPLIT:
    #Balance if necessary
    # NOTE(review): X_train holds token indices and the resampled values are
    # cast back to long; confirm that synthetic rows are meaningful here.
    # NOTE(review): train_dataset / train_loader are created in an earlier cell
    # from the unbalanced tensors; rebuild them after this step if the balanced
    # data should be used for training. TODO confirm whether a later cell does.
    print("TRAIN SPLIT(pre-balancing):", X_train.size(0), "overall samples:", X_train.shape)
    X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)
    X_train = torch.tensor(X_train, dtype=torch.long)
    y_train = torch.tensor(y_train, dtype=torch.long)
    print()
    print("TRAIN SPLIT(post-balancing):", X_train.size(0), "overall samples:", X_train.shape)
    print(pd.DataFrame({"label": [label_encoder.classes_[y] for y in y_train]})["label"].value_counts())

2 labels

label
false    0.644727
true     0.355273
Name: proportion, dtype: float64
