In [2]:
import sys
if ".." not in sys.path:
    #sys.path.insert(0, "..")
    sys.path.append('../')
import logging
logging.basicConfig(level='ERROR')
from transformers import logging
logging.set_verbosity_error()
import time
import datetime
import os
import wandb
import wget
import pandas as pd
import numpy as np
import random
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import (AutoModel, AutoTokenizer, AutoModelForSequenceClassification, 
    get_linear_schedule_with_warmup, AutoConfig)
from sklearn import metrics
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
f1_score, cohen_kappa_score, roc_auc_score, confusion_matrix, log_loss,
matthews_corrcoef, average_precision_score)

#Custom modules
import utils
from utils import custom_models, early_stopping, worthiness_checker, constants

In [4]:
os.environ["WANDB_SILENT"] = "true"
os.environ["WANDB_NOTEBOOK_NAME"] = 'TransformersForClaimWorthiness.ipynb'

# Constants
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Parent of the current working directory; data and model paths in this notebook are built from it.
parent_dir = os.path.dirname(os.path.abspath(os.getcwd()))
# Seeds iterated over by cross_validate(); each one reseeds random, numpy and torch.
seed_list = [42] 
# Configured number of cross-validation folds (stored on `constants` for WorthinessChecker).
fold_count = 6
# Passed as `patience` to utils.early_stopping.EarlyStopping.
patience=5
# Binary cross-entropy on probabilities: the training/evaluation loops feed it
# the flattened model output directly, without applying any further activation.
loss_function = nn.BCELoss()

# Column names and dtypes of the one-row metrics frame built by get_metrics().
metric_types = np.dtype(
    [
        ("mAP", float),
        ("auc", float),
        ("accuracy", float),
        ("precision", float),
        ("recall", float),
        ("f1", float),
        ("mcc", float),
        ("log_loss", float),
        ("loss", float)
    ])

In [3]:
# Bundle the notebook-level settings into one object that is later handed to
# WorthinessChecker. The class is reached through the `utils` package (the same
# way EarlyStopping and WorthinessChecker are referenced in this notebook)
# because this cell rebinds the name `constants` from the imported module to the
# instance; calling `constants.Constants()` would raise AttributeError the
# second time the cell is executed.
constants = utils.constants.Constants()
constants.device = device
constants.parent_dir = parent_dir
constants.seed_list = seed_list
constants.fold_count = fold_count
constants.patience = patience
constants.loss_function = loss_function
constants.metric_types = metric_types

In [4]:
data_dir = os.path.join(parent_dir, 'Data')

### Training Model

In [5]:
def get_data_from_file(data_version):
    """Load the train and test splits of one dataset variant.

    Args:
        data_version: suffix inserted into the file names, e.g. the
            ``data_version`` values listed in the sweep configuration.

    Returns:
        ``(train_df, test_df)`` read from the tab-separated files
        ``train_english_<data_version>.tsv`` and
        ``test_english_<data_version>.tsv`` in ``<parent_dir>/Data``.
    """
    file_templates = ('train_english_{}.tsv', 'test_english_{}.tsv')
    train_df, test_df = (
        pd.read_csv(
            os.path.join(parent_dir, 'Data', template.format(data_version)),
            delimiter='\t',
        )
        for template in file_templates
    )
    return train_df, test_df

In [40]:
def get_fold_list(df: pd.DataFrame, fold_count, random_state):
    """Split ``df`` into ``fold_count`` class-stratified folds.

    The frame is shuffled once, separated by ``check_worthiness`` (0 / 1), and
    each class is cut into ``fold_count`` consecutive slices of equal size
    (integer division). The last fold additionally receives whatever rows are
    left over by that division. Every fold is shuffled again before being
    returned, so classes are interleaved inside a fold.

    Args:
        df: frame containing a ``check_worthiness`` column with 0/1 labels.
        fold_count: number of folds to produce.
        random_state: seed used for every shuffle, making the split repeatable.

    Returns:
        A list of ``fold_count`` DataFrames that keep the original index labels.
    """
    shuffled = df.sample(frac=1, replace=False, random_state=random_state)

    # Negatives first, positives second: the concat order below feeds the
    # seeded per-fold shuffle, so it has to stay fixed.
    class_frames = [shuffled[shuffled['check_worthiness'] == label] for label in (0, 1)]
    slice_sizes = [frame.shape[0] // fold_count for frame in class_frames]

    folds = []
    for fold_no in range(fold_count):
        is_last_fold = fold_no + 1 == fold_count
        pieces = []
        for frame, size in zip(class_frames, slice_sizes):
            start = size * fold_no
            # An open-ended slice lets the last fold absorb the remainder rows.
            stop = None if is_last_fold else size * (fold_no + 1)
            pieces.append(frame.iloc[start:stop])

        folds.append(
            pd.concat(pieces).sample(frac=1, replace=False, random_state=random_state)
        )

    return folds

In [7]:


def create_dataset(df, tokenizer, config):
    """Tokenize a frame of tweets into a ``TensorDataset``.

    Args:
        df: frame with ``tweet_text`` and ``check_worthiness`` columns; its
            index labels must be integers because they are stored as a tensor.
        tokenizer: object exposing ``encode_plus`` (a Hugging Face tokenizer in
            this notebook).
        config: object exposing ``max_token_length``; every tweet is padded or
            truncated to exactly that many tokens.

    Returns:
        ``TensorDataset(input_ids, labels, indices)`` where ``labels`` is a
        float tensor and ``indices`` an int32 tensor of the frame's index
        labels. No attention masks are produced.
    """
    token_budget = config.max_token_length

    def _encode(text):
        # Tokenize one tweet and return its (1, token_budget) tensor of token ids.
        encoded = tokenizer.encode_plus(
            text,
            add_special_tokens=True,       # add '[CLS]' and '[SEP]' or equivalent
            max_length=token_budget,
            truncation=True,
            padding='max_length',
            return_attention_mask=False,   # masks are not used downstream here
            return_tensors='pt',
        )
        return encoded['input_ids']

    id_rows = [_encode(text) for text in df.tweet_text.values]

    # Stack the per-tweet rows and convert the remaining columns to tensors.
    input_ids = torch.cat(id_rows, dim=0)
    labels = torch.tensor(df.check_worthiness.values).float()
    indices = torch.tensor(df.index.values.tolist()).int()
    return TensorDataset(input_ids, labels, indices)

In [8]:
# Search space for the wandb hyperparameter sweep. This dict is what the
# (currently commented-out) `wandb.sweep(sweep_config, project=project)` call
# further down would register.
sweep_config = {
    'name': 'Bert_hyperparameters',
    'method': 'bayes', #grid, random
    'program': 'TransformersForClaimWorthiness.ipynb',
    'early_terminate': {
      'type': 'hyperband',
      'eta': 3,
      's': 2,
      'max_iter': 27   
    },
    # Optimisation target: the key cross_validate() logs through
    # log_metrics(..., 'avg_val_') for the 'mAP' column.
    'metric': {
      'name': 'avg_val_mAP',
      'goal': 'maximize'   
    },
    'parameters': {
        # Suffix of the train/test TSV files read by get_data_from_file().
        'data_version': {
          'values': ['raw', 'cleaned_with_mentions', 'cleaned_without_mentions'],
          'distribution': 'categorical'  
        },
        # Pad/truncate length used by create_dataset().
        'max_token_length': {
           'min': 4,
           'max': 80,
           'distribution': 'int_uniform'
        },
        # Checkpoint name handed to AutoTokenizer.from_pretrained().
        'model_name': {
          'values': ['bert-base-uncased'],
          'distribution': 'categorical'  
        },
        'hidden_act': {
          'values': ['relu', 'gelu', 'gelu_new', 'silu'],
          'distribution': 'categorical'  
        },
        'position_embedding_type': {
          'values': ['absolute', 'relative_key', 'relative_key_query'],
          'distribution': 'categorical'  
        },
        # NOTE(review): the float ranges below give only min/max and no explicit
        # 'distribution' — presumably wandb samples them uniformly; confirm
        # against the sweep configuration docs.
        'attention_dropout': {
            'min': 0.001,
            'max': 0.2
        },
        'transformer_dropout': {
            'min': 0.001,
            'max': 0.2
        },
        'classifier_dropout': {
            'min': 0.001,
            'max': 0.2
        },
        'layer_norm_eps': {
            'min': 1e-14,
            'max': 1e-10
        },
        # Batch size used for both loaders built by ret_dataloader().
        'batch_size': {
           'min': 2,
           'max': 80,
           'distribution': 'int_uniform'
        },
        # Learning rate given to AdamW in ret_optim().
        'learning_rate': {
            'min': 0.0000005,
            'max': 0.00025
        },
        # Upper bound on epochs per fold; early stopping may end a fold sooner.
        'epochs':{
           'min': 1,
           'max': 80,
           'distribution': 'int_uniform'
        }
    }
}


check_points = ['vinai/bertweet-covid19-base-uncased', 'roberta-base', 'bert-base-uncased']

In [9]:
# Default hyperparameters: passed as `config` to wandb.init() inside
# cross_validate(). The keys mirror the entries of sweep_config['parameters'].
sweep_defaults = {
    'data_version': 'cleaned_with_mentions',
    'max_token_length': 46,
    'model_name': 'bert-base-uncased',
    'hidden_act': 'gelu',
    'position_embedding_type': 'absolute',
    'layer_norm_eps': 5.4225686692811365e-11,
    'learning_rate': 0.000028734737822604655,
    'transformer_dropout': 0.03873251195245608,
    'attention_dropout': 0.015328152075297112,
    'classifier_dropout': 0.10850207289443518,
    'batch_size': 53,
    'epochs':25
}

In [10]:
# WANDB PARAMETER
def ret_dataloader(batch_size, train_dataset, validation_dataset):
    """Wrap the two datasets in DataLoaders sharing one batch size.

    Args:
        batch_size: number of samples per batch for both loaders.
        train_dataset: dataset served in random order.
        validation_dataset: dataset served in its stored order.

    Returns:
        ``(train_dataloader, validation_dataloader)``.
    """
    def _loader(dataset, sampler_cls):
        # The sampler class decides the iteration order over `dataset`.
        return DataLoader(dataset, sampler=sampler_cls(dataset), batch_size=batch_size)

    train_dataloader = _loader(train_dataset, RandomSampler)
    validation_dataloader = _loader(validation_dataset, SequentialSampler)
    return train_dataloader, validation_dataloader

def ret_optim(model, config):
    """Create the AdamW optimizer for ``model``.

    Args:
        model: module whose parameters are optimised.
        config: object exposing ``learning_rate``.

    Returns:
        ``torch.optim.AdamW`` over all model parameters with ``eps=1e-8``.
    """
    return torch.optim.AdamW(model.parameters(), lr=config.learning_rate, eps=1e-8)
    
def ret_scheduler(train_dataloader,optimizer, config):
    """Create a linear learning-rate schedule without warm-up.

    Args:
        train_dataloader: training loader; its length is the number of
            optimizer steps per epoch.
        optimizer: optimizer whose learning rate is scheduled.
        config: object exposing ``epochs``.

    Returns:
        The scheduler from ``get_linear_schedule_with_warmup`` spanning
        ``len(train_dataloader) * config.epochs`` training steps.
    """
    steps_per_epoch = len(train_dataloader)
    return get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,  # Default value in run_glue.py
        num_training_steps=steps_per_epoch * config.epochs,
    )

def format_time(elapsed):
    """Render a duration given in seconds as text.

    The value is rounded to a whole number of seconds and formatted by
    ``datetime.timedelta``, i.e. ``h:mm:ss`` (with a ``N day(s),`` prefix once
    the duration reaches 24 hours).
    """
    whole_seconds = int(round(elapsed))
    span = datetime.timedelta(seconds=whole_seconds)
    return str(span)

In [11]:
def create_dataloaders(fold_list, fold_index, tokenizer, config):
    """Build the train/validation loaders for one cross-validation fold.

    The fold at ``fold_index`` becomes the validation set; all remaining folds
    are concatenated into the training set. ``fold_list`` itself is not
    modified.

    Args:
        fold_list: list of DataFrames as produced by ``get_fold_list``.
        fold_index: position of the held-out fold.
        tokenizer: tokenizer forwarded to ``create_dataset``.
        config: object exposing ``max_token_length`` and ``batch_size``.

    Returns:
        ``(train_dataloader, validation_dataloader)``.
    """
    # Work on a shallow copy so popping the held-out fold leaves the caller's list intact.
    remaining_folds = list(fold_list)
    held_out_df = remaining_folds.pop(fold_index)
    training_df = pd.concat(remaining_folds)

    return ret_dataloader(
        config.batch_size,
        create_dataset(training_df, tokenizer, config),
        create_dataset(held_out_df, tokenizer, config),
    )

In [12]:

def get_metrics(probability, label_list):
    """Compute the classification metrics for one evaluation pass.

    Threshold-based metrics (accuracy, precision, recall, F1, MCC) are
    computed on hard predictions obtained with a 0.5 cut-off. Score-based
    metrics (average precision, ROC AUC and log loss) are computed on the
    probabilities themselves. Log loss in particular must not be fed the
    thresholded 0/1 predictions: that only measures the error rate scaled by a
    clipping constant.

    Args:
        probability: 1-D sequence (CPU tensor, array or list) of predicted
            positive-class probabilities.
        label_list: matching sequence of 0/1 ground-truth labels.

    Returns:
        A one-row DataFrame (index 0) with the columns declared in
        ``metric_types``. ``loss`` is initialised to 0 so the caller can fill
        in the averaged evaluation loss.
    """
    scores = np.asarray(probability, dtype=float)
    predictions = (scores > .5).astype(int)

    accuracy = accuracy_score(label_list, predictions)
    precision = precision_score(label_list, predictions, zero_division=0)
    recall = recall_score(label_list, predictions, zero_division=0)
    f1 = f1_score(label_list, predictions, zero_division=0)
    mcc = matthews_corrcoef(label_list, predictions)

    # Score-based metrics operate on the raw probabilities.
    cross_entropy = metrics.log_loss(label_list, scores)
    auc = roc_auc_score(label_list, scores)
    mAP = average_precision_score(label_list, scores)

    # Start from an empty, correctly typed frame so the column order and dtypes
    # always match `metric_types`.
    metric_df = pd.DataFrame(np.empty(0, dtype=metric_types))
    metric_df.loc[0] = [mAP, auc, accuracy, precision, recall, f1, mcc, cross_entropy, 0]

    return metric_df

In [13]:
def log_metrics(metric_df, prefix):
    """Send row 0 of a metrics frame to wandb in a single ``wandb.log`` call.

    Args:
        metric_df: frame with the metric columns, read at index label 0.
        prefix: string prepended to every logged key (e.g. ``'train_'``).

    Every column is logged under ``prefix + <column name>``, except ``loss``,
    which is logged as ``prefix + 'eval_loss'``.
    """
    # (column in metric_df, suffix of the logged key)
    column_to_suffix = (
        ('mAP', 'mAP'),
        ('auc', 'auc'),
        ('accuracy', 'accuracy'),
        ('precision', 'precision'),
        ('recall', 'recall'),
        ('f1', 'f1'),
        ('mcc', 'mcc'),
        ('log_loss', 'log_loss'),
        ('loss', 'eval_loss'),
    )
    payload = {
        prefix + suffix: metric_df.loc[0, column]
        for column, suffix in column_to_suffix
    }
    wandb.log(payload)

In [14]:
def evaluate_one_epoch(model, device, dataloader):
    """Run one evaluation pass over ``dataloader`` and return its metrics.

    The model is switched to evaluation mode (dropout layers behave
    differently there) and executed without gradient tracking. The loss is
    computed with the notebook-level ``loss_function``.

    Args:
        model: callable mapping a batch of input ids to probabilities.
        device: device the batch tensors are moved to.
        dataloader: yields batches whose first two entries are input ids and
            labels.

    Returns:
        The one-row DataFrame from ``get_metrics`` with its ``loss`` column set
        to the mean of the per-batch losses.
    """
    model.eval()

    total_eval_loss = 0
    # Per-batch outputs are collected and concatenated once after the loop
    # instead of re-allocating the full tensor/array on every batch.
    probability_chunks = []
    label_chunks = []

    for batch in dataloader:
        b_input_ids = batch[0].to(device)
        b_labels = batch[1].to(device)

        with torch.no_grad():
            probability = model(b_input_ids).flatten()
            loss = loss_function(probability, b_labels)

        total_eval_loss += loss.item()
        probability_chunks.append(probability.detach().cpu())
        label_chunks.append(batch[1])

    # The leading empty containers keep the result type identical for any number
    # of batches (float tensor / float64 array), including an empty loader.
    probability_list = torch.cat([torch.Tensor(0)] + probability_chunks, dim=0)
    label_list = np.concatenate([np.empty(0)] + label_chunks, axis=0)

    # Calculate the metrics and attach the averaged loss.
    metrics_df = get_metrics(probability_list, label_list)
    metrics_df.loc[0, 'loss'] = total_eval_loss / len(dataloader)

    return metrics_df

In [15]:
def train_one_epoch(model, device, dataloader, loss_function, optimizer, scheduler):
    """Train ``model`` for one pass over ``dataloader``.

    For every batch: reset gradients, compute the loss on the flattened model
    output, back-propagate, clip the gradient norm to 1.0, then step the
    optimizer and the scheduler.

    Args:
        model: module mapping a batch of input ids to probabilities.
        device: device the batch tensors are moved to.
        dataloader: yields batches whose first two entries are input ids and
            labels.
        loss_function: callable ``(probabilities, labels) -> loss tensor``.
        optimizer: optimizer stepped once per batch.
        scheduler: learning-rate scheduler stepped once per batch.

    Returns:
        Mean of the per-batch training losses as a Python float.
    """
    model.train()
    batch_losses = []

    for batch in dataloader:
        inputs, targets = batch[0].to(device), batch[1].to(device)

        model.zero_grad()
        loss = loss_function(model(inputs).flatten(), targets)
        batch_losses.append(loss.item())
        loss.backward()

        # Keep the update bounded before applying it.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

    return sum(batch_losses) / len(dataloader)

In [16]:

def cross_validate():
    """Run one wandb trial: stratified k-fold cross-validation per seed.

    Reads the hyperparameters from ``wandb.config`` (``sweep_defaults`` is
    supplied as the default config), trains a fresh ``TransformerClassifier``
    on every fold with early stopping on the validation mAP, and logs
    per-epoch metrics as well as the running average of the per-fold final
    metrics (``avg_train_*`` / ``avg_val_*``).

    The number of folds comes from the notebook-level ``fold_count`` so it
    stays in sync with the value stored on ``constants``.
    """
    # clean gpu memory in any case if previous wandb run was crashed.
    torch.cuda.empty_cache()
    wandb.init(config=sweep_defaults)
    epochs = wandb.config.epochs

    tokenizer = AutoTokenizer.from_pretrained(wandb.config.model_name, use_fast=False)

    # Only the training split is used for cross-validation.
    train_df, _ = get_data_from_file(wandb.config.data_version)

    run_train_metrics_list = []
    run_val_metrics_list = []

    for seed_index, seed in enumerate(seed_list):

        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        fold_list = get_fold_list(train_df, fold_count, random_state=seed)

        for fold_index in range(len(fold_list)):

            train_dataloader, validation_dataloader = create_dataloaders(fold_list, fold_index, tokenizer, wandb.config)
            model = custom_models.TransformerClassifier(wandb.config).to(device)

            optimizer = ret_optim(model, wandb.config)
            scheduler = ret_scheduler(train_dataloader, optimizer, wandb.config)

            # Object that checks the early stopping condition for this fold.
            stopper = utils.early_stopping.EarlyStopping(patience=patience)

            epoch_train_metrics_list = []
            epoch_val_metrics_list = []

            for epoch_i in range(0, epochs):

                # ========================================
                #               Training
                # ========================================
                print('======== Seed {:} - Fold {:} - Epoch {:} / {:} ========'.format(seed_index+1, fold_index+1, epoch_i + 1, epochs))

                epoch_train_loss = train_one_epoch(model, device, train_dataloader, loss_function, optimizer, scheduler)
                wandb.log({'train_loss_':epoch_train_loss})

                # ========================================
                #               Evaluation
                # ========================================
                epoch_train_metrics = evaluate_one_epoch(model, device, train_dataloader)
                log_metrics(epoch_train_metrics, 'train_')
                epoch_train_metrics_list.append(epoch_train_metrics)

                epoch_val_metrics = evaluate_one_epoch(model, device, validation_dataloader)
                log_metrics(epoch_val_metrics, 'val_')
                epoch_val_metrics_list.append(epoch_val_metrics)

                val_mAP = epoch_val_metrics['mAP'].loc[0]
                train_mAP = epoch_train_metrics['mAP'].loc[0]

                print("  Training mAP: {:.3f} - Validation mAP: {:.3f}".format(train_mAP,val_mAP ))

                if stopper.should_stop(val_mAP):
                    # Logged as a cumulative epoch counter across seeds and folds
                    # (each fold is budgeted `epochs` epochs).
                    wandb.log({'early_stopped_at': seed_index*len(fold_list)*epochs+fold_index*epochs+epoch_i + 1})
                    break

            # At the end of each fold, take the last epoch's metrics as the
            # final metrics of that fold.
            run_train_metrics_list.append(epoch_train_metrics_list[-1])
            run_val_metrics_list.append(epoch_val_metrics_list[-1])

            # After every fold, log the average over all folds finished so far
            # as the current metrics of the run.
            run_train_metrics = pd.concat(run_train_metrics_list)
            run_val_metrics = pd.concat(run_val_metrics_list)

            log_metrics(pd.DataFrame([run_train_metrics.mean()]), 'avg_train_')
            log_metrics(pd.DataFrame([run_val_metrics.mean()]), 'avg_val_')

            # del model
            # torch.cuda.empty_cache()

    # print("")
    # print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-run_start_time)))



In [None]:
# cross_validate()

In [17]:
project="Transformers_For_ClaimWorthiness"
entity="cemulu"

In [18]:
# sweep_id = wandb.sweep(sweep_config, project=project)
sweep_id = 'nbovtee3'

In [None]:
wandb.agent(sweep_id, project=project,function=cross_validate, count=1)

### Training Model with best config and whole training data

In [78]:
api = wandb.Api()
# best_sweep = 'nbovtee3' #bert
# best_sweep = 'embywnlj' #roberta
best_sweep = '2afv0m0i' #bertweet
# Build the "<entity>/<project>/<sweep id>" path from the variables defined
# above instead of repeating their values in a string literal.
sweep = api.sweep("{}/{}/{}".format(entity, project, best_sweep))

In [79]:
best_run = sweep.best_run()
best_run.summary.get("avg_val_mAP")

[34m[1mwandb[0m: Sorting runs by -summary_metrics.avg_val_mAP


0.7651173954688729

In [19]:
worthiness_checker = utils.worthiness_checker.WorthinessChecker(best_run, constants)

Epoch configuration of the best run:
36
Early stopped at:
                   36    66    123    156    192    240
fold_index         1.0   2.0   3.0    4.0    5.0    6.0
cumulative_epoch  12.0  45.0  90.0  118.0  155.0  195.0
epoch_of_fold     12.0   9.0  18.0   10.0   11.0   15.0

Average epoch used as a reference for early stopping:  8


In [None]:
# optimized_model = worthiness_checker.train_full_model()

# model_file_name = 'bert-base-uncased_0.7332254639950794.pt' # bert
# model_file_name = 'roberta-base_0.7551132534277406.pt' # roberta
model_file_name = 'vinai_bertweet-covid19-base-uncased_0.7651173954688729.pt' # bertweet
PATH = os.path.join(parent_dir, 'Model', model_file_name)
worthiness_checker.load_model(PATH)

In [None]:
model_name = worthiness_checker.config.model_name.replace('/','_')
mAP = best_run.summary.get("avg_val_mAP")

PATH = os.path.join(parent_dir, 'Model','{}_{}.pt'.format(model_name, mAP))
# torch.save(optimized_model.state_dict(), PATH)

### Single Tests

In [None]:
tweet = "UK Health Minister Nadine Dorries has tested positive for COVID-19."

In [None]:
tweet = "i am positive"

In [None]:
tweet = "sheep is black"

In [None]:
tweet = "Recent research suggests that 15 percent of abortions are the result of coercion."

In [None]:
tweet = '''A Democratic bill negotiated between Sens. Joe Manchin and Chuck Schumer would "increase taxes on millions of Americans across every income bracket."'''


In [None]:
tweet = '''Nancy Pelosi and Democrats "want to turn 150 million Americans into felons overnight" with HR 1808.'''

In [None]:
tweet = '''John Fetterman wants to “eliminate life sentences for murderers."'''

In [None]:
tweet = '''"The Sun is out of place, the Moon is out of place and the stars are out of place. The compasses are off" because of a shift in the Earth’s poles.'''

In [None]:
tweet = "China threatens to shoot Nancy Pelosi’s plane down if she visits Taiwan."

In [None]:
tweet = "In Virginia, we actually do protect same-sex marriage."

In [None]:
probability = worthiness_checker.predict(tweet)

In [None]:
probability

### Batch Testing

In [84]:
# _, test_df = worthiness_checker.get_data_from_file(worthiness_checker.config.data_version)
_, test_df = worthiness_checker.get_data_from_file()

In [85]:
test_dataset = worthiness_checker.create_dataset(test_df)
# Only an evaluation loader is needed, so build it directly (same sampler and
# batch size ret_dataloader uses for its validation loader) rather than passing
# the throwaway `_` variable as a fake training dataset.
test_dataloader = DataLoader(
    test_dataset,
    sampler=SequentialSampler(test_dataset),
    batch_size=worthiness_checker.config.batch_size,
)

In [86]:
def evaluate_one_batch(batch, self):
    """Score one batch with a checker's model, without tracking gradients.

    Args:
        batch: sequence whose first two entries are input ids and labels.
        self: object exposing ``model`` and ``constants`` (with ``device`` and
            ``loss_function``); the notebook passes a WorthinessChecker.

    Returns:
        ``(probability, loss)``: the flattened model output moved to the CPU,
        and the loss tensor for the batch.
    """
    # NOTE(review): the model's train/eval mode is not changed here — confirm
    # the checker has already put the model in eval mode before calling this.
    target_device = self.constants.device
    inputs = batch[0].to(target_device)
    targets = batch[1].to(target_device)

    with torch.no_grad():
        scores = self.model(inputs).flatten()
        batch_loss = self.constants.loss_function(scores, targets)

    return scores.detach().cpu(), batch_loss

In [87]:
probability_list = torch.Tensor(0)
for batch in test_dataloader:
    probability, _ = evaluate_one_batch( batch, worthiness_checker)
    probability_list = torch.cat((probability_list, probability), axis=0)


In [88]:
len(probability_list)

140

In [89]:
predictions =  [int(i > .5) for i in probability_list]


In [90]:
len(predictions)

140

In [92]:
# model_prefix = 'bert_'
# model_prefix = 'roberta_'
# model_prefix = 'bertweet_'
model_prefix = 'bertweet_filtered_'

In [93]:
# eval_df = test_df.copy()
eval_df = pd.read_csv(os.path.join(data_dir,"eval_df.csv"))

In [94]:
eval_df[model_prefix + 'predictions'] = predictions
# Store plain floats: convert the CPU tensor to a NumPy array first (as done for
# the cross-validated predictions further down) so the column never ends up
# holding tensor objects.
eval_df[model_prefix + 'probability'] = probability_list.numpy()

In [None]:
eval_df.insert(0, 'tweet_url', test_df["tweet_url"])

In [None]:
eval_df.insert(0, 'tweet_id', test_df["tweet_id"])

In [98]:
eval_df.to_csv("eval_df.csv", encoding='utf-8', index=False)

False Positives:

* (output = 0.986627) the number of COVID-19 cases in the US surpasses 1,000 with 1,004 people in 37 states and DC testing positive for COVID-19, plus 31 deaths. This is just beginning the acceleration phase of the Cor...

* (output = 0.987125) Italy's Prime Minister Giuseppe Conte has announced that the whole of the country is being put on lockdown in an attempt to contain the COVID-19 outbreak. For the latest on COVID-19, click here:

***

* (output = 0.986261) The empire is striking back. The COVID-19 is now being used as a weapon to destabilize the US economy because that the powers that be feel that’s the only way they can get rid of Trump and regain ...

* (output = 0.857124) As two epidemics - COVID-19 and Brexit - hit us, don’t let them make you forget: Priti Patel scandal and investigations of Johnson’s Arcuri Russia report referendum crimes lies incompetence etc et...

False Negatives

* (output = 0.006674) Democrats and the Media need to stop using the COVID-19 to politicize things and scare people. It's irresponsible. This is not the time to try and gain political points or headlines from scaring p...

* (output = 0.006982) คำขวัญ Thailand 2020 No privacy No security No democracy No hope No future No mask But we had COVID-19 and stupid government Thank you ธนาธร ไม่เอารัฐประหาร

***

* (output = 0.006344) Italian doctor facing COVID-19 tsunami publishes long, moving thread that culminates with, ‘Is panic really worse than neglect and carelessness during an epidemic of this sort?’ Read the whole fri...

* (output = 0.006693) This thread needs to fly. It shows how the legacy media is USING COVID-19 as a political weapon and even how the SAME reporters are contradicting themselves. This. Is. SICK.

Omitted Link: THREAD: Fri Jan 31, 2020, a few weeks before #Coronavirus has officially spread to other countries (which led to the bad stock market week Feb 24-Feb 28), the Trump Admin announced travel restrictions on China. Here is some of the reporting it generated. Take Politico of 2/4/20.

Tweet: ""private ny colleges: *closed college for the week because of the COVID-19* CUNY: *installed two new hand sanitizer dispensers*""
Label: 0
Prediction: 1 with 98.75% probability

Zero-shot GPT-3 response: "The sentiment of this tweet is that private colleges are not doing enough to prevent the spread of COVID-19, while CUNY is taking steps to protect its students. This is not a check-worthy claim."

In [None]:
test_df[test_df['predictions'] != test_df['check_worthiness']]

### Cross-validated predictions for labeling error detection

In [None]:
def cross_validated_predictions(config, k_fold, seed):
    """Produce out-of-fold predictions for every training sample.

    The training split is divided into ``k_fold`` stratified folds. For each
    fold a fresh ``TransformerClassifier`` is trained on the other folds for
    the full ``config.epochs`` epochs (no early stopping) and then used to
    score the held-out fold.

    Args:
        config: object exposing ``epochs``, ``model_name``, ``data_version``
            and the settings read by the dataset/optimizer helpers.
        k_fold: number of folds.
        seed: seed for ``random``, NumPy, torch and the fold split.

    Returns:
        ``(index_list, probability_list)``: an int32 NumPy array with the
        DataFrame index label of each scored sample, and a float tensor with
        the matching predicted probabilities, in the order the folds were
        evaluated.
    """
    epochs = config.epochs
    tokenizer = AutoTokenizer.from_pretrained(config.model_name, use_fast=False)

    # Only the training split is scored here.
    train_df, _ = get_data_from_file(config.data_version)

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    fold_list = get_fold_list(train_df, k_fold, random_state=seed)

    # Per-batch results are collected and concatenated once at the end.
    probability_chunks = []
    index_chunks = []

    for fold_index in range(len(fold_list)):

        train_dataloader, validation_dataloader = create_dataloaders(fold_list, fold_index, tokenizer, config)
        model = custom_models.TransformerClassifier(config).to(device)

        optimizer = ret_optim(model, config)
        scheduler = ret_scheduler(train_dataloader, optimizer, config)

        # Training
        for epoch_i in range(0, epochs):
            print('======== Fold {:} - Epoch {:} / {:} ========'.format(fold_index+1, epoch_i + 1, epochs))
            epoch_train_loss = train_one_epoch(model, device, train_dataloader, loss_function, optimizer, scheduler)
            print("  Average training loss: {0:.2f}".format(epoch_train_loss))

        # Evaluation on the held-out fold
        model.eval()
        total_eval_loss = 0

        for batch in validation_dataloader:
            b_input_ids = batch[0].to(device)
            b_labels = batch[1].to(device)

            with torch.no_grad():
                probability = model(b_input_ids).flatten()
                loss = loss_function(probability, b_labels)

            total_eval_loss += loss.item()
            probability_chunks.append(probability.detach().cpu())
            # batch[2] holds the DataFrame index labels stored by create_dataset.
            index_chunks.append(batch[2])

        avg_eval_loss = total_eval_loss / len(validation_dataloader)

        print(" Validation loss: {:.3f}".format(avg_eval_loss))

    # The leading empty containers fix the result types (float tensor / int32
    # array) regardless of how many batches were collected.
    probability_list = torch.cat([torch.Tensor(0)] + probability_chunks, dim=0)
    index_list = np.concatenate([np.empty(0, dtype=np.int32)] + index_chunks, axis=0)

    return index_list, probability_list

In [None]:
index_list, probability_list = cross_validated_predictions(worthiness_checker.config, 10, 42)

In [66]:
worthiness_checker.config.data_version

'cleaned_without_mentions'

In [54]:
predictions =  [int(i > .5) for i in probability_list]

In [56]:
prob_df = pd.DataFrame({'bertweet_predictions': predictions, 'bertweet_probability':probability_list.numpy()}, index_list)

In [58]:
ordered_df = prob_df.sort_index()

In [67]:
train_df = pd.read_csv(os.path.join(parent_dir, 'Data','train_english_cleaned_without_mentions.tsv'), delimiter='\t')

In [74]:
predicted_train_df = pd.concat([train_df, ordered_df], axis=1)

In [76]:
predicted_train_df.to_csv("train_predictions_df.csv", encoding='utf-8', index=False)

In [75]:
predicted_train_df

Unnamed: 0,tweet_text,check_worthiness,bertweet_predictions,bertweet_probability
0,Since this will never get reported by the medi...,1,1,0.635311
1,"Thanks, MichaelBloomberg. Here’s a handy littl...",0,0,0.010500
2,"Folks, when you say ""The COVID-19 isn't a big ...",0,0,0.024375
3,Just 1 case of COVID-19 in India and people ar...,0,1,0.849137
4,President made a commitment to donate his sala...,1,0,0.256311
...,...,...,...,...
817,Stop spreading fake news COVID-19,0,0,0.102101
818,It's fake! It's fake!' shout residents of a co...,1,1,0.983913
819,Be Smart about COVID-19: 1⃣ follow accurate pu...,0,0,0.006635
820,"On the left: , a Qatari puppet, attacks Saudi ...",1,1,0.988063


### Model Training with Filtered Data

In [80]:
worthiness_checker = utils.worthiness_checker.WorthinessChecker(best_run, constants)

Epoch configuration of the best run:
36
Early stopped at:
                   36    66    123    156    192    240
fold_index         1.0   2.0   3.0    4.0    5.0    6.0
cumulative_epoch  12.0  45.0  90.0  118.0  155.0  195.0
epoch_of_fold     12.0   9.0  18.0   10.0   11.0   15.0

Average epoch used as a reference for early stopping:  8


In [81]:
# Setting configuration to use filtered data
worthiness_checker.config.data_version = "filtered"

In [83]:
worthiness_checker.train_full_model()

  Training mAP: 0.843 - Test mAP: 0.570
  Training mAP: 0.946 - Test mAP: 0.703
  Training mAP: 0.952 - Test mAP: 0.675
  Training mAP: 0.993 - Test mAP: 0.704
  Training mAP: 1.000 - Test mAP: 0.726
  Training mAP: 1.000 - Test mAP: 0.725
  Training mAP: 1.000 - Test mAP: 0.726
  Training mAP: 1.000 - Test mAP: 0.726
*** Training Metrics ***
        mAP       auc  accuracy  precision    recall        f1       mcc  \
0  0.842641  0.880390  0.852239   0.861635  0.640187  0.734584  0.648722   
0  0.946384  0.966818  0.916418   0.865741  0.873832  0.869767  0.808250   
0  0.951621  0.971358  0.904478   0.962963  0.728972  0.829787  0.779471   
0  0.993373  0.996874  0.970149   0.957547  0.948598  0.953052  0.931193   
0  0.999578  0.999805  0.994030   0.990654  0.990654  0.990654  0.986268   
0  0.999724  0.999867  0.992537   0.986047  0.990654  0.988345  0.982862   
0  1.000000  1.000000  1.000000   1.000000  1.000000  1.000000  1.000000   
0  1.000000  1.000000  1.000000   1.000000  1.0

TransformerClassifier(
  (transformer): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(130, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=3.5894066339034747e-11, elementwise_affine=True)
      (dropout): Dropout(p=0.03312463661668133, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0701542558710376, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_featu

### Creating Contextual Embeddings

In [None]:
embeddings_list = []
# Embed each row's own text. The previous loop passed the notebook-level `tweet`
# variable from the single-test cells, so every row received the same embedding.
# NOTE(review): assumes eval_df carries the text in `tweet_text`, the column
# create_dataset() reads — confirm against the saved eval_df.csv.
for tweet_text in eval_df["tweet_text"]:
    tweet_embedding = worthiness_checker.get_embedding(tweet_text)
    embeddings_list.append(tweet_embedding.detach().cpu().numpy().tolist())

In [None]:
eval_df["embeddings"] = embeddings_list

In [None]:
eval_df.to_csv("eval_df.csv", index=False)