## In this notebook, we attempted to finetune two large language models (miniLM, RoBERTa) to classify transcript chunks as right wing or left wing. 

- Author: Bowen Yi

In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID" 
device = 1
os.environ["CUDA_VISIBLE_DEVICES"]= str(device)

In [2]:
import pandas as pd
import os
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from collections import Counter

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

In [2]:
df = pd.read_csv("07_Data_annotated_transcript.csv")

In [3]:
df.head(1)

Unnamed: 0,transcript_to_chunk,path,url,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,cat10,Annotator Assigned,Human Label,transcript_to_annotate
0,['But I am not going to sit here and do that. ...,/api.substack.com/ef/httpsapi.substack.comfeed...,https://api.substack.com/feed/podcast/80790471...,news,politics,commentary,,,,,,,,Alan,0,But I am not going to sit here and do that. An...


In [7]:
df[df['Human Label']==1].shape

(407, 16)

In [8]:
df[df['Human Label']==0].shape

(506, 16)

In [15]:
df[df['transcript_to_chunk'].isna()].shape

(34, 16)

In [17]:
df[~df['transcript_to_chunk'].isna()].shape

(879, 16)

In [18]:
df = df[~df['transcript_to_chunk'].isna()]

In [19]:
df.shape

(879, 16)

## 1. Split annotated transcripts to train, dev, and test sets (7:1:2)

In [20]:
df_train, df_dev_test = train_test_split(df, test_size=0.3, random_state=1)

In [21]:
df_dev, df_test = train_test_split(df_dev_test, test_size=2/3, random_state=1)

## 2. Chunk transcript

In [22]:
def chunk_transcript(text, chunk_size=100, max_size=256):
    chunks = []
    current_chunk = ''
    current_words_count = 0 

    for sentence in text:
        words_in_sentence = len(sentence.split())
        
        if current_words_count + words_in_sentence > chunk_size:
            chunks.append(current_chunk)
            current_chunk = sentence
            current_words_count = words_in_sentence
        else:
            current_chunk += sentence
            current_words_count += words_in_sentence

    if current_chunk.strip():
        if chunks and len(chunks[-1].split()) + len(current_chunk.split()) <= max_size:
               chunks[-1] += '' + current_chunk
        else:
               chunks.append(current_chunk.strip())

    return chunks



In [23]:
df.columns

Index(['transcript_to_chunk', 'path', 'url', 'cat1', 'cat2', 'cat3', 'cat4',
       'cat5', 'cat6', 'cat7', 'cat8', 'cat9', 'cat10', 'Annotator Assigned',
       'Human Label', 'transcript_to_annotate'],
      dtype='object')

### 2.1 Chunk train set

In [24]:
train_texts = []
train_labels = []
train_paths = []
train_chunk_ind = []
train_cat1 = []
train_cat2 = []
train_cat3 = []
train_cat4 = []
train_cat5 = []
train_cat6 = []
train_cat7 = []
train_cat8 = []
train_cat9 = []
train_cat10 = []

for _, row in df_train.iterrows():
    chunks = chunk_transcript(row['transcript_to_chunk'])
    chunk_count = len(chunks)
    
    if chunks:
        train_texts.extend(chunks)
        train_labels.extend([row["Human Label"]] * chunk_count)
        train_paths.extend([row["path"]] * chunk_count)

        
        chunk_ind = list(range(1, chunk_count+1))
        train_chunk_ind.extend(chunk_ind)

        train_cat1.extend([row["cat1"]] * chunk_count)
        train_cat2.extend([row["cat2"]] * chunk_count)
        train_cat3.extend([row["cat3"]] * chunk_count)
        train_cat4.extend([row["cat4"]] * chunk_count)
        train_cat5.extend([row["cat5"]] * chunk_count)
        train_cat6.extend([row["cat6"]] * chunk_count)
        train_cat7.extend([row["cat7"]] * chunk_count)
        train_cat8.extend([row["cat8"]] * chunk_count)
        train_cat9.extend([row["cat9"]] * chunk_count)
        train_cat10.extend([row["cat10"]] * chunk_count)
        

In [25]:
train_data = {
    'text':train_texts,
    'label':train_labels,
    'path':train_paths,
    'chunk_pos':train_chunk_ind,
    'cat1':train_cat1,
    'cat2':train_cat2,
    'cat3':train_cat3,
    'cat4':train_cat4,
    'cat5':train_cat5,
    'cat6':train_cat6,
    'cat7':train_cat7,
    'cat8':train_cat8,
    'cat9':train_cat9,
    'cat10':train_cat10,
}

df_chunks_train = pd.DataFrame(train_data)
df_chunks_train = df_chunks_train[df_chunks_train['text'].str.strip() != '']
df_chunks_train = df_chunks_train.reset_index(drop=True)
df_chunks_train = df_chunks_train.sample(frac=1, random_state=1).reset_index(drop=True)



In [31]:
df_chunks_train.shape

(124553, 14)

### 2.2 Chunk dev

In [27]:
dev_texts = []
dev_labels = []
dev_paths = []
dev_chunk_ind = []
dev_cat1 = []
dev_cat2 = []
dev_cat3 = []
dev_cat4 = []
dev_cat5 = []
dev_cat6 = []
dev_cat7 = []
dev_cat8 = []
dev_cat9 = []
dev_cat10 = []

for _, row in df_dev.iterrows():
    chunks = chunk_transcript(row['transcript_to_chunk'])
    chunk_count = len(chunks)
    
    if chunks:
        dev_texts.extend(chunks)
        dev_labels.extend([row["Human Label"]] * chunk_count)
        dev_paths.extend([row["path"]] * chunk_count)
        
        chunk_ind = list(range(1, chunk_count+1))
        dev_chunk_ind.extend(chunk_ind)

        dev_cat1.extend([row["cat1"]] * chunk_count)
        dev_cat2.extend([row["cat2"]] * chunk_count)
        dev_cat3.extend([row["cat3"]] * chunk_count)
        dev_cat4.extend([row["cat4"]] * chunk_count)
        dev_cat5.extend([row["cat5"]] * chunk_count)
        dev_cat6.extend([row["cat6"]] * chunk_count)
        dev_cat7.extend([row["cat7"]] * chunk_count)
        dev_cat8.extend([row["cat8"]] * chunk_count)
        dev_cat9.extend([row["cat9"]] * chunk_count)
        dev_cat10.extend([row["cat10"]] * chunk_count)


In [29]:
dev_data = {
    'text':dev_texts,
    'label':dev_labels,
    'path':dev_paths,
    'chunk_pos':dev_chunk_ind,
    'cat1':dev_cat1,
    'cat2':dev_cat2,
    'cat3':dev_cat3,
    'cat4':dev_cat4,
    'cat5':dev_cat5,
    'cat6':dev_cat6,
    'cat7':dev_cat7,
    'cat8':dev_cat8,
    'cat9':dev_cat9,
    'cat10':dev_cat10,
}

df_chunks_dev = pd.DataFrame(dev_data)
df_chunks_dev = df_chunks_dev[df_chunks_dev['text'].str.strip() != '']
df_chunks_dev = df_chunks_dev.reset_index(drop=True)
df_chunks_dev = df_chunks_dev.sample(frac=1, random_state=1).reset_index(drop=True)



In [30]:
df_chunks_dev.shape

(16789, 14)

### 2.3 Chunk test

In [33]:
test_texts = []
test_labels = []
test_paths = []
test_chunk_ind = []
test_cat1 = []
test_cat2 = []
test_cat3 = []
test_cat4 = []
test_cat5 = []
test_cat6 = []
test_cat7 = []
test_cat8 = []
test_cat9 = []
test_cat10 = []

for _, row in df_test.iterrows():
    chunks = chunk_transcript(row['transcript_to_chunk'])
    chunk_count = len(chunks)
    
    if chunks:
        test_texts.extend(chunks)
        test_labels.extend([row["Human Label"]] * chunk_count)
        test_paths.extend([row["path"]] * chunk_count)
        
        chunk_ind = list(range(1, chunk_count+1))
        test_chunk_ind.extend(chunk_ind)

        test_cat1.extend([row["cat1"]] * chunk_count)
        test_cat2.extend([row["cat2"]] * chunk_count)
        test_cat3.extend([row["cat3"]] * chunk_count)
        test_cat4.extend([row["cat4"]] * chunk_count)
        test_cat5.extend([row["cat5"]] * chunk_count)
        test_cat6.extend([row["cat6"]] * chunk_count)
        test_cat7.extend([row["cat7"]] * chunk_count)
        test_cat8.extend([row["cat8"]] * chunk_count)
        test_cat9.extend([row["cat9"]] * chunk_count)
        test_cat10.extend([row["cat10"]] * chunk_count)


In [34]:
test_data = {
    'text':test_texts,
    'label':test_labels,
    'path':test_paths,
    'chunk_pos':test_chunk_ind,
    'cat1':test_cat1,
    'cat2':test_cat2,
    'cat3':test_cat3,
    'cat4':test_cat4,
    'cat5':test_cat5,
    'cat6':test_cat6,
    'cat7':test_cat7,
    'cat8':test_cat8,
    'cat9':test_cat9,
    'cat10':test_cat10,
}

df_chunks_test = pd.DataFrame(test_data)
df_chunks_test = df_chunks_test[df_chunks_test['text'].str.strip() != '']
df_chunks_test = df_chunks_test.reset_index(drop=True)
df_chunks_test = df_chunks_test.sample(frac=1, random_state=1).reset_index(drop=True)



In [35]:
df_chunks_test.shape

(33690, 14)

In [48]:
df_chunks_test.to_csv("df_chunks_test.csv", index=False)
df_chunks_dev.to_csv("df_chunks_dev.csv", index=False)
df_chunks_train.to_csv('df_chunks_train.csv', index=False)

In [4]:
# df_chunks_train = pd.read_csv('df_chunks_train.csv')
# df_chunks_dev = pd.read_csv('df_chunks_dev.csv')
# df_chunks_test = pd.read_csv('df_chunks_test.csv')

# 3. Fine tune a model on labeled chunks

In [3]:
from datasets import Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import TrainingArguments, Trainer, set_seed 
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, accuracy_score


In [4]:
ds_train = Dataset.from_pandas(df_chunks_train[['text', 'label']])
ds_dev = Dataset.from_pandas(df_chunks_dev[['text', 'label']])
ds_test = Dataset.from_pandas(df_chunks_test[['text', 'label']])


In [5]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/MiniLM-L12-H384-uncased")
def tokenize_function(examples):
    return tokenizer(examples["text"], max_length=512, padding="max_length", truncation=True)
ds_train = ds_train.map(tokenize_function, batched=True, batch_size=512)
ds_dev = ds_dev.map(tokenize_function, batched=True, batch_size=512)
ds_test = ds_test.map(tokenize_function, batched=True, batch_size=512)


Map:   0%|          | 0/124553 [00:00<?, ? examples/s]

Map:   0%|          | 0/16789 [00:00<?, ? examples/s]

Map:   0%|          | 0/33690 [00:00<?, ? examples/s]

In [6]:
from transformers.integrations import WandbCallback
os.environ["WANDB_PROJECT"]="miniLM_politics"
os.environ["WANDB_LOG_MODEL"] = "checkpoint"

model = AutoModelForSequenceClassification.from_pretrained("microsoft/MiniLM-L12-H384-uncased", num_labels=2)

output_dir = "/shared/3/projects/bowenyi/where-is-news/595_486_Final_Project/model_output_miniLM"   
seed = 1

  return self.fget.__get__(instance, owner)()
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    conf_matrix = confusion_matrix(y_true=labels, y_pred=predictions)
    TN, FP, FN, TP = conf_matrix.ravel()
    
    TPR = TP / (TP + FN) if (TP + FN) > 0 else 0.0
    FPR = FP / (FP + TN) if (FP + TN) > 0 else 0.0
    TNR = TN / (TN + FP) if (TN + FP) > 0 else 0.0
    FNR = FN / (TP + FN) if (TP + FN) > 0 else 0.0

    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
    accuracy = accuracy_score(labels, predictions)
    
    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'true_negative_rate': TNR,
        'false_positive_rate': FPR,
        'false_negative_rate': FNR,
        'true_positive_rate': TPR
    }


In [9]:
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    num_train_epochs=8,  
    weight_decay=0.01,
    do_train=True,
    do_eval=True,
    seed=seed,
    save_strategy='steps',
    save_steps=500,
    evaluation_strategy='steps',
    eval_steps=500,
    logging_dir=output_dir + 'logs/',
    logging_strategy='steps',
    logging_steps=500,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    run_name='podcasts-study' + str(seed),
    report_to="wandb"
)


In [10]:
trainer = Trainer(
    model=model,    
    args=training_args,
    train_dataset=ds_train,
    eval_dataset=ds_dev,    
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [None]:
trainer.add_callback(WandbCallback())
trainer.train()


You are adding a <class 'transformers.integrations.integration_utils.WandbCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
WandbCallback
NotebookProgressCallback
    There is an imbalance between your GPUs. You may want to exclude GPU 2 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
[34m[1mwandb[0m: Currently logged in as: [33mbowenyi[0m ([33mblablablab-nlp[0m). Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,True Negative Rate,False Positive Rate,False Negative Rate,True Positive Rate
500,0.6804,0.704762,0.504378,0.46444,0.47287,0.456305,0.547174,0.452826,0.543695,0.456305
1000,0.6632,0.718138,0.513134,0.410415,0.477589,0.359808,0.649628,0.350372,0.640192,0.359808


    There is an imbalance between your GPUs. You may want to exclude GPU 2 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
[34m[1mwandb[0m: Adding directory to artifact (/shared/3/projects/bowenyi/where-is-news/595_486_Final_Project/model_output_miniLM/checkpoint-500)... Done. 7.3s
[34m[1mwandb[0m: Adding directory to artifact (/shared/3/projects/bowenyi/where-is-news/595_486_Final_Project/model_output_miniLM/checkpoint-500)... Done. 13.7s
    There is an imbalance between your GPUs. You may want to exclude GPU 2 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
[34m[1mwandb[0m: Adding directory to artifact (/shared/3/projects/bowenyi/where-is-news/595_486_Final_Project/model_output_miniL

In [12]:
test_result = trainer.evaluate(ds_test)
print(test_result)
wandb.finish()


    There is an imbalance between your GPUs. You may want to exclude GPU 2 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


{'eval_loss': 0.8330050110816956, 'eval_accuracy': 0.5636390620362125, 'eval_f1': 0.450532610726967, 'eval_precision': 0.39913907284768213, 'eval_recall': 0.5171171171171172, 'eval_true_negative_rate': 0.5882459723167688, 'eval_false_positive_rate': 0.4117540276832312, 'eval_false_negative_rate': 0.4828828828828829, 'eval_true_positive_rate': 0.5171171171171172, 'eval_runtime': 105.1079, 'eval_samples_per_second': 320.528, 'eval_steps_per_second': 1.674, 'epoch': 8.0}


NameError: name 'wandb' is not defined

### We noticed that training model on transcript chunk isn't good enough. The highest F-1 score is not even better than chance, which indicates a weak training signal. Considering that we annotate instances at transcript level but train and evaluate at chunk level, the model performance makes sense. 

## 4. Train miniLM with Reddit data and transcript chunks
- Reddit data are better labeled than transcript chunks. We hope this can make our training signal stronger. 
- Reddit data source: https://www.kaggle.com/datasets/neelgajare/liberals-vs-conservatives-on-reddit-13000-posts

In [5]:
df_red = pd.read_csv('08_reddit.csv')

In [6]:
df_red.shape

(12854, 9)

In [8]:
df_red.head(1)

Unnamed: 0,Title,Political Lean,Score,Id,Subreddit,URL,Num of Comments,Text,Date Created
0,"No matter who someone is, how they look like, what language they speak, what they wear, remember the human. For the sake of humanity, the working class can and must unite across all arbitrary boundaries.",Liberal,1,t5fybt,socialism,https://v.redd.it/ng5fyl7hp2l81,0,,1646272000.0


### This is the best external dataset we can find to augment our weakly labeled dataset. We treat Liberal as left wing (0), and Conservative as right wing (1). Although Liberal isn't equal to leftist, we manually examined the dataset and found their definition is pretty close (in this dataset). 

In [9]:
df_red.columns

Index(['Title', 'Political Lean', 'Score', 'Id', 'Subreddit', 'URL',
       'Num of Comments', 'Text', 'Date Created'],
      dtype='object')

In [10]:
red_text = df_red.Title.to_list()

In [11]:
red_label = df_red['Political Lean'].to_list()

In [12]:
red_labels = [1 if lean == 'Conservative' else 0 for lean in red_label]

In [13]:
del df_red

In [14]:
df_red = pd.DataFrame({'text':red_text, 'label':red_labels})

In [15]:
df_red.head(1)

Unnamed: 0,text,label
0,"No matter who someone is, how they look like, what language they speak, what they wear, remember the human. For the sake of humanity, the working class can and must unite across all arbitrary boundaries.",0


In [16]:
df_red.shape

(12854, 2)

## To further balance the impact of weakly labeled chunks, we downsample the dataset of chunks such that it has around the same amount as Reddit Data. Also, we downsampled the number of left-wing chunks to make the class more balanced 

In [31]:
df_chunks = pd.concat([df_chunks_train, df_chunks_dev, df_chunks_test])


In [32]:
df_chunks.shape

(175032, 14)

In [33]:
df_chunks = df_chunks.sample(frac=0.15, random_state=1).reset_index(drop=True)

In [34]:
df_chunks.shape

(26255, 14)

In [35]:
df_chunks = pd.concat([df_chunks, df_red])

In [36]:
df_chunks[df_chunks['label']==1].shape

(15905, 14)

In [37]:
df_chunks[df_chunks['label']==0].shape

(23204, 14)

In [38]:
df_left = df_chunks[df_chunks['label']==0].sample(n=16000, random_state=1).reset_index(drop=True)


In [40]:
df_left.shape

(16000, 14)

In [39]:
df_right = df_chunks[df_chunks['label']==1].reset_index(drop=True)


In [41]:
df_right.shape

(15905, 14)

In [42]:
df_chunks = pd.concat([df_left, df_right])

In [43]:
df_chunks.shape

(31905, 14)

In [44]:
df_train, df_dev_test = train_test_split(df_chunks, test_size=0.3, random_state=1)

In [45]:
df_dev, df_test = train_test_split(df_dev_test, test_size=2/3, random_state=1)

## 4.1 Finetune a model on downsampled labeled chunks and all Reddit data
- We tried a more powerful model: RoBERTa-base

In [46]:
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification, AutoConfig


In [47]:
ds_train = Dataset.from_pandas(df_train[['text', 'label']])
ds_dev = Dataset.from_pandas(df_dev[['text', 'label']])
ds_test = Dataset.from_pandas(df_test[['text', 'label']])


In [48]:
model_id = "roberta-base"
tokenizer = RobertaTokenizerFast.from_pretrained(model_id)

def tokenize_function(examples):
    return tokenizer(examples["text"], max_length=256, padding="max_length", truncation=True)
ds_train = ds_train.map(tokenize_function, batched=True, batch_size=512)
ds_dev = ds_dev.map(tokenize_function, batched=True, batch_size=512)
ds_test = ds_test.map(tokenize_function, batched=True, batch_size=512)

Map:   0%|          | 0/22333 [00:00<?, ? examples/s]

Map:   0%|          | 0/3190 [00:00<?, ? examples/s]

Map:   0%|          | 0/6382 [00:00<?, ? examples/s]

In [52]:
model = RobertaForSequenceClassification.from_pretrained(model_id, num_labels=2)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [49]:
import wandb
wandb.login()
from transformers.integrations import WandbCallback
os.environ["WANDB_PROJECT"]="roberta_base_politics"
os.environ["WANDB_LOG_MODEL"] = "checkpoint"

output_dir = "/shared/3/projects/bowenyi/where-is-news/595_486_Final_Project/model_output_roberta_base"   
seed = 1


[34m[1mwandb[0m: Currently logged in as: [33mbowenyi[0m ([33mblablablab-nlp[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [50]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    conf_matrix = confusion_matrix(y_true=labels, y_pred=predictions)
    TN, FP, FN, TP = conf_matrix.ravel()
    
    TPR = TP / (TP + FN) if (TP + FN) > 0 else 0.0
    FPR = FP / (FP + TN) if (FP + TN) > 0 else 0.0
    TNR = TN / (TN + FP) if (TN + FP) > 0 else 0.0
    FNR = FN / (TP + FN) if (TP + FN) > 0 else 0.0

    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
    accuracy = accuracy_score(labels, predictions)
    
    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'true_negative_rate': TNR,
        'false_positive_rate': FPR,
        'false_negative_rate': FNR,
        'true_positive_rate': TPR
    }

In [53]:
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    do_train=True,
    do_eval=True,
    seed=seed,
    save_strategy='steps',
    save_steps=100,
    evaluation_strategy='steps',
    eval_steps=100,
    logging_dir=output_dir + 'logs/',
    logging_strategy='steps',
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    run_name='podcasts-study' + str(seed),
    report_to="wandb"
)

trainer = Trainer(
    model=model,    
    args=training_args,
    train_dataset=ds_train,
    eval_dataset=ds_dev,    
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [54]:
trainer.add_callback(WandbCallback())
trainer.train()
trainer.evaluate(ds_test)
wandb.finish()

You are adding a <class 'transformers.integrations.integration_utils.WandbCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
WandbCallback
NotebookProgressCallback


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,True Negative Rate,False Positive Rate,False Negative Rate,True Positive Rate
100,0.6855,0.664602,0.589342,0.539705,0.59305,0.495164,0.678462,0.321538,0.504836,0.495164
200,0.6629,0.646654,0.608777,0.634232,0.581408,0.697614,0.52471,0.47529,0.302386,0.697614
300,0.6433,0.647983,0.616301,0.607441,0.60434,0.610574,0.621721,0.378279,0.389426,0.610574
400,0.6293,0.633642,0.62069,0.659155,0.585293,0.754352,0.494204,0.505796,0.245648,0.754352
500,0.6028,0.645515,0.638871,0.577713,0.669499,0.508059,0.76266,0.23734,0.491941,0.508059
600,0.5998,0.629489,0.637304,0.649288,0.6127,0.690522,0.586943,0.413057,0.309478,0.690522
700,0.5898,0.635375,0.638245,0.637563,0.621555,0.654417,0.622941,0.377059,0.345583,0.654417
800,0.5532,0.651482,0.638245,0.612492,0.639103,0.588008,0.685784,0.314216,0.411992,0.588008
900,0.5504,0.646328,0.641379,0.646259,0.620915,0.673759,0.610738,0.389262,0.326241,0.673759
1000,0.5483,0.649646,0.647649,0.637887,0.637476,0.638298,0.656498,0.343502,0.361702,0.638298


[34m[1mwandb[0m: Adding directory to artifact (/shared/3/projects/bowenyi/where-is-news/595_486_Final_Project/model_output_roberta_base/checkpoint-100)... Done. 28.8s
[34m[1mwandb[0m: Adding directory to artifact (/shared/3/projects/bowenyi/where-is-news/595_486_Final_Project/model_output_roberta_base/checkpoint-100)... Done. 53.2s
[34m[1mwandb[0m: Adding directory to artifact (/shared/3/projects/bowenyi/where-is-news/595_486_Final_Project/model_output_roberta_base/checkpoint-200)... Done. 27.8s
[34m[1mwandb[0m: Adding directory to artifact (/shared/3/projects/bowenyi/where-is-news/595_486_Final_Project/model_output_roberta_base/checkpoint-200)... Done. 54.9s
[34m[1mwandb[0m: Adding directory to artifact (/shared/3/projects/bowenyi/where-is-news/595_486_Final_Project/model_output_roberta_base/checkpoint-300)... Done. 28.0s
[34m[1mwandb[0m: Adding directory to artifact (/shared/3/projects/bowenyi/where-is-news/595_486_Final_Project/model_output_roberta_base/checkpoint-

VBox(children=(Label(value='15256.371 MB of 15256.371 MB uploaded (32.141 MB deduped)\r'), FloatProgress(value…

0,1
eval/accuracy,▁▁▃▃▄▄▅▅▇▇▇▇▇▇▇▇▇▇██▄▄
eval/f1,▁▁▇▇▅▅██▃▃▇▇▇▇▅▅▇▇▇▇██
eval/false_negative_rate,██▃▃▅▅▁▁██▃▃▄▄▅▅▃▃▄▄▁▁
eval/false_positive_rate,▃▃▇▇▅▅██▁▁▅▅▅▅▃▃▅▅▄▄██
eval/loss,██▄▄▅▅▂▂▄▄▁▁▂▂▅▅▄▄▅▅▂▂
eval/precision,▂▂▁▁▃▃▂▂██▄▄▄▄▆▆▄▄▆▆▁▁
eval/recall,▁▁▆▆▄▄██▁▁▆▆▅▅▄▄▆▆▅▅██
eval/runtime,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁██
eval/samples_per_second,▂▂▂▂▁▁▃▃▅▅▃▃▃▃▅▅▄▄▄▄██
eval/steps_per_second,▂▂▂▂▁▁▃▃▅▅▃▃▃▃▅▅▄▄▄▄██

0,1
eval/accuracy,0.61109
eval/f1,0.65082
eval/false_negative_rate,0.25435
eval/false_positive_rate,0.51616
eval/loss,0.63575
eval/precision,0.57738
eval/recall,0.74565
eval/runtime,23.2426
eval/samples_per_second,274.582
eval/steps_per_second,4.302


## This strategy achieved a better F-1 score. Our next step is to manually check 1000 samples predicted by RoBERTa. Then, we can set a proper decision threshold for right wing and left wing, instead of using the default 0.5. 

## We should have used a separate set of data for calibration. But due to high-quality data scarcity, we used the test set to calibrate

## We divided the entire test set to 10 ten-percent bins. Then, we randomly sampled 100 instanced in each 10 percent bin for manually checking

## 4.2 Produce file for calibration

In [62]:
import torch
from scipy.special import softmax 

In [None]:
predictions = trainer.predict(ds_test)

In [63]:
probs = softmax(predictions.predictions, axis=1)[:, 1]
df_test['prob'] = probs


In [64]:
df_test['stratum'] = pd.cut(df_test['prob'], bins=[0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0], 
                       labels=['0-10%', '10-20%', '20-30%', '30-40%', '40-50%', '50-60%', '60-70%', '70-80%', '80-90%', '90-100%'], 
                       include_lowest=True)

df_test['ground_truth'] = ''
df_test['note'] = ''
df_test_cp = df_test.copy()

In [65]:
annotate_df = df_test_cp.groupby('stratum').apply(lambda x: x.sample(n=100, random_state=1)).reset_index(drop=True)



In [66]:
annotate_df.to_csv('09_RoBERTa_annotate.csv')


## After manually checking, we decide 0.7 would be a decision threshold. If an input text receives score above 0.7, it's right-wing. If lower, it's left-wing. 

## 4.3 Out of curiosity, we tested a finetuned BERT-based model specifically for detecting political leaning. The model is introduced by an EMNLP article in 2020.

In [67]:
pol_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

pol_bert = AutoModelForSequenceClassification.from_pretrained("bucketresearch/politicalBiasBERT")


In [70]:
texts = [entry['text'] for entry in ds_test]  
tokenized_texts = tokenizer(texts, max_length=256, padding="max_length", truncation=True)

tokenized_texts_dataset = Dataset.from_dict(tokenized_texts)

polbert_trainer = Trainer(model=model)

polbert_predictions = polbert_trainer.predict(tokenized_texts_dataset)




In [71]:
from sklearn.metrics import f1_score, accuracy_score


In [75]:
polbert_probs = torch.nn.functional.softmax(torch.tensor(polbert_predictions.predictions), dim=-1).numpy()

predicted_labels = np.argmax(polbert_probs, axis=1)

true_labels = np.array([entry['label'] for entry in ds_test])  # Ensure the key 'label' matches the key in ds_test

f1 = f1_score(true_labels, predicted_labels)
accuracy = accuracy_score(true_labels, predicted_labels)

print(f"F1 Score: {f1}")
print(f"Accuracy: {accuracy}")

F1 Score: 0.6508159819921215
Accuracy: 0.6110937010341586


## Seems that the pretrained model didn't perform better than our own finetuned model. (Also to show effort), we'll use our own model

In [76]:
trainer.save_model(output_dir = "best_roberta")
