<a href="https://colab.research.google.com/github/angelajyeung/text-sentiment-analysis-app/blob/main/model_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [30]:
import pandas as pd
import numpy as np

test_data = pd.read_csv('test.csv')
test_labels = pd.read_csv('test_labels.csv')
test_data = pd.merge(test_data, test_labels, on='id')
test_data = test_data[test_data['toxic'] != -1] # remove rows where the label is -1
test_data = test_data.reset_index(drop=True)
test_df = test_data.head(100)

# Convert labels to multi-hot encoded format
test_df['labels'] = test_df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values.tolist()

# Print the first 5 rows of the data frame
print(test_df.head())

                 id                                       comment_text  toxic  \
0  0001ea8717f6de06  Thank you for understanding. I think very high...      0   
1  000247e83dcc1211                   :Dear god this site is horrible.      0   
2  0002f87b16116a7f  "::: Somebody will invariably try to add Relig...      0   
3  0003e1cccfd5a40a  " \n\n It says it right there that it IS a typ...      0   
4  00059ace3e3e9a53  " \n\n == Before adding a new product to the l...      0   

   severe_toxic  obscene  threat  insult  identity_hate              labels  
0             0        0       0       0              0  [0, 0, 0, 0, 0, 0]  
1             0        0       0       0              0  [0, 0, 0, 0, 0, 0]  
2             0        0       0       0              0  [0, 0, 0, 0, 0, 0]  
3             0        0       0       0              0  [0, 0, 0, 0, 0, 0]  
4             0        0       0       0              0  [0, 0, 0, 0, 0, 0]  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['labels'] = test_df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values.tolist()


In [36]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
import torch

# Load and preprocess the training data
train_data = pd.read_csv('train.csv')
train_data = train_data.head(100)
train_data = train_data.fillna('') # replace missing values with empty strings
train_data['text'] = train_data['comment_text'].str.lower().str.strip() # lowercase and remove leading/trailing spaces from text
train_data['labels'] = train_data[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values.tolist() # create a new column for multi-hot encoded labels

# Split the data into training and validation sets
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Define the datasets and data collator
class ToxicCommentDataset(torch.utils.data.Dataset):
    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        comment_text = str(self.data.iloc[index]['comment_text'])
        labels = self.data.iloc[index]['labels']
        id = self.data.iloc[index]['id']
        encoding = self.tokenizer.encode_plus(
            comment_text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=True,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'id': id,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'token_type_ids': encoding['token_type_ids'].flatten(),
            'labels': torch.tensor(labels, dtype=torch.float32)
        }

train_dataset = ToxicCommentDataset(train_data, tokenizer)
val_dataset = ToxicCommentDataset(val_data, tokenizer)

def data_collator(data):
    batch = {}
    batch['input_ids'] = torch.stack([item['input_ids'] for item in data])
    batch['attention_mask'] = torch.stack([item['attention_mask'] for item in data])
    batch['token_type_ids'] = torch.stack([item['token_type_ids'] for item in data])
    batch['labels'] = torch.stack([item['labels'] for item in data])
    return batch

train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=data_collator)


In [32]:
from transformers import BertTokenizer, BertForSequenceClassification

# Load the pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=6)

from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
from transformers import Trainer, TrainingArguments

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy='steps',
    eval_steps=50,
    learning_rate=2e-5,
    load_best_model_at_end=True,
    metric_for_best_model='eval_roc_auc_score'
)

# Define the optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=training_args.learning_rate, eps=1e-8)
total_steps = len(train_dataloader) * training_args.num_train_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=training_args.warmup_steps, num_training_steps=total_steps)

# Define the evaluation metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average='weighted')
    accuracy = accuracy_score(labels, preds)
    roc_auc = roc_auc_score(labels, pred.predictions, average='weighted', multi_class='ovr')
    return {'f1_score': f1, 'accuracy': accuracy, 'roc_auc_score': roc_auc}

# Define the Trainer object and start the training
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler)
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [10]:
trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=15, training_loss=0.7705146948496501, metrics={'train_runtime': 336.0987, 'train_samples_per_second': 0.714, 'train_steps_per_second': 0.045, 'total_flos': 15787230289920.0, 'train_loss': 0.7705146948496501, 'epoch': 3.0})

In [41]:
# Load the test data and preprocess it
test_data = pd.read_csv('test.csv')
test_data = test_data.fillna('')
test_data['text'] = test_data['comment_text'].str.lower().str.strip()

# Define the test dataset and data collator
class ToxicCommentTestDataset(torch.utils.data.Dataset):
    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.labels = torch.zeros((len(self.data), 6), dtype=torch.float32)  # create a dummy labels column with all zeros

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        comment_text = str(self.data.iloc[index]['comment_text'])
        id = self.data.iloc[index]['id']
        encoding = self.tokenizer.encode_plus(
            comment_text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=True,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'id': id,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'token_type_ids': encoding['token_type_ids'].flatten(),
            'labels': self.labels[index]
        }

test_dataset = ToxicCommentTestDataset(test_df, tokenizer)
# Evaluate the model on the test dataset
# test_predictions = trainer.predict(test_dataset)