<a href="https://colab.research.google.com/github/arielhsieh8/cs-uy-4613-project/blob/milestone-4/Model_FineTune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install pandas


In [1]:
#import necessary libraries 

from sklearn.model_selection import train_test_split
import torch 
from torch.utils.data import Dataset 
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments 
import pandas as pd
import numpy as np


In [32]:
# preprocess the data

model_name = "distilbert-base-uncased"  # pretrained DistilBERT checkpoint used for fine-tuning

# Read the raw data, discard the "id" column (it is not a feature) and remove
# rows that contain missing values. `drop` and `dropna` return new frames, so
# the result must be assigned: a bare `train_data.dropna()` throws the cleaned
# frame away and leaves the incomplete rows in place.
train_data = pd.read_csv('train.csv').drop(columns=["id"]).dropna()

train_texts = train_data['comment_text'].tolist()  # create X dataset and convert to list
train_labels = train_data[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values.tolist()  # create y dataset: one six-element list per row

# test data is the 1000 rows at positions 100000-100999, which are never used for training or validation
test_texts, test_labels = train_texts[100000:101000], train_labels[100000:101000]

# split the first 100000 rows into 80% train and 20% validation (fixed random_state for reproducibility)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts[:100000],
    train_labels[:100000],
    test_size=0.20,
    random_state=42,
)

In [3]:
# class textDataset to store a text and its associated labels 

class textDataset(Dataset):
    """Dataset pairing tokenizer encodings with their label vectors.

    Args:
        encodings: mapping of feature name -> per-example sequence; each item is
            read as ``encodings[key][index]`` and converted to a tensor.
        labels: one label vector per example; converted once to a float tensor.

    Each item is a dict holding one tensor per encoding key plus a ``'labels'``
    entry with that example's float label vector.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Stored as float; presumably required by the multi-label loss - confirm against the model.
        self.labels = torch.tensor(labels).float()

    def __getitem__(self, index):
        item = {key: torch.tensor(val[index]) for key, val in self.encodings.items()}
        # self.labels is already a tensor: wrapping it in torch.tensor() again
        # raises a copy-construct UserWarning on every fetch, so take the copy
        # with clone().detach() instead.
        item['labels'] = self.labels[index].clone().detach()
        return item

    def __len__(self):
        return len(self.labels)

In [2]:
# define tokenizer from pretrained model
# (num_labels / problem_type describe the classification head and are passed to
# the model's from_pretrained call; they are not tokenizer options)
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)

# create the encodings of the training and validation texts using the tokenizer;
# truncation=True and padding=True are applied to the whole list in one call
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

# wrap the encodings and their labels in the textDataset class
train_dataset = textDataset(train_encodings, train_labels)
val_dataset = textDataset(val_encodings, val_labels)



In [None]:
# configure the fine-tuning run: 2 epochs with per-device batch sizes of 16
hyperparameters = dict(
    # output and logging locations
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=100,
    # run length and batch sizes
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # optimisation settings
    learning_rate=5e-5,
    warmup_steps=500,
    weight_decay=0.01,
)
training_args = TrainingArguments(**hyperparameters)

# load the pretrained checkpoint, set up for multi-label classification with 6 labels
model = DistilBertForSequenceClassification.from_pretrained(
    model_name,
    problem_type="multi_label_classification",
    num_labels=6,
)

# tie the model, the training arguments and the train/validation datasets together
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)



In [None]:
# train the model: runs the Trainer built from `model`, `training_args` and
# `train_dataset`; as the last expression of the cell, the value returned by
# train() is displayed as the cell output
trainer.train()

In [None]:
# write the tokenizer first and then the trained model into the same directory
save_directory = "saved"
for artifact in (tokenizer, model):
    artifact.save_pretrained(save_directory)

In [33]:
# test the model on the test dataset

from torch.utils.data import DataLoader
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# import tokenizer
tokenizer = AutoTokenizer.from_pretrained('Ariel8/toxic-tweets-classification')
# import model
model = AutoModelForSequenceClassification.from_pretrained('Ariel8/toxic-tweets-classification')

p = []  # one thresholded prediction tensor per test text, in test_texts order
threshold = 0.5 # set the threshold of determining whether something is a 0 or 1
# loop through all the texts in the test dataset
for i in range(len(test_texts)):
  # tokenize each text with the imported tokenizer; each text is encoded on its
  # own, so no padding is requested (padding a single sequence to the maximum
  # length only adds masked tokens for the model to process)
  batch = tokenizer(test_texts[i], truncation=True, return_tensors="pt")
  with torch.no_grad():
    outputs = model(**batch) # run the text through the model to get its predicted output
    predictions = torch.sigmoid(outputs.logits) # map each logit to an independent per-label probability
    p.append((predictions >= threshold).int()) # if probability >= 0.5 --> 1, else --> 0



In [35]:
# exact-match accuracy: a prediction only counts as correct when all of its
# label values equal the true label list for the same test example

count = sum(
    1
    for idx, predicted in enumerate(p)
    # flatten the nested prediction into a flat list before comparing
    if np.concatenate(predicted.tolist(), axis=None).tolist() == test_labels[idx]
)
print('Accuracy: ', count/len(p)) # fraction of test examples predicted exactly right

Accuracy:  0.936
