In [None]:
# Transformers installation
! pip install transformers
! pip install datasets

In [None]:
# loading the labeled data:

# import the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Read the cleaned, labeled training spreadsheet and preview its first 100 rows.
TRAIN_DATA_PATH = '/content/drive/MyDrive/Chapman PhD/CS799/data/cleaned/cleaned_training_data.xlsx'
df = pd.read_excel(TRAIN_DATA_PATH)
df.head(100)

In [None]:
# Separate the independent variable (tweet text) from the dependent one (label).
X = df['tweets']

# The labels are kept as a plain Python list; printing the distinct values is
# only a debugging aid to confirm which classes are present.
y_lable = list(df['lable'])
unique_lables = np.unique(y_lable)
print(unique_lables)

['Bugs/Defects/Scam' 'Feature Release' 'Marketing' 'Other'
 'Partnership/Investment']


In [None]:
from sklearn.preprocessing import LabelEncoder

# Map the class names to integer ids for the classifier head.
# The encoder is fitted on the raw DataFrame column rather than on `y_lable`
# itself: `y_lable` is overwritten here, so fitting on it would make a second
# run of this cell refit the encoder on integers and lose the name <-> id
# mapping stored in `le.classes_`.
le = LabelEncoder()
y_lable = list(le.fit_transform(df['lable']))

In [None]:
# Later cells work with plain Python lists. `y_lable` is already a list,
# so only the tweets still need converting.
X = [tweet for tweet in X]

In [None]:
# importing libraries:

from transformers import AutoModelForSequenceClassification
#from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
#from scipy.special import softmax

In [None]:

MODEL = "cardiffnlp/twitter-roberta-base"

# Tokenizer and configuration that belong to the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)

# The same checkpoint loaded for sequence classification with five output classes.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL,
    num_labels=5,
)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch
from transformers import TrainingArguments, Trainer

# Hold out 20% of the labeled tweets for validation; the fixed random_state
# makes the split repeatable.
split_parts = train_test_split(X, y_lable, test_size=0.2, random_state=42)
X_train, X_val, y_train_lable, y_val_lable = split_parts

# Tokenize both splits with identical settings (padding on, truncation at 512 tokens).
X_train_tokenized, X_val_tokenized = (
    tokenizer(texts, padding=True, truncation=True, max_length=512)
    for texts in (X_train, X_val)
)

In [None]:
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    """Map-style torch dataset over tokenized text, with optional labels.

    Args:
        encodings: Mapping from feature name (must include ``"input_ids"``) to
            a per-example sequence of values, e.g. the result of calling the
            tokenizer on a list of texts.
        labels: Optional per-example labels aligned with ``encodings``. Leave
            as ``None`` (or empty) for unlabeled data such as a prediction set.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors (plus ``"labels"`` if available)."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Test for presence explicitly instead of relying on truthiness:
        # `if self.labels:` raises for NumPy arrays, tensors and pandas Series
        # ("truth value ... is ambiguous"). An empty label container is still
        # treated as "no labels", as before.
        if self.labels is not None and len(self.labels) > 0:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the ``"input_ids"`` feature."""
        return len(self.encodings["input_ids"])

# Wrap the tokenized training and validation splits, together with their labels.
train_dataset = Dataset(encodings=X_train_tokenized, labels=y_train_lable)
val_dataset = Dataset(encodings=X_val_tokenized, labels=y_val_lable)

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Accuracy is computed directly with NumPy so this cell does not depend on
    `datasets.load_metric`, which is deprecated and no longer shipped by newer
    `datasets` releases.

    Args:
        eval_pred: A ``(logits, labels)`` pair; ``logits`` holds one row of
            class scores per example and ``labels`` the true class ids.

    Returns:
        ``{"accuracy": float}`` -- the fraction of examples whose
        highest-scoring class equals the label.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float(np.mean(predictions == np.asarray(labels)))}

In [None]:
# for saving the model on Huggingface:

! pip install huggingface_hub

from huggingface_hub import notebook_login

notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token


In [None]:
 #Define Trainer
# Fine-tuning configuration: evaluate and checkpoint once per epoch, train for
# five epochs, and reload the best checkpoint when training finishes.
# The previous `eval_steps=500` was dropped: it only applies to the "steps"
# evaluation strategy, so with "epoch" it had no effect.
args = TrainingArguments(
    output_dir="labeling_model_v3",
    push_to_hub=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    seed=0,
    load_best_model_at_end=True,
    # Make the checkpoint-selection criterion explicit. "loss" is what the
    # Trainer falls back to when nothing is specified, so behaviour is unchanged.
    # NOTE(review): if validation loss rises while accuracy keeps improving,
    # this keeps an early checkpoint; consider "accuracy" if that is the
    # metric that matters -- confirm with the notebook owner.
    metric_for_best_model="loss",
)

labeling_trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Fine-tune the pre-trained model
labeling_trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
/content/labeling_model_v3 is already a clone of https://huggingface.co/CryptologistOfficial/labeling_model_v3. Make sure you pull the latest changes with `repo.git_pull()`.
***** Running training *****
  Num examples = 3758
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2350
  Number of trainable parameters = 124649477


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.587616,0.8
2,0.741200,0.701672,0.815957
3,0.416500,0.802164,0.830851
4,0.254400,0.926726,0.843617
5,0.140100,0.952214,0.841489


***** Running Evaluation *****
  Num examples = 940
  Batch size = 8
Saving model checkpoint to labeling_model_v3/checkpoint-470
Configuration saved in labeling_model_v3/checkpoint-470/config.json
Model weights saved in labeling_model_v3/checkpoint-470/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 940
  Batch size = 8
Saving model checkpoint to labeling_model_v3/checkpoint-940
Configuration saved in labeling_model_v3/checkpoint-940/config.json
Model weights saved in labeling_model_v3/checkpoint-940/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 940
  Batch size = 8
Saving model checkpoint to labeling_model_v3/checkpoint-1410
Configuration saved in labeling_model_v3/checkpoint-1410/config.json
Model weights saved in labeling_model_v3/checkpoint-1410/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 940
  Batch size = 8
Saving model checkpoint to labeling_model_v3/checkpoint-1880
Configuration saved in labeling_model_v3/checkpoint-18

TrainOutput(global_step=2350, training_loss=0.34127233140012053, metrics={'train_runtime': 1988.125, 'train_samples_per_second': 9.451, 'train_steps_per_second': 1.182, 'total_flos': 4943989896898560.0, 'train_loss': 0.34127233140012053, 'epoch': 5.0})

In [None]:
# Upload the model held by the trainer to the Hugging Face Hub.
labeling_trainer.push_to_hub()

# Upload the tokenizer under the same repo name as the trainer's output_dir.
# NOTE(review): presumably so the Hub repo can be loaded for inference without
# going back to the base checkpoint -- confirm.
tokenizer.push_to_hub("labeling_model_v3") 


In [None]:
# Read the cleaned, labeled test spreadsheet and display it.
TEST_DATA_PATH = '/content/drive/MyDrive/Chapman PhD/CS799/data/cleaned/cleaned_test_dataset.xlsx'
test_data = pd.read_excel(TEST_DATA_PATH)
test_data

In [None]:
# Tweet text and its label column from the test set.
X_test, y_test = test_data['tweets'], test_data['lable']

In [None]:
# Debugging aid: turn the test labels into a plain list and show which
# classes occur in the test set.
y_test = [label for label in y_test]
unique_lables = np.unique(y_test)
print(unique_lables)

['Bugs/Defects/Scam' 'Feature Release' 'Marketing' 'Other'
 'Partnership/Investment']


In [None]:
# Encode the test labels with the encoder that was fitted on the training
# labels. Fitting a fresh LabelEncoder on the test set would assign ids from
# the test set's own sorted label names, which silently disagrees with the ids
# the model was trained on whenever the two label sets differ. `transform`
# raises on a label the encoder has never seen instead of mis-mapping it.
# Reading from the raw column keeps the cell safe to re-run.
y_test = list(le.transform(test_data['lable']))

In [None]:
# Plain list of test tweets, in the same form as the training tweets.
X_test = [tweet for tweet in X_test]

In [None]:
# Tokenize the test tweets with the same settings as the training data.
X_test_tokenized = tokenizer(
    X_test,
    padding=True,
    truncation=True,
    max_length=512,
)

In [None]:
# Wrap the tokenized test tweets in the torch dataset. No labels are attached;
# predictions are compared with `y_test` in a later cell.
test_dataset = Dataset(encodings=X_test_tokenized)

In [None]:
# Run the trained model over the test set and keep, for every tweet,
# the index of the highest-scoring class.
predictions = labeling_trainer.predict(test_dataset)
preds = predictions.predictions.argmax(axis=-1)

print(preds)

***** Running Prediction *****
  Num examples = 2269
  Batch size = 8


[1 4 3 ... 3 2 1]


In [None]:
# Compare the predictions with the test labels for the fine-tuned model.
# The scores are computed with scikit-learn directly instead of
# `datasets.load_metric`, which is deprecated and no longer shipped by newer
# `datasets` releases; the numbers are the same weighted precision and accuracy.

from sklearn.metrics import accuracy_score, precision_score
import datasets

precision = float(precision_score(y_test, preds, average="weighted"))
accuracy = float(accuracy_score(y_test, preds))
print('Our fine tuned model test precision is: ', precision)
print('Our fine tuned model test accuracy is: ', accuracy)

Downloading builder script:   0%|          | 0.00/2.58k [00:00<?, ?B/s]

Our fine tuned model test precision is:  0.7653362092924391
Our fine tuned model test accuracy is:  0.7183781401498458


In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall, F1 and support on the test set.
report = classification_report(y_test, preds)
print(report)

              precision    recall  f1-score   support

           0       0.37      0.93      0.53        14
           1       0.35      0.77      0.48       131
           2       0.72      0.75      0.73       799
           3       0.86      0.68      0.76      1209
           4       0.61      0.78      0.69       116

    accuracy                           0.72      2269
   macro avg       0.58      0.78      0.64      2269
weighted avg       0.77      0.72      0.73      2269

