In [1]:
import pandas as pd
#Regular Expressions - text pattern matching and cleaning text
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,precision_recall_fscore_support
from sklearn.metrics import roc_auc_score,roc_curve 
import torch
#transformers
#prepares tweet-like text for models trained on Twitter data
#BertweetTokenizer: prepares tweet-like text for models trained on Twitter data.
#RobertaForSequenceClassification: pre-trained RoBERTa model for classification tasks.
#Trainer & TrainingArguments: handle model training, evaluation, and optimization.
from transformers import BertweetTokenizer,RobertaForSequenceClassification,Trainer,TrainingArguments
#DataCollatorWithPadding: automatically pads variable-length text batches.
#EarlyStoppingCallback: stops training early if validation loss doesn’t improve.
from transformers import DataCollatorWithPadding,EarlyStoppingCallback
#From Hugging Face Datasets, used to easily create and manage datasets.
#Can convert pandas DataFrames into a dataset for use with transformers.
from datasets import Dataset
import matplotlib.pyplot as plt
import seaborn as sns

In [10]:
#load data

df = pd.read_csv("Datasets/hydrogen_small.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   1000 non-null   object
 1   text    1000 non-null   object
dtypes: object(2)
memory usage: 15.8+ KB


In [11]:
df

Unnamed: 0,label,text
0,Relevant,behind the wheel of a hydrogen powered car
1,Irrelevant,mls measurements of stratospheric hydrogen cya...
2,Relevant,ana gonzalez hernandez shares an overview of h...
3,Relevant,toyota is giving away its first hydrogen car i...
4,Relevant,agility develops storage systems for hydrogen ...
...,...,...
995,Relevant,a solarpower europe report sees europe achievi...
996,Relevant,alstom engiegroup sign a partnership to supply...
997,Relevant,colombia signs a hydrogen mou with the port of...
998,Irrelevant,mr wizard s mini hydrogen bomb via youtube


In [12]:
#a label indicating whether it is relevant
df["label"].unique()

array(['Relevant', 'Irrelevant'], dtype=object)

In [13]:
#examine the messages in the dataset to determine whether any cleaning or pre-processing is necessary
df["text"].iloc[0]

'behind the wheel of a hydrogen powered car'

In [14]:
def clean_message(text):
    #remove URLs from the message.
    #re.sub(pattern, replacement, text)
    #It returns a new string with the replacements made.
    #pattern: the regular expression to search for (what you want to remove or replace).
    #replacement: the string to replace the found pattern with.

    #text: the original text to process.
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)

    #remove tags from the message
    text = re.sub(r"@\w+", "", text)

    #remove non-ASCII characters from the text
    #like: emojis
    text = re.sub(r"[^\x00-\x7F]+", "", text)

    #replace any sequences of consecutive whitespace with a single space character.
    #This includes newlines and spaces.
    text = re.sub(r"\s+", " ", text)

    #remove whitespace from the start and end of the string
    return text.strip()

In [15]:
#apply it yo the messages in the dataset

df["text"] = df["text"].apply(clean_message)

In [16]:
#examing the message from above to confirm that the tag and URL have been removed
df["text"]

0             behind the wheel of a hydrogen powered car
1      mls measurements of stratospheric hydrogen cya...
2      ana gonzalez hernandez shares an overview of h...
3      toyota is giving away its first hydrogen car i...
4      agility develops storage systems for hydrogen ...
                             ...                        
995    a solarpower europe report sees europe achievi...
996    alstom engiegroup sign a partnership to supply...
997    colombia signs a hydrogen mou with the port of...
998           mr wizard s mini hydrogen bomb via youtube
999    hand downs the best explanation ive heard abou...
Name: text, Length: 1000, dtype: object

In [17]:
#pre-processing

from sklearn.preprocessing import LabelEncoder

# create encoder
le = LabelEncoder()

# learn the mapping
le.fit(df["label"])

# transform the original column into numbers
df["label"] = le.transform(df["label"])


df

Unnamed: 0,label,text
0,1,behind the wheel of a hydrogen powered car
1,0,mls measurements of stratospheric hydrogen cya...
2,1,ana gonzalez hernandez shares an overview of h...
3,1,toyota is giving away its first hydrogen car i...
4,1,agility develops storage systems for hydrogen ...
...,...,...
995,1,a solarpower europe report sees europe achievi...
996,1,alstom engiegroup sign a partnership to supply...
997,1,colombia signs a hydrogen mou with the port of...
998,0,mr wizard s mini hydrogen bomb via youtube


In [None]:
df["label"].value_counts()
#Dataset is much smaller than that in the practical, no need to undersample then.

label
1    547
0    453
Name: count, dtype: int64

In [22]:
#Split these samples into a training/testing set that we will feed into the model.

X = df["text"].values
y = df["label"].values

In [24]:
random_state = 42

test_set_size = 0.3 # 30%
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_set_size, stratify=y, random_state=random_state)
print("Training set size:", len(X_train))
print("Testing set size:", len(X_test))

Training set size: 700
Testing set size: 300


In [25]:
#tokenizing data

train_df = pd.DataFrame({"text":X_train, "label": y_train})
test_df = pd.DataFrame({"text": X_test, "label":y_test})

#converting DataFrames into Hugging Face datasets
train_ds = Dataset.from_pandas(train_df)
test_ds = Dataset.from_pandas(test_df)

In [26]:
#download the embedding model

model_name = "vinai/bertweet-base"
tokenizer = BertweetTokenizer.from_pretrained(model_name)

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


In [27]:
#compute the tokens for each of the messages in our dataset

def tokenize(batch):
#batch['text'] takes the "text" column from the dataset
#tokenizer(...) applies the Hugging Face tokenizer
#truncation=True cuts off text longer than the model’s max length.
#padding=True pads shorter texts so that all sequences in a batch have the same length.
    return tokenizer(batch['text'],truncation=True,padding=True)

train_ds = train_ds.map(tokenize,batched=True)
test_ds = test_ds.map(tokenize,batched=True)

Map:   0%|          | 0/700 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [28]:
train_ds

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 700
})

In [30]:
# Loading the Model
# With the data ready to train the model we are using for transfer learning
# need to download and load the base model.

#in case there is a model assigned to this variable name

try:
    del model
except NameError:
    pass

#Download/load the base model. We use the "vinai/bertweet-base" model here.
#set the number of labels to 2
#set the problem type to single label classification
#loads a RoBERTa model that’s already trained on language data (from Hugging Face)

model = RobertaForSequenceClassification.from_pretrained(
    model_name,
    #sets how many output classes the model should predict
    num_labels=df["label"].nunique(),
    problem_type="single_label_classification" 
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
#Training the Model
#define the metrics to compute after each epoch
#define a metrics computing function compatible with the `transformers` library

#This function tells the Hugging Face Trainer how to evaluate model after each epoch.

# pred is an object the Trainer provides.
# It contains:
# pred.predictions: the raw model outputs (logits) — basically unnormalized scores for each class.
# pred.label_ids: the true labels from dataset.

def compute_metrics(pred):
    labels = pred.label_ids
    #convert model outputs to predicted classes
    preds = pred.predictions.argmax(-1) #argmax(-1) picks the index of the highest value (largest logit) along the last axis.
    acc = accuracy_score(labels,preds)
    prec,recall,f1,_ = precision_recall_fscore_support(
        labels, preds, average="binary", pos_label=1
    )
    return {
        "accuracy": acc,
        "precision": prec,
        "recall": recall,
        "f1":f1
    }


In [None]:
#set up hyperparameters for the training process

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10
)