**install dependencies**

In [None]:
!pip install -q bitsandbytes datasets accelerate loralib
!pip install -q git+https://github.com/huggingface/peft.git git+https://github.com/huggingface/transformers.git

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m45.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.2/251.2 kB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml

**Import libraries for cleaning and analysis**

In [None]:
import re
import string

import nltk
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources; the last call stays a bare expression so its
# return value is still shown as the cell output.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Data cleaning

In [None]:
# Load the training data; 'train.csv' is resolved relative to the working directory.
df_train = pd.read_csv('train.csv')

In [None]:
# Normalise every column name to upper case
df_train.columns = list(map(str.upper, df_train.columns))

In [None]:
# Discard the two columns that are not used further down
df_train.drop(columns=['KEYWORD', 'LOCATION'], inplace=True)

In [None]:
# Class balance of the target column
df_train['TARGET'].value_counts()

0    4342
1    3271
Name: TARGET, dtype: int64

In [None]:
def remove_html(text):
    """Remove HTML tags and character references from ``text``.

    Deletes anything shaped like a tag (``<...>``, non-greedy, not spanning
    newlines) and any named, decimal or hexadecimal character reference such
    as ``&amp;``, ``&#39;`` or ``&#x1F525;``. Matches are deleted, not decoded.

    Matching is case-insensitive so mixed-case entity names (``&Eacute;``) and
    upper-case hex digits (``&#X1F;``) are removed too; the previous
    case-sensitive pattern left them in place.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every match removed.
    """
    html = re.compile(
        r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});",
        flags=re.IGNORECASE,
    )
    return html.sub("", text)

In [None]:
def remove_mentions(text):
    """Delete every ``@handle`` (``@`` followed by letters, digits or ``_``) from ``text``."""
    handle_re = re.compile("@[A-Za-z0-9_]+")
    return handle_re.sub("", text)

In [None]:
def remove_emojis(text):
    """Remove emoji code points from ``text``.

    The previous character class ended with the range U+24C2..U+1F251, which
    spans most of the Basic Multilingual Plane and therefore also deleted CJK
    ideographs, kana, Hangul and many other non-emoji characters. That single
    range is replaced here by the emoji sub-ranges that lie inside it, so
    everything removed now was also removed before, but ordinary text in
    other scripts is preserved.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every run of matching code points removed.
    """
    emoji_pattern = re.compile(
        '['
        '\U0001F600-\U0001F64F'  # emoticons
        '\U0001F300-\U0001F5FF'  # symbols & pictographs
        '\U0001F680-\U0001F6FF'  # transport & map symbols
        '\U0001F1E0-\U0001F1FF'  # regional indicators (flags)
        '\U00002600-\U000027BF'  # miscellaneous symbols and dingbats
        '\U00002B05-\U00002B55'  # arrows, squares, star and circle
        '\U000024C2'             # circled M
        '\U0000FE0F'             # variation selector-16
        '\U0001F004\U0001F0CF'   # mahjong red dragon, playing-card joker
        '\U0001F170-\U0001F251'  # enclosed alphanumeric / ideographic supplement
        ']+')
    return emoji_pattern.sub('', text)

In [None]:
def clean_text(text):
    """Normalise one tweet for modelling.

    Steps, in order: lower-case, drop @mentions, drop URLs, drop HTML tags and
    character references, drop ASCII punctuation, turn newlines into spaces,
    drop every token that contains a digit, drop emojis.

    Changes from the earlier version:
      * HTML removal now runs before punctuation stripping. Previously the
        ``<``, ``>``, ``&``, ``#`` and ``;`` characters were already gone by
        the time ``remove_html`` ran, so it never matched and ``&amp;`` left
        ``amp`` in the text.
      * Newlines become a space rather than the empty string, so words on
        adjacent lines are no longer fused together.
      * Regex literals are raw strings, avoiding invalid-escape warnings for
        ``\\S``, ``\\w`` and ``\\d``.

    Args:
        text: Any value; it is converted with ``str()`` first.

    Returns:
        The cleaned, lower-cased string.
    """
    text = str(text).lower()
    text = remove_mentions(text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # Must happen while the markup delimiters still exist in the text.
    text = remove_html(text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', ' ', text)
    # Removes whole tokens that contain at least one digit.
    text = re.sub(r'\w*\d\w*', '', text)
    text = remove_emojis(text)
    return text

In [None]:
# Clean every tweet and store the result in a new column
df_train['FINAL_TEXT'] = df_train['TEXT'].apply(clean_text)

In [None]:
# The raw text and the row id are no longer needed
df_train.drop(columns=['ID', 'TEXT'], inplace=True)

In [None]:
# Switch to the 'text' / 'label' column names used by the cells below
df_train.rename(columns={'TARGET': 'label', 'FINAL_TEXT': 'text'}, inplace=True)

In [None]:
# Keep exactly these two columns, text first
df_train = df_train.loc[:, ['text', 'label']]

# Transform the pandas DataFrame into a Hugging Face Dataset

In [None]:
import torch
# Report whether PyTorch can see a CUDA device (shown as the cell output).
torch.cuda.is_available()

True

In [None]:
from datasets import Dataset
# Rebinds df_train from a pandas DataFrame to a datasets.Dataset. Re-running
# this cell on its own would therefore pass a Dataset, not a DataFrame, to
# from_pandas; re-run the cleaning cells first.
df_train = Dataset.from_pandas(df_train)

**create label class**

In [None]:
from datasets import ClassLabel

# Creating a ClassLabel Object
# NOTE(review): `df` is not referenced by any later cell visible in this notebook.
df = df_train.to_pandas()
# The position of a name in `labels` is the integer id it describes. The
# 'label' column already holds integers that are kept as-is, so the names must
# follow the dataset's own encoding: 0 = not a disaster, 1 = real disaster.
# The previous order ('disaster' first) attached the wrong name to each id.
# NOTE(review): the 0/1 meaning is taken from the Kaggle "Disaster Tweets"
# data description - confirm it matches the train.csv in use.
labels = ['not_disaster', 'disaster']
ClassLabels = ClassLabel(num_classes=len(labels), names=labels)

# Mapping Labels to IDs
def map_label2id(example):
    """Convert the batch's 'label' values to class ids via ``ClassLabels.str2int``."""
    example['label'] = ClassLabels.str2int(example['label'])
    return example

df_train = df_train.map(map_label2id, batched=True)

# Casting label column to ClassLabel Object
df_train = df_train.cast_column('label', ClassLabels)

Map:   0%|          | 0/7613 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/7613 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoTokenizer

# Checkpoint name; the config and model cells further down load from the same id.
model_id = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

**Tokenize our text**

In [None]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the notebook's tokenizer.

    Inputs are truncated and padded with ``padding="max_length"``.
    """
    batch_text = examples["text"]
    encoded = tokenizer(batch_text, padding="max_length", truncation=True)
    return encoded

In [None]:
# Run tokenize_function over the whole dataset in batched mode.
data = df_train.map(tokenize_function, batched=True)

Map:   0%|          | 0/7613 [00:00<?, ? examples/s]

**split data into training and evaluation**

In [None]:
# Split the dataset into training and testing subsets
# (10% held out, shuffled with a fixed seed). Despite its name, `train_dataset`
# holds both splits, under the keys "train" and "test".
train_dataset = data.train_test_split(test_size=0.1, shuffle=True, seed=42)

In [None]:
# Unpack the two splits; the "test" split serves as the validation set
training_data, val_data = train_dataset["train"], train_dataset["test"]

In [None]:
# Expose only the model inputs and the label, in 'torch' format, on both splits
torch_columns = ['input_ids', 'attention_mask', 'label']
training_data.set_format('torch', columns=torch_columns)
val_data.set_format('torch', columns=torch_columns)

In [None]:
# Read the class count and class names from the label feature of the train split
label_feature = train_dataset['train'].features['label']
num_labels = label_feature.num_classes
class_names = label_feature.names
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

number of labels: 2
the labels: ['disaster', 'not_disaster']


In [None]:
# Build the id -> name mapping from the position of each class name
id2label = dict(enumerate(class_names))

In [None]:
from transformers import AutoConfig
# Update the model's configuration with both label mappings. Setting only
# id2label leaves the config's label2id out of step with it, so the inverse
# mapping is derived from id2label and stored alongside.
config = AutoConfig.from_pretrained(model_id)
label2id = {label: i for i, label in id2label.items()}
config.update({"id2label": id2label, "label2id": label2id})

**import our base model**

In [None]:
from transformers import AutoModelForSequenceClassification
# Model
# Load the checkpoint named by model_id for sequence classification, using the
# config object built in the previous cell. The recorded log shows the
# classifier weights are newly initialised, i.e. they still need training.
model = AutoModelForSequenceClassification.from_pretrained(model_id, config=config)

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


**Import LoRA: LoRA keeps the base model frozen and trains only a small set of additional weights**

In [None]:
from peft import prepare_model_for_kbit_training

# NOTE(review): the model above is loaded without any quantization arguments;
# confirm that the k-bit preparation step is intended for this model.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [None]:
def print_trainable_parameters(model):
    """
    Print how many of the model's parameters are trainable.

    Walks ``model.named_parameters()``, sums ``numel()`` over all parameters
    and over those with ``requires_grad`` set, then prints both totals and the
    trainable share as a percentage.
    """
    sizes = [(p.numel(), p.requires_grad) for _, p in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, needs_grad in sizes if needs_grad)
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [None]:
from peft import LoraConfig, get_peft_model

# LoRA settings for sequence classification: r=8, lora_alpha=16, dropout 0.1.
peft_config = LoraConfig(task_type="SEQ_CLS", inference_mode=False, r=8, lora_alpha=16, lora_dropout=0.1)

# Wrap the model with the PEFT config and report how many parameters will train.
model = get_peft_model(model, peft_config)
print_trainable_parameters(model)

trainable params: 1479172 || all params: 125534212 || trainable%: 1.1783018959006968


**add metrics**

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [None]:
# define accuracy metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

**import trainer and train our model**

In [None]:
from transformers import (
    TrainingArguments,
    Trainer,
)

# Used as the Trainer output directory (checkpoints and logs go under this path).
repository_id = "pigho/roberta-base_disaster"
# TrainingArguments
training_args = TrainingArguments(
    output_dir=repository_id,
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint once per epoch; log every 10 steps.
    evaluation_strategy="epoch",
    logging_strategy="steps",
    logging_steps=10,
    learning_rate=5e-5,
    weight_decay=0.01,
    # NOTE(review): the recorded run below finishes at global_step=857, so 500
    # warmup steps cover more than half of training - confirm this is intended.
    warmup_steps=500,
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=2,
    fp16=True,
    report_to="tensorboard",
)

In [None]:
# Trainer
# Ties together the PEFT-wrapped model, the training arguments, the two dataset
# splits and the metric function. No tokenizer or data collator is passed here.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=training_data,
    eval_dataset=val_data,
    compute_metrics=compute_metrics
)

In [None]:
# Fine-tune the model
# The value returned by train() is displayed as the cell output.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3784,0.387062,0.839895,0.804487,0.833887,0.77709


TrainOutput(global_step=857, training_loss=0.41958771369560177, metrics={'train_runtime': 339.4357, 'train_samples_per_second': 20.183, 'train_steps_per_second': 2.525, 'total_flos': 1821242767478784.0, 'train_loss': 0.41958771369560177, 'epoch': 1.0})

In [None]:
# Root directory for the exported model; separate from the output_dir given to
# TrainingArguments, which is repository_id.
output_dir = 'model'

In [None]:
# Save the trained model under <output_dir>/final and print the path. Per the
# zip listing recorded below, the directory holds adapter_config.json,
# adapter_model.bin, training_args.bin and README.md.
save_dir = f'{output_dir}/final'
trainer.save_model(save_dir)
print('saved model', save_dir)

saved model model/final


In [None]:
# Archive the ./model/ directory into model.zip. The path is hard-coded here
# rather than taken from output_dir, so keep the two in sync.
!zip -r ./model.zip ./model/

  adding: model/ (stored 0%)
  adding: model/final/ (stored 0%)
  adding: model/final/adapter_config.json (deflated 44%)
  adding: model/final/training_args.bin (deflated 49%)
  adding: model/final/adapter_model.bin (deflated 7%)
  adding: model/final/README.md (deflated 5%)
