In [None]:
!pip install -q transformers datasets wandb unicode_tr

In [None]:
from huggingface_hub import notebook_login

# Authenticate this session against the Hugging Face Hub.
# NOTE(review): presumably required because later cells load the
# "deprem-private/..." dataset and upload a model folder -- confirm.
notebook_login()

In [None]:
import os

import pandas as pd
import matplotlib.pyplot as plt

from datasets import load_dataset

from unicode_tr import unicode_tr

from collections import Counter

import wandb

In [None]:
# Confirm that stored credentials exist WITHOUT echoing them: printing the
# file would persist login tokens in the saved notebook output.
print("/root/.netrc present:", os.path.isfile("/root/.netrc"))

In [None]:
# Weights & Biases run configuration, read by wandb from the environment.
os.environ["WANDB_ENTITY"] = "deprem-ml"
os.environ["WANDB_PROJECT"] = "intent-classification"
os.environ["WANDB_LOG_MODEL"] = "false"

# Never store the API key in the notebook source. Reuse a key that is already
# exported in the environment, otherwise ask for it without echoing the input.
if not os.environ.get("WANDB_API_KEY"):
    from getpass import getpass

    os.environ["WANDB_API_KEY"] = getpass("W&B API key: ")

wandb.login(relogin=True)

### Check out the train / test data

In [None]:
import re 


def normalize_phone_number(text: str) -> str:
    """Delete phone-number-like digit groups from ``text``.

    A match is: an optional leading digit, then groups of 3 (optionally
    wrapped in parentheses), 3, 2 and 2 digits, each group optionally
    separated by a single whitespace character. One trailing whitespace
    character, if present, is removed together with the number.

    Args:
        text: Input string.

    Returns:
        ``text`` with every match replaced by the empty string.
    """
    phone_regex = re.compile(
        r"[\d]?[\s]?\(?[\d]{3}\)?[\s]?[\d]{3}[\s]?[\d]{2}[\s]?[\d]{2}[\s]?"
    )
    return phone_regex.sub("", text)

def normalize_text(text: str) -> str:
    """Clean a raw message before tokenization.

    Steps, in order:
      1. remove hashtags (``#`` followed by word characters);
      2. drop every run of ``!`` that still has another ``!`` later on the
         same line, then pad the remaining ``!`` with spaces;
      3. collapse all whitespace runs into single spaces;
      4. remove phone numbers via ``normalize_phone_number``;
      5. strip leading/trailing double quotes;
      6. lowercase with Turkish casing rules and strip outer whitespace.

    Args:
        text: Raw input string.

    Returns:
        The normalized, lowercased string.
    """
    # remove hashtags
    text = re.sub(r"(#\w+)", "", text)

    # keep only the last "!" of a line and make it a standalone token
    # NOTE(review): this also removes earlier, non-adjacent "!" -- confirm intended.
    text = re.sub(r"!+(?=.*\!)", "", text)
    text = text.replace("!", " ! ")

    # normalize whitespaces
    text = " ".join(text.split())

    # normalize phone numbers
    text = normalize_phone_number(text)

    # remove quotation
    text = text.strip('"')

    # Turkish-aware lowercasing has to be applied right here: re.sub, replace
    # and join all return plain `str`, so wrapping the text in `unicode_tr`
    # any earlier is lost and `.lower()` would map "I" -> "i" instead of "ı".
    return unicode_tr(text).lower().strip()

In [None]:
DATASET_NAME_OR_PATH = "deprem-private/deprem_intent_classification"
intent = load_dataset(DATASET_NAME_OR_PATH, name='intent_multilabel')

# Preview the first 14 training rows: the raw text, the dataset's own cleaned
# text, and the result of running `normalize_text` on the cleaned text.
for row_idx, instance in enumerate(intent["train"]):
    if row_idx > 13:
        break
    print(instance["text"])
    print(instance["text_cleaned"])
    print(normalize_text(instance["text_cleaned"]))
    print("=" * 60)

In [None]:
import numpy as np
import torch
from torch.utils.data import Dataset
from transformers import PreTrainedTokenizer


class IntentDataset(Dataset):
    """Torch dataset that serves tokenized, multi-hot-labelled intent examples.

    Each item is built lazily in ``__getitem__`` from one row of a Hugging Face
    dataset split: the ``text_cleaned`` field is normalized and tokenized, and
    the row's ``labels`` (a list of class indices) become a multi-hot vector.

    Args:
        dataset_name_or_path: Identifier passed to ``load_dataset``.
        split: Split name passed to ``load_dataset``.
        tokenizer: Callable tokenizer applied to the normalized text.
        name: Dataset configuration name passed to ``load_dataset``.
        load_on_init: When true, load the split immediately.
        **kwargs: Forwarded to ``load_dataset`` when ``load_on_init`` is true.
    """

    def __init__(self, dataset_name_or_path: str, split: str, tokenizer: "PreTrainedTokenizer", name: str = "intent_multilabel", load_on_init: bool = True, **kwargs):
        self.dataset_name_or_path = dataset_name_or_path
        self.split = split
        self.tokenizer = tokenizer
        self.name = name
        self._dataset = None
        if load_on_init:
            self.load(**kwargs)

    def __len__(self):
        """Return the number of rows in the loaded split."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the tokenized row ``idx`` as a dict of tensors plus ``labels``."""
        instance = self.dataset[idx]
        instance_text = normalize_text(instance["text_cleaned"])
        # Fixed-length encoding (pad/truncate to 128 tokens) so items stack into batches.
        encoding = self.tokenizer(instance_text, max_length=128, padding="max_length", truncation=True)
        encoding = {key: torch.tensor(val) for key, val in encoding.items()}
        encoding["labels"] = torch.tensor(self._encode_label(instance["labels"]))
        return dict(encoding)

    @property
    def dataset(self):
        """The loaded split.

        Raises:
            AttributeError: If ``load`` has not been called yet.
        """
        if self._dataset is None:
            raise AttributeError("Dataset is not loaded.")
        return self._dataset

    @property
    def num_classes(self):
        """Number of distinct label names declared by the loaded split."""
        # Read the label schema from this instance's own split instead of the
        # notebook-global `intent`, so the class works without that hidden state.
        return len(self.dataset.features["labels"].feature.names)

    def load(self, **kwargs):
        """Load the split once; later calls are no-ops."""
        if self._dataset is None:
            self._dataset = load_dataset(self.dataset_name_or_path, name=self.name, split=self.split, **kwargs)

    def _encode_label(self, labels):
        """Turn a list of class indices into a multi-hot vector of length ``num_classes``."""
        encoded_labels = np.zeros(self.num_classes)
        for label in labels:
            encoded_labels[label] = 1.0
        return encoded_labels

In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer


# Pretrained checkpoint shared by the tokenizer and the classifier.
model_name = "loodos/bert-base-turkish-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# The train split is loaded here only to read the number of label classes.
ds = IntentDataset(
    "deprem-private/deprem_intent_classification",
    split="train",
    name="intent_multilabel",
    tokenizer=tokenizer,
)

# One output per class, configured for multi-label classification.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=ds.num_classes,
    problem_type="multi_label_classification",
)

In [None]:
import numpy as np

from datasets import load_metric
from transformers import TrainingArguments, Trainer

In [None]:
# Build one IntentDataset per split, all sharing the same tokenizer.
train_set, dev_set, test_set = (
    IntentDataset(DATASET_NAME_OR_PATH, split=split_name, tokenizer=tokenizer)
    for split_name in ("train", "validation", "test")
)

In [None]:
TRAIN_BATCH_SIZE = 32
EVAL_BATCH_SIZE = 32
EVAL_PER_EPOCH = 3

# Steps between evaluations, derived from the number of train batches per
# epoch. Clamped to at least 1 so a small train set cannot produce a zero or
# negative step interval.
EVAL_STEP_SIZE = max(1, int(np.ceil(len(train_set) / TRAIN_BATCH_SIZE) / EVAL_PER_EPOCH) - 1)

training_args = TrainingArguments(
    output_dir="./output",
    # Use the constants above so the step computation and the real batch size
    # cannot drift apart.
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    weight_decay=0.01,
    report_to="wandb",
    run_name="loodos_bert-base-turkish-uncased_deprem_intent_clf",  # wandb run name
    num_train_epochs=5,
    learning_rate=3.5e-5,
    evaluation_strategy="steps",
    eval_steps=EVAL_STEP_SIZE,
    logging_strategy="steps",
    logging_steps=EVAL_STEP_SIZE,
    do_train=True,
    do_eval=True
)

In [None]:
# Fine-tune `model` on the train split, evaluating on the validation split
# according to `training_args`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=dev_set,
)

trainer.train()

# Close the wandb run once training is done.
wandb.finish()  # move this to lower cells (after evaluation) if you want to log test set metrics.

In [None]:
# Run the trained model over the test split. Later cells read
# `preds.predictions` and `preds.label_ids` and unpack `preds` as a 3-tuple.
preds = trainer.predict(test_set)

In [None]:
from sklearn.metrics import classification_report, f1_score

In [None]:
# Pull the three fields of the prediction output out by name.
predictions = preds.predictions
label_ids = preds.label_ids
metrics = preds.metrics

In [None]:
def sigmoid(x):
    """Element-wise logistic function ``1 / (1 + exp(-x))``."""
    return 1 / (1 + np.exp(-x))

In [None]:
# Spot-check the second-to-last test example: labels predicted at a 0.3
# probability threshold, followed by the reference labels.
example_probs = sigmoid(predictions[-2])
print((example_probs > 0.3).astype(int))
print(preds.label_ids[-2].astype(int))

In [None]:
# Reference labels and per-class probabilities for the whole test split.
y_true = preds.label_ids.astype(int)
y_pred = sigmoid(preds.predictions)

# Per-class precision/recall/F1 with predictions binarized at probability 0.3.
label_names = test_set.dataset.features["labels"].feature.names
report = classification_report(
    y_true=y_true,
    y_pred=(y_pred > 0.3).astype(int),
    target_names=label_names,
)
print(report)

In [None]:
# Sweep decision thresholds and report the weighted F1 for each one.
# The thresholds are probabilities, so the raw logits must go through the
# sigmoid first (as the cells above do); comparing logits against them
# directly evaluates a different, unintended cut-off.
sweep_labels = preds.label_ids.astype(int)
sweep_probs = sigmoid(preds.predictions)
for threshold in np.arange(.1, 1., .05):
    score = f1_score(sweep_labels, (sweep_probs > threshold).astype(int), average="weighted")
    print(threshold, score)

In [None]:
from huggingface_hub import HfApi, notebook_login

In [None]:
# Upload the training output folder to the model repo as a pull request.
# NOTE(review): no explicit `trainer.save_model(...)` call is visible, so the
# folder presumably holds only the Trainer's periodic checkpoints -- confirm
# that the intended final weights are in there before uploading.
api = HfApi()
api.upload_folder(
    # Same relative path as TrainingArguments(output_dir=...), instead of a
    # Colab-only absolute path.
    folder_path="./output",
    path_in_repo="./",
    repo_id="deprem-ml/deprem-loodos-bert-base-uncased",
    repo_type="model",
    create_pr=True,
)