In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, AutoConfig, TrainingArguments, Trainer
import torch
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
import evaluate
from transformers import pipeline
from transformers import AutoModelForSequenceClassification, AutoConfig
import warnings

In [None]:
import yaml
import logging
from datetime import datetime

# YAML config
# Load the run settings from config.yaml in the current working directory.
# A plain relative name resolves on every OS (the former ".\config.yaml" form
# only resolves on Windows). Read/parse errors propagate unchanged so later
# cells never run without a config.
with open("config.yaml", "r") as f:
    config = yaml.safe_load(f)

# Logger
# One log file per run, named after the start timestamp. The timestamp is
# built outside the f-string because reusing double quotes inside an f-string
# replacement field is a SyntaxError before Python 3.12.
log_dir = config["log_dir"]
log_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# os.path.join gives the same path whether or not log_dir ends with a separator.
log_file = os.path.join(log_dir, f"{log_timestamp}.log")
# logging does not create missing parent directories; an empty log_dir means
# the current directory, which needs no creation.
if log_dir:
    os.makedirs(log_dir, exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(name)s - %(funcName)s - %(message)s",
    filename=log_file,
    filemode="w"
)
logger = logging.getLogger(__name__)

logger.info("Config file and logger setup completed.")

In [None]:
def create_dataset(random_state=42):
    """
    Load the CSV at config["data_path"] and split it into train/test/validation.

    The frame is first split 70/30; the 30% hold-out is then split 2:1, which
    gives roughly 70% train, 20% test and 10% validation. Both splits are
    stratified on the "label_name" column.

    Args:
        random_state: Seed forwarded to both train_test_split calls so the
            splits are identical across kernel restarts. Pass None to get a
            different random split on every call.

    Returns:
        DatasetDict with "train", "test" and "validation" splits.

    Raises:
        Exception: Any error while reading or splitting the data is logged
            and re-raised.
    """
    try:
        dataframe = pd.read_csv(config["data_path"])
        train, holdout = train_test_split(
            dataframe, test_size=0.3, stratify=dataframe["label_name"],
            random_state=random_state)
        # One third of the hold-out becomes validation; the rest stays as test.
        test, validation = train_test_split(
            holdout, test_size=1/3, stratify=holdout["label_name"],
            random_state=random_state)

        # preserve_index=False keeps the pandas index out of the datasets.
        dataset = DatasetDict({"train": Dataset.from_pandas(train, preserve_index=False),
                               "test": Dataset.from_pandas(test, preserve_index=False),
                               "validation": Dataset.from_pandas(validation, preserve_index=False)})
        return dataset
    except Exception as e:
        logger.error(f"Failed to create dataset: {e}")
        raise

In [None]:
def tokenize(batch):
    """
    Tokenize the "text" entry of a batch with the module-level tokenizer.

    Args:
        batch: Mapping with a "text" entry holding the texts to encode
            (the object Dataset.map passes to a batched function).

    Returns:
        The tokenizer output for the batch, encoded with padding=True and
        truncated to at most 128 tokens.

    Raises:
        Exception: Any tokenization error is logged and re-raised. Returning
            None instead would hide the failure from the caller, which would
            then continue with a dataset that has no token columns.
    """
    try:
        # max_length caps the sequence length used for truncation.
        encoded = tokenizer(batch["text"], padding=True,
                            truncation=True, max_length=128)
        return encoded
    except Exception as e:
        logger.error(f"Error tokenizing data: {e}")
        raise

In [None]:
def create_label_index(dataset):
    """
    Build the label-name -> id and id -> label-name lookups.

    Args:
        dataset: Mapping with a "train" entry that iterates over rows, each
            row exposing "label_name" and "label".

    Returns:
        tuple: (label2id, id2label). If a label name occurs with more than
        one id, the last row seen wins.

    Raises:
        Exception: Any error is logged and re-raised; previously the function
            fell through and returned None, which made the caller's tuple
            unpacking fail with an unrelated TypeError.
    """
    try:
        label2id = {row["label_name"]: row["label"] for row in dataset["train"]}
        id2label = {idx: name for name, idx in label2id.items()}
        return label2id, id2label
    except Exception as e:
        logger.error(f"Failed to create label and index: {e}")
        raise

In [None]:
def create_model():
    """
    Create the sequence-classification model from config["base_model"].

    Reads the module-level `config`, `label2id` and `id2label`; the label
    mappings are stored on the model configuration.

    Returns:
        The model loaded from the base checkpoint, moved to CUDA when
        available and to CPU otherwise.

    Raises:
        Exception: Any loading error is logged and re-raised instead of
            silently returning None.
    """
    try:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # This local must not be called `config`: assigning to that name makes
        # it function-local, so the config["base_model"] lookups would raise
        # UnboundLocalError instead of reading the module-level settings dict.
        model_config = AutoConfig.from_pretrained(
            config["base_model"], label2id=label2id, id2label=id2label)
        model = AutoModelForSequenceClassification.from_pretrained(
            config["base_model"], config=model_config).to(device)
        return model
    except Exception as e:
        logger.error(f"Failed to create model: {e}")
        raise

In [None]:
def train_model(dataset_encoded, max_steps=5):
    """
    Fine-tune the module-level model and return the Trainer.

    Reads the module-level `model`, `tokenizer`, `config` and
    `compute_metrics_evaluate`.

    Args:
        dataset_encoded: Tokenized dataset with "train" and "validation"
            splits.
        max_steps: Value passed to TrainingArguments(max_steps=...). The
            default of 5 keeps the value this notebook hard-coded.
            NOTE(review): 5 looks like a smoke-test setting; confirm before
            a real training run.

    Returns:
        Trainer: The trainer after trainer.train() has completed.

    Raises:
        Exception: Any setup or training error is logged and re-raised;
            previously the function returned None and the next cell failed
            on trainer.save_model.
    """
    try:
        training_args = TrainingArguments(output_dir=config["output_dir"],
                                          overwrite_output_dir=True,
                                          max_steps=max_steps,
                                          learning_rate=2e-5,
                                          per_device_train_batch_size=64,
                                          per_device_eval_batch_size=64,
                                          weight_decay=0.01,
                                          eval_strategy="epoch",
                                          disable_tqdm=False,
                                          report_to="none")
        trainer = Trainer(model=model,
                          args=training_args,
                          compute_metrics=compute_metrics_evaluate,
                          train_dataset=dataset_encoded["train"],
                          eval_dataset=dataset_encoded["validation"],
                          processing_class=tokenizer)
        trainer.train()
        return trainer
    except Exception as e:
        logger.error(f"Failed to start training: {e}")
        raise

In [None]:
# Accuracy metric from the `evaluate` library, loaded once and reused by
# compute_metrics_evaluate on every evaluation pass.
accuracy = evaluate.load("accuracy")


def compute_metrics_evaluate(eval_pred):
    """
    Compute accuracy for the Trainer's compute_metrics hook.

    Args:
        eval_pred: Pair that unpacks into (logits, labels).

    Returns:
        The result of accuracy.compute for the argmax predictions.

    Raises:
        Exception: Any error is logged and re-raised so a metric failure is
            not passed on to the caller as a None result.
    """
    try:
        logits, labels = eval_pred
        # The predicted class is the index of the largest logit on the last axis.
        predictions = np.argmax(logits, axis=-1)
        return accuracy.compute(predictions=predictions, references=labels)
    except Exception as e:
        logger.error(f"Error while evaluate: {e}")
        raise


def compute_metrics(pred):
    """
    Compute accuracy and weighted F1 for a prediction object.

    Args:
        pred: Object exposing `label_ids` (reference labels) and
            `predictions` (scores whose last-axis argmax is the predicted class).

    Returns:
        dict: {"accuracy": ..., "f1": ...}.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    weighted_f1 = f1_score(y_true, y_pred, average="weighted")
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": weighted_f1,
    }

In [None]:
# Build the train/test/validation DatasetDict from the CSV at config["data_path"].
dataset = create_dataset()

In [None]:
# Load the tokenizer for config["base_model"]; tokenize() and train_model()
# read this module-level name, so this cell must run before they are called.
tokenizer = AutoTokenizer.from_pretrained(config["base_model"])

In [None]:
# Tokenize every split in batched mode. Per the datasets documentation,
# batch_size=None passes each split to tokenize() as a single batch.
dataset_encoded = dataset.map(tokenize, batched=True, batch_size=None)

In [None]:
# Build the label-name <-> id lookups from the training split; create_model()
# reads both module-level names.
label2id, id2label = create_label_index(dataset)

In [None]:
# Instantiate the classification model; train_model() reads this module-level name.
model = create_model()

In [None]:
# Train on the "train" split, evaluating on the "validation" split.
trainer = train_model(dataset_encoded)

In [None]:
# NOTE(review): "name" looks like a placeholder save location; confirm the
# intended path. The prediction cell at the end passes the same string as its model.
trainer.save_model("name")

In [None]:
def custom_prediction(model, query):
    """
    Classify a query with a text-classification pipeline.

    Args:
        model: Value forwarded to pipeline(model=...); this notebook passes
            the location given to trainer.save_model.
        query: Input forwarded to the pipeline call.

    Returns:
        The pipeline's output for the query.

    Raises:
        Exception: Any pipeline construction or inference error is logged and
            re-raised instead of silently returning None.
    """
    try:
        # A new pipeline is built on every call.
        classifier = pipeline("text-classification", model=model)
        return classifier(query)
    except Exception as e:
        logger.error(f"Prediction failed: {e}")
        raise

In [None]:
# NOTE(review): "name" matches the string passed to trainer.save_model and
# "query" looks like placeholder input; replace with a real text to classify.
custom_prediction("name", "query")