# Set up the prerequisites

In [None]:
# Upgrade pip and install all the required libraries
! pip install --upgrade pip
! pip install torch==2.3.0+cpu accelerate -f https://download.pytorch.org/whl/torch_stable.html
! pip install tf_keras tensorflow datasets evaluate transformers einops bitsandbytes sentence_transformers ipywidgets


### Check git-lfs is available

If we are using this notebook in a workbench in OpenShift AI, we may need to download the git-lfs binary to work with large models and Hugging Face.

**Please note**: the check is very basic: it simply tries to run the git-lfs binary. If git-lfs is already installed, you can ignore the output of the following code cell.

In [None]:
# Download git-lfs and extract it in the "bin" directory
import requests
import tarfile
import shutil
import os


def download_and_extract(url, target_path, extract_dir, timeout=60):
    """Downloads a tar.gz file and extracts its contents.

    Args:
        url (str): URL of the tar.gz file.
        target_path (str): Path where the downloaded file will be saved.
        extract_dir (str): Directory where the contents will be extracted.
        timeout (float): Timeout in seconds passed to ``requests.get`` so a
            stalled server cannot hang the notebook forever.

    Raises:
        requests.HTTPError: If the server answers with an HTTP error status
            (raised by ``response.raise_for_status()``).
    """

    # Stream the body so large files are written chunk by chunk; the context
    # manager releases the connection even if writing to disk fails.
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()  # Check for HTTP errors

        with open(target_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)

    print(f"Downloaded to {target_path}")

    with tarfile.open(target_path, 'r:gz') as tar:
        if hasattr(tarfile, "data_filter"):
            # The "data" filter rejects archive members that would be written
            # outside extract_dir (absolute paths, "..", unsafe links).
            tar.extractall(extract_dir, filter="data")
        else:
            # Older Python without extraction filters: keep the plain behaviour.
            tar.extractall(extract_dir)

    print(f"Extracted to {extract_dir}")

# Basic availability probe: run the git-lfs binary and treat a non-zero exit
# status as "not installed / not on PATH".
if os.system("git-lfs") != 0:
    # Clean up any previous download. Every leftover is removed independently,
    # so a missing "./bin" no longer prevents the stale directory and archive
    # from being deleted too.
    shutil.rmtree("./bin", ignore_errors=True)
    shutil.rmtree("./git-lfs-3.5.1", ignore_errors=True)
    try:
        os.remove("git-lfs-linux-amd64-v3.5.1.tar.gz")
    except FileNotFoundError:
        pass  # No stale archive to remove
    except OSError as e:
        # Best effort: report the problem but keep going
        print(f"Could not remove the previous archive: {e}")

    # Our variables
    url = "https://github.com/git-lfs/git-lfs/releases/download/v3.5.1/git-lfs-linux-amd64-v3.5.1.tar.gz"
    target_path = "git-lfs-linux-amd64-v3.5.1.tar.gz"
    extract_dir = "./"

    # Download the file and extract it
    download_and_extract(url, target_path, extract_dir)

    # Create the ./bin directory (tolerate a directory that survived the cleanup)
    os.makedirs("./bin/", exist_ok=True)

    # Move the git-lfs binary in the ./bin directory
    shutil.move("./git-lfs-3.5.1/git-lfs", "./bin/")

    # Add ./bin directory to the front of the PATH env variable; an empty PATH
    # is skipped so no trailing separator is produced.
    current_path = os.environ.get('PATH', '')
    new_path = os.pathsep.join(
        part for part in (os.path.abspath("./bin"), current_path) if part
    )

    os.environ['PATH'] = new_path  # Update PATH in the current Python process


# Login to Hugging Face

Choose whether you want to log in to your Hugging Face account so you can upload and share your model with the community. First select your choice in the dropdown below; if you choose to log in, enter your token when prompted:

In [None]:
import ipywidgets as widgets
from huggingface_hub import login

# Selector used to decide whether to log in to Hugging Face. The options are the
# strings "True"/"False" (not booleans) and the default selection is "False".
dropdown = widgets.Dropdown(
    options=["True", "False"],
    value="False",
    description='Choose: ',
    disabled=False,
)

# Show the widget in the cell output; the next cell reads `dropdown.value`.
display(dropdown)

In [None]:
import getpass

# The dropdown stores its selection as a string, so compare against 'True'
login2hf = (dropdown.value == 'True')

if login2hf:
    # getpass hides the token while it is typed, so the secret is not echoed
    # into the cell output and saved together with the notebook.
    hf_token = getpass.getpass("Please enter your HuggingFace token: ")
    login(token=hf_token, write_permission=True)

## Load Tickets dataset

Start by loading the Tickets dataset from the Hugging Face Hub (the code cell also shows how to load it from local CSV files instead):

In [None]:
from datasets import load_dataset

# I've created a Hugging Face dataset, but please refer to the following (commented) line in case you want to load data from local CSV files:
# tickets = load_dataset("csv", data_files={"train": "900tickets-300WS-300DB-300FS-train.csv", "test": "900tickets-300WS-300DB-300FS-test.csv"})

# Replace the dataset name with your own if you have one
tickets = load_dataset("alezzandro/itsm_tickets")

## Preprocess

The next step is to load a DistilBERT tokenizer to preprocess the text:

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer of the "distilbert-base-uncased" checkpoint, the same
# checkpoint name the classification model is loaded from later on.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

Create a preprocessing function to tokenize the text and truncate sequences to be no longer than DistilBERT's maximum input length:

In [None]:
def preprocess_function(examples):
    """Tokenize the "text" field of the given examples.

    Truncation is enabled, so inputs that are too long are cut by the
    tokenizer. The result of the tokenizer call is returned unchanged.
    """
    ticket_texts = examples["text"]
    encoded = tokenizer(ticket_texts, truncation=True)
    return encoded

To apply the preprocessing function over the entire dataset, use 🤗 Datasets [map](https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.map) function. You can speed up `map` by setting `batched=True` to process multiple elements of the dataset at once:

In [None]:
# Tokenize the whole dataset; batched=True hands preprocess_function several
# examples per call instead of one at a time.
tokenized_tickets = tickets.map(preprocess_function, batched=True)

Now create a batch of examples using [DataCollatorWithPadding](https://huggingface.co/docs/transformers/main/en/main_classes/data_collator#transformers.DataCollatorWithPadding). It's more efficient to *dynamically pad* the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length.

In [None]:
from transformers import DataCollatorWithPadding

# Collator that pads the examples of each batch using the tokenizer; it is
# handed to the Trainer further down.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Evaluate

Including a metric during training is often helpful for evaluating your model's performance. You can quickly load an evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy) metric (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric):

In [None]:
import evaluate

# Load the "accuracy" metric; compute_metrics calls accuracy.compute(...) on it.
accuracy = evaluate.load("accuracy")

Then create a function that passes your predictions and labels to [compute](https://huggingface.co/docs/evaluate/main/en/package_reference/main_classes#evaluate.EvaluationModule.compute) to calculate the accuracy:

In [None]:
import numpy as np


def compute_metrics(eval_pred):
    """Compute the accuracy for one evaluation pass.

    Args:
        eval_pred: A pair ``(predictions, labels)``. The index of the largest
            value along axis 1 of ``predictions`` is used as the predicted
            class.

    Returns:
        The result of ``accuracy.compute`` for the predicted classes against
        the reference labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

Your `compute_metrics` function is ready to go now, and you'll return to it when you set up your training.

## Train

Before you start training your model, create a map of the expected ids to their labels with `id2label` and `label2id`:

In [None]:
# Class index -> ticket category name
id2label = dict(enumerate(["WebServer", "Database", "Filesystem"]))
# Reverse lookup, derived from id2label so the two mappings always agree
label2id = {name: idx for idx, name in id2label.items()}

<Tip>

If you aren't familiar with finetuning a model with the [Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer), take a look at the basic tutorial [here](https://huggingface.co/docs/transformers/main/en/tasks/../training#train-with-pytorch-trainer)!

</Tip>

You're ready to start training your model now! Load DistilBERT with [AutoModelForSequenceClassification](https://huggingface.co/docs/transformers/main/en/model_doc/auto#transformers.AutoModelForSequenceClassification) along with the number of expected labels, and the label mappings:

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the checkpoint for sequence classification. The number of labels is
# taken from id2label (three entries) instead of being repeated as a literal.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

At this point, only three steps remain:

1. Define your training hyperparameters in [TrainingArguments](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments). The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model); in this notebook the flag is set from `login2hf`, so the push only happens if you chose to log in. At the end of each epoch, the [Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer) will evaluate the accuracy and save the training checkpoint.
2. Pass the training arguments to [Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer) along with the model, dataset, tokenizer, data collator, and `compute_metrics` function.
3. Call [train()](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer.train) to finetune your model.

In [None]:
import inspect

# Recent transformers releases call this argument `eval_strategy`; older ones
# only accept `evaluation_strategy`. Use whichever name the installed version
# understands so the cell works with both.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./models/itsm_tickets",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    # Evaluate and save a checkpoint at the end of every epoch
    **{_eval_strategy_key: "epoch"},
    save_strategy="epoch",
    load_best_model_at_end=True,
    hub_model_id="alezzandro/itsm_tickets",
    # Only push to the Hub when the user chose to log in earlier
    push_to_hub=login2hf,
)

# NOTE(review): recent transformers releases deprecate the Trainer `tokenizer`
# argument in favour of `processing_class` - confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_tickets["train"],
    eval_dataset=tokenized_tickets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

<Tip>

[Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer) applies dynamic padding by default when you pass `tokenizer` to it. In this case, you don't need to specify a data collator explicitly.

</Tip>

Once training is completed, share your model to the Hub with the [push_to_hub()](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer.push_to_hub) method so everyone can use your model:

In [None]:
# Upload only when the user chose to log in to Hugging Face earlier.
if login2hf:
    trainer.push_to_hub()

## Testing the model

Great, now that you've finetuned a model, you can use it!

Grab some text you'd like to test:

In [None]:
# Sample ticket description that is passed to the classifier in the next code cell.
text = "Web server logs indicate multiple 404 Not Found errors for resources that should exist.  File paths appear correct in the codebase. Need to investigate potential caching issues, configuration mismatches, or incorrect deployments."

The simplest way to try out your finetuned model for inference is to use it in a [pipeline()](https://huggingface.co/docs/transformers/main/en/main_classes/pipelines#transformers.pipeline). Instantiate a `pipeline` for text classification with your model, and pass your text to it:

In [None]:
from transformers import pipeline

# The model is loaded by name ("alezzandro/itsm_tickets"). Replace the name with
# your own model, or with a local directory path, if you trained or changed the model yourself.
classifier = pipeline("text-classification", model="alezzandro/itsm_tickets")
result = classifier(text)

print(result)

## Converting the model to OpenVINO format

First of all, let's download the required dependencies:

In [None]:
! pip install openvino
! pip install -q "git+https://github.com/huggingface/optimum-intel.git" onnx

Then we are ready to start the conversion:

In [None]:
from optimum.intel.openvino import OVModelForSequenceClassification
# If you are using a local model, replace the model name with yours or the directory path.
# export=True asks for the conversion to the OpenVINO format while loading.
model = OVModelForSequenceClassification.from_pretrained("alezzandro/itsm_tickets", export=True)

# exist_ok=True keeps this cell re-runnable when the directory already exists
os.makedirs("./models2s3/itsm_tickets_ovir", exist_ok=True)

model.save_pretrained("./models2s3/itsm_tickets_ovir")