In [None]:
!pip install transformers pandas gradio
!pip install datasets

import pandas as pd
#import torch
from transformers import pipeline
import gradio as gr

Collecting gradio
  Downloading gradio-5.20.1-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.11-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.7.2 (from gradio)
  Downloading gradio_client-1.7.2-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3

In [None]:
# Zero-shot classifier built on the public facebook/bart-large-mnli checkpoint.
classifier = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
)

# Candidate categories handed to the classifier for every ticket.
# "No Support" is the category that marks a ticket as not automatable.
automation_labels = [
    "Incident Detection",
    "Root Cause Analysis (RCA)",
    "Predictive Alerts",
    "Performance Optimization",
    "Security Threat Detection",
    "Change Impact Analysis",
    "Capacity Planning",
    "Anomaly Detection",
    "Automated Remediation",
    "Health Check",
    "No Support",
]

# Function to classify a single ticket
def classify_inc(problem_desc):
    """Classify one incident description against ``automation_labels``.

    Args:
        problem_desc: Text of the incident to classify.

    Returns:
        tuple: ``(automation_possible, best_label)``. ``best_label`` is the
        first entry of the classifier's ``"labels"`` result (the highest
        confidence label); ``automation_possible`` is ``"N"`` when that label
        is ``"No Support"`` and ``"Y"`` otherwise.
    """
    result = classifier(problem_desc, automation_labels)

    best_label = result["labels"][0]  # Highest confidence label

    # Determine if automation is possible (Yes/No)
    automation_possible = "Y" if best_label != "No Support" else "N"

    return automation_possible, best_label

# Function to process uploaded CSV
def process_csv(file):
    """Classify every row of a ticket CSV and write the annotated copy.

    Args:
        file: Path (or file-like object) accepted by ``pd.read_csv``.

    Returns:
        str: ``"processed_inc.csv"`` (the path of the written file) on
        success, or an ``"Error: ..."`` message when the CSV has no
        ``Description`` column.
    """
    df = pd.read_csv(file)

    # Ensure the target column exists
    if "Description" not in df.columns:
        return "Error: CSV must contain a 'Description' column."

    automation_flags = []
    solutions = []
    for description in df["Description"]:
        # Missing or blank cells carry nothing to classify; without this guard a
        # missing value would be sent to the model as the literal text "nan".
        is_blank = pd.isna(description) or not str(description).strip()
        if is_blank:
            flag, solution = "N", "No Support"
        else:
            flag, solution = classify_inc(str(description))
        automation_flags.append(flag)
        solutions.append(solution)

    # Plain column assignment also works when the CSV has a header but no rows.
    df["AUTOMATION Y/N"] = automation_flags
    df["IGNIO SOLUTION"] = solutions

    # Save the processed file
    output_file = "processed_inc.csv"
    df.to_csv(output_file, index=False)

    return output_file

# Gradio interface
def upload_and_process(file):
    """Gradio callback: run ``process_csv`` on the uploaded file.

    Args:
        file: Value supplied by the ``gr.File`` input. ``None`` when nothing
            was uploaded.

    Returns:
        The path returned by ``process_csv``, for the ``gr.File`` output.

    Raises:
        gr.Error: When no file was uploaded, or when ``process_csv`` reports a
            problem through its ``"Error: ..."`` return value.
    """
    if file is None:
        raise gr.Error("Please upload a CSV file before submitting.")

    # Accept both an object exposing `.name` and a plain path string.
    csv_path = getattr(file, "name", file)
    output_file = process_csv(csv_path)

    # process_csv signals failure with a message string; show it to the user
    # instead of handing it to the file output as if it were a path.
    if isinstance(output_file, str) and output_file.startswith("Error:"):
        raise gr.Error(output_file)
    return output_file

# Upload widget for the raw ticket CSV and download widget for the result.
csv_upload = gr.File(label="Upload INC CSV")
csv_download = gr.File(label="Download Processed CSV")

# Tie the widgets to the processing callback.
iface = gr.Interface(
    fn=upload_and_process,
    inputs=csv_upload,
    outputs=csv_download,
    title="Ticket Analysis Tool",
    description="Upload a CSV of customer INCs, and the program will analyze A. Whether or not an AIOps tool can automate the incident and B. Which specific feature will support it.",
)

# share=True requests a shareable demo link.
iface.launch(share=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cuda:0


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://2417184538151cc085.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
from datasets import DatasetDict

# Tokenizer that belongs to the facebook/bart-large-mnli checkpoint.
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-mnli")

# Read the incident CSV from the working directory.
dataset = load_dataset(
    path="csv",
    data_files="./incident_dataset.csv",
    delimiter=",",
)

# Tokenize the dataset with padding and truncation
def tokenize_function(examples):
    """Tokenize the ``Description`` values of a batch.

    Every sequence is padded or truncated to exactly 256 tokens.

    Args:
        examples: Batch mapping that contains a ``'Description'`` entry.

    Returns:
        The tokenizer's output for the batch.
    """
    descriptions = examples["Description"]
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 256,  # Try reducing max_length if needed
    }
    return tokenizer(descriptions, **encoding_options)


tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Label mapping: text found in the 'Automation Label' column -> integer class id
label_map = {
    "Incident Detection": 0,
    "Root Cause Analysis (RCA)": 1,
    "Predictive Alerts": 2,
    "Performance Optimization": 3,
    "Security Threat Detection": 4,
    "Change Impact Analysis": 5,
    "Capacity Planning": 6,
    "Anomaly Detection": 7,
    "Automated Remediation": 8,
    "Health Check": 9,
    "No Support": 10,
}

def map_labels(examples):
    """Convert the ``'Automation Label'`` values of a batch into class ids.

    Surrounding whitespace is ignored when matching against ``label_map``.
    The input batch is not modified.

    Args:
        examples: Batch mapping that contains an ``'Automation Label'`` list.

    Returns:
        dict: ``{"labels": [...]}`` with one integer id per input row.

    Raises:
        ValueError: If any value is missing or not a key of ``label_map``.
            Failing here avoids feeding an out-of-range class id to training.
    """
    normalized = [
        label.strip() if isinstance(label, str) else label
        for label in examples['Automation Label']
    ]

    unknown = sorted({repr(label) for label in normalized if label not in label_map})
    if unknown:
        raise ValueError(
            "Unknown value(s) in 'Automation Label': " + ", ".join(unknown)
        )

    return {"labels": [label_map[label] for label in normalized]}


# Attach the integer class ids produced by map_labels to every example.
tokenized_datasets = tokenized_datasets.map(map_labels, batched=True)

# Hold out 20% of the single "train" split for evaluation, so the model is not
# evaluated on the rows it was trained on.
train_test_valid_dataset = tokenized_datasets["train"].train_test_split(test_size=0.2, seed=42) # seed for reproducibility

tokenized_datasets = DatasetDict({
    'train': train_test_valid_dataset['train'],
    'test': train_test_valid_dataset['test']
})

train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["test"]

# Store readable class names in the model config so the saved checkpoint
# reports them instead of generic LABEL_<n> placeholders.
id2label = {class_id: name for name, class_id in label_map.items()}

# Load the pre-trained BART model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained(
    "facebook/bart-large-mnli",
    num_labels=len(label_map),  # One output per automation label
    id2label=id2label,
    label2id=label_map,
    ignore_mismatched_sizes=True  # The checkpoint's 3-way head is replaced by a new one
)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    report_to=["none"]
)

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()




Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([11]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([11, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.0037,0.001838
2,0.0014,0.000702
3,0.0011,0.000585


TrainOutput(global_step=300, training_loss=0.18693294270895422, metrics={'train_runtime': 301.1627, 'train_samples_per_second': 3.985, 'train_steps_per_second': 0.996, 'total_flos': 652086785433600.0, 'train_loss': 0.18693294270895422, 'epoch': 3.0})

In [None]:
# Write the model and its tokenizer into the same directory.
save_dir = "./fine_tuned_bart"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('./fine_tuned_bart/tokenizer_config.json',
 './fine_tuned_bart/special_tokens_map.json',
 './fine_tuned_bart/vocab.json',
 './fine_tuned_bart/merges.txt',
 './fine_tuned_bart/added_tokens.json',
 './fine_tuned_bart/tokenizer.json')

In [None]:
from transformers import BartForSequenceClassification, BartTokenizer, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score

# Load the fine-tuned model together with the tokenizer that was saved next to
# it, so both come from the same checkpoint directory.
model = BartForSequenceClassification.from_pretrained("./fine_tuned_bart")
tokenizer = BartTokenizer.from_pretrained("./fine_tuned_bart")

# NOTE: `eval_dataset` is not rebuilt here; the evaluation below uses the
# held-out split created in the training cell, which must have been run first.

# Define a compute_metrics function to calculate accuracy
def compute_metrics(p):
    """Compute classification accuracy from an evaluation prediction pair.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` is either the
            logits array or a tuple whose first element is the logits array.

    Returns:
        dict: ``{"accuracy": float}``, the fraction of rows whose arg-max
        prediction equals the label.
    """
    import numpy as np  # local import: numpy is not imported at the top of this cell

    logits, labels = p
    # This model hands back a tuple of arrays rather than a bare array, and a
    # tuple has no .argmax(); the logits are taken from its first element.
    if isinstance(logits, tuple):
        logits = logits[0]
    predictions = np.asarray(logits).argmax(axis=-1)
    accuracy = float((predictions == np.asarray(labels)).mean())
    return {"accuracy": accuracy}

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    per_device_eval_batch_size=8,  # Adjust the batch size as needed
    do_eval=True,
    evaluation_strategy="epoch",  # You can change this to 'steps' if you prefer
    report_to=["none"],  # Same setting as the training cell: no external logging integrations
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,  # Pass the compute_metrics function here
)

# Run evaluation
eval_results = trainer.evaluate()

# Print the evaluation results
print(eval_results)


You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1', '2': 'LABEL_2', '3': 'LABEL_3', '4': 'LABEL_4', '5': 'LABEL_5', '6': 'LABEL_6', '7': 'LABEL_7', '8': 'LABEL_8', '9': 'LABEL_9', '10': 'LABEL_10'}. The number of labels wil be overwritten to 11.


AttributeError: 'tuple' object has no attribute 'argmax'

In [None]:
import shutil

# Pack the saved model directory into fine_tuned_bart.zip.
shutil.make_archive(
    base_name="fine_tuned_bart",
    format="zip",
    root_dir="./fine_tuned_bart",
)

# Hand the archive to the browser through the google.colab files helper.
from google.colab import files
files.download("fine_tuned_bart.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import pandas as pd
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer

# Load the fine-tuned model and tokenizer from local storage
model = AutoModelForSequenceClassification.from_pretrained("./fine_tuned_bart")
tokenizer = AutoTokenizer.from_pretrained("./fine_tuned_bart")

# Define the automation labels.
# The order must match the class ids in `label_map` from the training cell
# (index 0 = "Incident Detection", ..., index 10 = "No Support").
automation_labels = [
    "Incident Detection",
    "Root Cause Analysis (RCA)",
    "Predictive Alerts",
    "Performance Optimization",
    "Security Threat Detection",
    "Change Impact Analysis",
    "Capacity Planning",
    "Anomaly Detection",
    "Automated Remediation",
    "Health Check",
    "No Support"
]

# The saved config only carries generic LABEL_<n> names; attach the readable
# ones so the pipeline reports them directly.
model.config.id2label = dict(enumerate(automation_labels))
model.config.label2id = {name: class_id for class_id, name in model.config.id2label.items()}

# The fine-tuned checkpoint has an 11-way classification head, not an NLI
# (entailment) head, so it is served through the text-classification pipeline.
# The zero-shot pipeline could not find an 'entailment' label in this model.
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

# Function to classify a single ticket
def classify_inc(problem_desc):
    """Classify one incident description with the fine-tuned model.

    Args:
        problem_desc: Text of the incident to classify.

    Returns:
        tuple: ``(automation_possible, best_label)``. ``best_label`` is the
        predicted entry of ``automation_labels``; ``automation_possible`` is
        ``"N"`` when that label is ``"No Support"`` and ``"Y"`` otherwise.
    """
    # Truncate to the same 256-token limit that was used during fine-tuning.
    result = classifier(problem_desc, truncation=True, max_length=256)[0]

    best_label = result["label"]  # Highest scoring class

    # Determine if automation is possible (Yes/No)
    automation_possible = "Y" if best_label != "No Support" else "N"

    return automation_possible, best_label

# Function to process uploaded CSV
def process_csv(file):
    """Classify every row of a ticket CSV and write the annotated copy.

    Args:
        file: Path (or file-like object) accepted by ``pd.read_csv``.

    Returns:
        str: ``"processed_inc.csv"`` (the path of the written file) on
        success, or an ``"Error: ..."`` message when the CSV has no
        ``Description`` column.
    """
    df = pd.read_csv(file)

    # Ensure the target column exists
    if "Description" not in df.columns:
        return "Error: CSV must contain a 'Description' column."

    automation_flags = []
    solutions = []
    for description in df["Description"]:
        # Missing or blank cells carry nothing to classify; without this guard a
        # missing value would be sent to the model as the literal text "nan".
        is_blank = pd.isna(description) or not str(description).strip()
        if is_blank:
            flag, solution = "N", "No Support"
        else:
            flag, solution = classify_inc(str(description))
        automation_flags.append(flag)
        solutions.append(solution)

    # Plain column assignment also works when the CSV has a header but no rows.
    df["AUTOMATION Y/N"] = automation_flags
    df["IGNIO SOLUTION"] = solutions

    # Save the processed file
    output_file = "processed_inc.csv"
    df.to_csv(output_file, index=False)

    return output_file

# Gradio interface
import gradio as gr

def upload_and_process(file):
    """Gradio callback: run ``process_csv`` on the uploaded file.

    Args:
        file: Value supplied by the ``gr.File`` input. ``None`` when nothing
            was uploaded.

    Returns:
        The path returned by ``process_csv``, for the ``gr.File`` output.

    Raises:
        gr.Error: When no file was uploaded, or when ``process_csv`` reports a
            problem through its ``"Error: ..."`` return value.
    """
    if file is None:
        raise gr.Error("Please upload a CSV file before submitting.")

    # Accept both an object exposing `.name` and a plain path string.
    csv_path = getattr(file, "name", file)
    output_file = process_csv(csv_path)

    # process_csv signals failure with a message string; show it to the user
    # instead of handing it to the file output as if it were a path.
    if isinstance(output_file, str) and output_file.startswith("Error:"):
        raise gr.Error(output_file)
    return output_file

# Upload widget for the raw ticket CSV and download widget for the result.
csv_upload = gr.File(label="Upload INC CSV")
csv_download = gr.File(label="Download Processed CSV")

# Tie the widgets to the processing callback.
iface = gr.Interface(
    fn=upload_and_process,
    inputs=csv_upload,
    outputs=csv_download,
    title="Ticket Analysis Tool",
    description="Upload a CSV of customer INCs, and the program will analyze A. Whether or not an AIOps tool can automate the incident and B. Which specific feature will support it.",
)

# share=True requests a shareable demo link.
iface.launch(share=True)


You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1', '2': 'LABEL_2', '3': 'LABEL_3', '4': 'LABEL_4', '5': 'LABEL_5', '6': 'LABEL_6', '7': 'LABEL_7', '8': 'LABEL_8', '9': 'LABEL_9', '10': 'LABEL_10'}. The number of labels wil be overwritten to 11.
Device set to use cuda:0
Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://29ad4943cc744da42f.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


