<a href="https://colab.research.google.com/github/agdev/Routing/blob/main/Routing_Comparison_Finetuned_vs_Semantic_Router.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Installing dependencies**

In [32]:
!pip install datasets --quiet
!pip install transformers --quiet
!pip install transformers[torch] --quiet
!pip install -qU semantic-router
!pip install evaluate --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25h

# **Importing**

In [2]:
from datasets import load_dataset
from sklearn.model_selection import train_test_split

# **Load dataset**

In [3]:
dataset_name = "bitext/Bitext-customer-support-llm-chatbot-training-dataset"
# Load the dataset
base_ds = load_dataset(dataset_name, split="train")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/11.9k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/19.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/26872 [00:00<?, ? examples/s]

In [4]:
base_df = base_ds.to_pandas()
base_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26872 entries, 0 to 26871
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   flags        26872 non-null  object
 1   instruction  26872 non-null  object
 2   category     26872 non-null  object
 3   intent       26872 non-null  object
 4   response     26872 non-null  object
dtypes: object(5)
memory usage: 1.0+ MB


In [5]:
base_df.head()

Unnamed: 0,flags,instruction,category,intent,response
0,B,question about cancelling order {{Order Number}},ORDER,cancel_order,I've understood you have a question regarding ...
1,BQZ,i have a question about cancelling oorder {{Or...,ORDER,cancel_order,I've been informed that you have a question ab...
2,BLQZ,i need help cancelling puchase {{Order Number}},ORDER,cancel_order,I can sense that you're seeking assistance wit...
3,BL,I need to cancel purchase {{Order Number}},ORDER,cancel_order,I understood that you need assistance with can...
4,BCELN,"I cannot afford this order, cancel purchase {{...",ORDER,cancel_order,I'm sensitive to the fact that you're facing f...


In [6]:
base_df['category'].unique()

array(['ORDER', 'SHIPPING', 'CANCEL', 'INVOICE', 'PAYMENT', 'REFUND',
       'FEEDBACK', 'CONTACT', 'ACCOUNT', 'DELIVERY', 'SUBSCRIPTION'],
      dtype=object)

In [7]:
train_df, test_df = train_test_split(base_df, test_size=0.2, random_state=42)

# **Init Semantic Router**

In [8]:
from semantic_router import Route
from pandas import DataFrame
CATEGORY_COL: str = "category"
INSTRUCTION_COL: str = "instruction"

def create_routes(df: DataFrame, groupby_col: str):
  """Build one ``Route`` per distinct value found in ``groupby_col``.

  Each route is named after its group key and receives every value of the
  ``INSTRUCTION_COL`` column within that group as its utterances.

  Returns:
    A list of ``Route`` objects, one per group.
  """
  return [
      Route(name=key, utterances=rows[INSTRUCTION_COL].tolist())
      for key, rows in df.groupby(groupby_col)
  ]

routes = create_routes(train_df, CATEGORY_COL)

* 'allow_population_by_field_name' has been renamed to 'populate_by_name'
* 'smart_union' has been removed


# **Init Encoder**

In [9]:
import os
from getpass import getpass
from semantic_router.encoders import HuggingFaceEncoder

# Read the Hugging Face token from Colab secrets when possible, otherwise ask
# for it interactively. `userdata.get` raises (it does not return None) when
# the secret is missing or notebook access was not granted, so the lookup must
# be guarded for the `getpass` fallback to ever run.
try:
    from google.colab import userdata
except ImportError:  # not running inside Colab
    userdata = None

hf_key = None
if userdata is not None:
    try:
        hf_key = userdata.get('HuggingFace')
    except (userdata.SecretNotFoundError, userdata.NotebookAccessError):
        hf_key = None

# Keep the prompted value in `hf_key` as well: a later cell passes it to `login`.
hf_key = hf_key or getpass(
    "Enter Hugging Face API Key: "
)
os.environ["HUGGINGFACE_API_KEY"] = hf_key

encoder = HuggingFaceEncoder()

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

# **Init RouteLayer**

In [10]:
from semantic_router.layer import RouteLayer

rl = RouteLayer(encoder=encoder, routes=routes)

# **Define Testing Inputs**

In [44]:
# text_samples = [
#     "I reqeust immediate refund",
#     "I was billed incorrectly",
#     "Where do I leave a tip",
#     "Not worth the money, would not buy again. I want to cancel order.",
#     "I would like to speak with the manager"
# ]
import io

csv_data = """Text,Category
"Can you expedite my shipping? I need the item urgently.","SHIPPING"
"I'd like to cancel my subscription. How do I do that?","SUBSCRIPTION"
"The product I received is damaged. I want a full refund.","REFUND"
"How can I update my billing information for future orders?","PAYMENT"
"I never received an invoice for my last purchase. Can you send it?","INVOICE"
"Your delivery person was extremely rude. I want to file a complaint.","FEEDBACK"
"I accidentally placed two identical orders. Please cancel one of them.","CANCEL"
"What's the status of my order? It's been a week and I haven't received any updates.","ORDER"
"How do I change the delivery address for my upcoming shipment?","DELIVERY"
"I forgot my account password. Can you help me reset it?","ACCOUNT"
"I'd like to speak with a customer service representative about my recent experience.","CONTACT"
"Can I get a copy of my payment receipt for tax purposes?","INVOICE"
"Your website charged me twice for the same item. Please fix this and refund me.","PAYMENT"
"I want to leave a positive review for the excellent service I received. Where can I do that?","FEEDBACK"
"My package was supposed to arrive yesterday but it's still not here. What's going on?","SHIPPING"
"How do I cancel my recurring monthly subscription?","SUBSCRIPTION"
"I received the wrong item in my order. What's the return process?","ORDER"
"Can you provide me with the contact information for your corporate office?","CONTACT"
"I'd like to change the payment method associated with my account.","ACCOUNT"
"The refund I was promised hasn't shown up in my bank account yet. Please look into this.","REFUND"
"Is it possible to get same-day delivery for my order?","DELIVERY"
"I'm having trouble accessing my online account. Can someone assist me?","ACCOUNT"
"Your product arrived late and now I don't need it. I want to return it for a full refund.","CANCEL"
"How long does it typically take for a refund to be processed?","REFUND"
"Can I get an itemized invoice for my last three orders?","INVOICE"
"""
# `pd` is otherwise only imported in a later cell, so without this import a
# fresh "Restart & Run All" fails here with NameError.
import pandas as pd

# Using io.StringIO to simulate a file object
csv_file = io.StringIO(csv_data)

# Reading the CSV data into a DataFrame (columns: 'Text', 'Category')
text_samples_df = pd.read_csv(csv_file)
# print(text_samples_df.columns)
# Display the DataFrame
print(text_samples_df)

Index(['Text', 'Category'], dtype='object')
                                                 Text      Category
0   Can you expedite my shipping? I need the item ...      SHIPPING
1   I'd like to cancel my subscription. How do I d...  SUBSCRIPTION
2   The product I received is damaged. I want a fu...        REFUND
3   How can I update my billing information for fu...       PAYMENT
4   I never received an invoice for my last purcha...       INVOICE
5   Your delivery person was extremely rude. I wan...      FEEDBACK
6   I accidentally placed two identical orders. Pl...        CANCEL
7   What's the status of my order? It's been a wee...         ORDER
8   How do I change the delivery address for my up...      DELIVERY
9   I forgot my account password. Can you help me ...       ACCOUNT
10  I'd like to speak with a customer service repr...       CONTACT
11  Can I get a copy of my payment receipt for tax...       INVOICE
12  Your website charged me twice for the same ite...       PAYMENT
13  

In [28]:
from pprint import pprint

# Send every sample text through the route layer, keeping one result per row
navs = [rl(sample) for sample in text_samples_df["Text"]]

pprint(navs)

[RouteChoice(name='SHIPPING', function_call=None, similarity_score=None),
 RouteChoice(name='SUBSCRIPTION', function_call=None, similarity_score=None),
 RouteChoice(name='REFUND', function_call=None, similarity_score=None),
 RouteChoice(name='ORDER', function_call=None, similarity_score=None),
 RouteChoice(name='INVOICE', function_call=None, similarity_score=None),
 RouteChoice(name='FEEDBACK', function_call=None, similarity_score=None),
 RouteChoice(name='ORDER', function_call=None, similarity_score=None),
 RouteChoice(name='DELIVERY', function_call=None, similarity_score=None),
 RouteChoice(name='SHIPPING', function_call=None, similarity_score=None),
 RouteChoice(name='ACCOUNT', function_call=None, similarity_score=None),
 RouteChoice(name='CONTACT', function_call=None, similarity_score=None),
 RouteChoice(name='INVOICE', function_call=None, similarity_score=None),
 RouteChoice(name=None, function_call=None, similarity_score=None),
 RouteChoice(name='FEEDBACK', function_call=None, si

In [13]:
# from transformers import AutoTokenizer
from huggingface_hub import login
login(token = hf_key, add_to_git_credential=True)

# # Load the tokenizer for ALBERT
# model_name: str = "vineetsharma/customer-support-intent-albert"
# tokenizer = AutoTokenizer.from_pretrained(model_name)

# tokenizer.save_pretrained("AIEnthusiast369/customer-support-categ_classification-albert")

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# import os
# print(os.getcwd())

/content


# **Loading Model and Classifying Data with Model**

In [51]:
# prompt: load this model "AIEnthusiast369/customer-support-categ_classification-albert" from hugging face and run the model to classify text_samples
from pprint import pprint
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
import traceback
from huggingface_hub import hf_hub_download

fine_tuned_model_name: str = "AIEnthusiast369/customer-support-categ_classification-albert_v2"
base_model_name: str = "vineetsharma/customer-support-intent-albert"
classifications = []
label2Id = None
try:
    model = AutoModelForSequenceClassification.from_pretrained(fine_tuned_model_name)
    tokenizer = AutoTokenizer.from_pretrained(fine_tuned_model_name)
    # Label-name -> id mapping from the model config; `convert_labels_2_ids`
    # reads it later when accuracy is computed.
    label2Id = model.config.label2id
    classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

    # One pipeline call per sample. Each call yields a list such as
    # [{'label': ..., 'score': ...}], which the combining cell indexes with [0].
    for text in text_samples_df["Text"]:
        classifications.append(classifier(text))

    pprint(classifications)
except Exception as e:
    print(f"An error occurred: {e}")
    # Re-raise instead of continuing: the following cells need a populated
    # `classifications` list and a non-None `label2Id`, and would otherwise
    # fail later with errors unrelated to the real cause. The notebook shows
    # the full traceback for the re-raised exception.
    raise

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[[{'label': 'DELIVERY', 'score': 0.999439537525177}],
 [{'label': 'SUBSCRIPTION', 'score': 0.9991925358772278}],
 [{'label': 'REFUND', 'score': 0.9999253749847412}],
 [{'label': 'ORDER', 'score': 0.8836807608604431}],
 [{'label': 'INVOICE', 'score': 0.9998661279678345}],
 [{'label': 'FEEDBACK', 'score': 0.9992051720619202}],
 [{'label': 'ORDER', 'score': 0.9996504783630371}],
 [{'label': 'ORDER', 'score': 0.9998818635940552}],
 [{'label': 'SHIPPING', 'score': 0.9993603825569153}],
 [{'label': 'ACCOUNT', 'score': 0.9999545812606812}],
 [{'label': 'CONTACT', 'score': 0.9998041987419128}],
 [{'label': 'PAYMENT', 'score': 0.9999163150787354}],
 [{'label': 'REFUND', 'score': 0.9999079704284668}],
 [{'label': 'FEEDBACK', 'score': 0.9998249411582947}],
 [{'label': 'DELIVERY', 'score': 0.9997318387031555}],
 [{'label': 'SUBSCRIPTION', 'score': 0.9991204142570496}],
 [{'label': 'ORDER', 'score': 0.9998646974563599}],
 [{'label': 'CONTACT', 'score': 0.9980756044387817}],
 [{'label': 'PAYMENT', '

# **Combining data for easier visual comparison**

In [57]:
import pandas as pd

# Fine-tuned model: keep the first entry of each pipeline result as (label, score)
classification_data = []
for result in classifications:
    top = result[0]
    classification_data.append((top['label'], top['score']))
df_classifications = pd.DataFrame(
    classification_data,
    columns=['Model_Label', 'Model_Score'],
)

# Semantic router: chosen route name and similarity score per sample
nav_data = []
for choice in navs:
    nav_data.append((choice.name, choice.similarity_score))
df_navs = pd.DataFrame(
    nav_data,
    columns=['NAV_Name', 'NAV_Similarity_Score'],
)

# Side-by-side frame: samples + model predictions + router predictions
combined_df = pd.concat([text_samples_df, df_classifications, df_navs], axis=1)
combined_df.columns = combined_df.columns.str.strip()

# Compact view with only the expected label and the two predicted labels
label_columns = [
    text_samples_df['Category'],
    df_classifications['Model_Label'],
    df_navs['NAV_Name'],
]
labels_df = pd.concat(label_columns, axis=1)

print(labels_df)


# for i in range(len(text_samples)):
#   print(f"Text: {text_samples[i]}")
#   print(f"Classification: {classifications[i]}")
#   print(f"Navigation: {navs[i]}")
#   print("-" * 20)


        Category   Model_Label      NAV_Name
0       SHIPPING      DELIVERY      SHIPPING
1   SUBSCRIPTION  SUBSCRIPTION  SUBSCRIPTION
2         REFUND        REFUND        REFUND
3        PAYMENT         ORDER         ORDER
4        INVOICE       INVOICE       INVOICE
5       FEEDBACK      FEEDBACK      FEEDBACK
6         CANCEL         ORDER         ORDER
7          ORDER         ORDER      DELIVERY
8       DELIVERY      SHIPPING      SHIPPING
9        ACCOUNT       ACCOUNT       ACCOUNT
10       CONTACT       CONTACT       CONTACT
11       INVOICE       PAYMENT       INVOICE
12       PAYMENT        REFUND          None
13      FEEDBACK      FEEDBACK      FEEDBACK
14      SHIPPING      DELIVERY      DELIVERY
15  SUBSCRIPTION  SUBSCRIPTION  SUBSCRIPTION
16         ORDER         ORDER         ORDER
17       CONTACT       CONTACT      FEEDBACK
18       ACCOUNT       PAYMENT       PAYMENT
19        REFUND        REFUND        REFUND
20      DELIVERY      DELIVERY      DELIVERY
21       A

In [34]:
import evaluate

metric = evaluate.load("accuracy")

def compute_accuracy(predictions, labels):
    """Score ``predictions`` against ``labels`` using the loaded ``metric``.

    ``labels`` is passed to ``metric.compute`` as the references; the value
    returned by ``metric.compute`` is handed back unchanged.
    """
    result = metric.compute(references=labels, predictions=predictions)
    return result

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [54]:
def convert_labels_2_ids(labels, label_map=None, unknown_id=-1):
  """Translate label names into integer ids.

  Args:
    labels: iterable of label names (for example a DataFrame column).
    label_map: optional name -> id mapping. When omitted, the notebook-level
      ``label2Id`` (set from the fine-tuned model's config) is used.
    unknown_id: id emitted for any label that is not a key of the mapping,
      e.g. ``None`` for a sample the router did not assign to a route.

  Returns:
    A list holding one id per input label, in input order.

  Raises:
    ValueError: if no mapping is available, i.e. ``label_map`` was not given
      and ``label2Id`` is still ``None``.
  """
  if label_map is None:
    label_map = label2Id
  if label_map is None:
    raise ValueError(
        "No label mapping available: label2Id is None. "
        "Run the model-loading cell first or pass label_map explicitly."
    )
  # NOTE(review): unknown predictions and unknown references both map to
  # `unknown_id`, so they compare equal when the ids are scored for accuracy.
  return [label_map.get(label, unknown_id) for label in labels]

# **Calculating Accuracy**

In [55]:
# print(combined_df.columns)
# Score both predictors against the same reference ids:
# first the fine-tuned model column, then the semantic router column.
reference_ids = convert_labels_2_ids(combined_df['Category'])
for prediction_col in ('Model_Label', 'NAV_Name'):
    predicted_ids = convert_labels_2_ids(combined_df[prediction_col])
    print(compute_accuracy(predicted_ids, reference_ids))

{'accuracy': 0.64}
{'accuracy': 0.64}
