In [3]:
import torch
import pandas as pd
import json
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

In [1]:
# Mount Google Drive at /content/drive (requires the Colab runtime, which
# provides the `google.colab` package). Later cells read and write paths
# under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Pick the compute device once: CUDA when a GPU is visible, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Report which device was chosen (and the GPU model name when there is one).
if device.type == "cuda":
    print("GPU Available:", torch.cuda.get_device_name(0))
else:
    print(" No GPU found, using CPU")

GPU Available: Tesla T4


In [39]:
from sklearn.preprocessing import LabelEncoder
from datasets import load_dataset
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split

# Load the "plus" configuration of the clinc_oos dataset.
dataset = load_dataset("clinc_oos", "plus")

# Fit the LabelEncoder on the intents of every split so that no split can
# contain a value the encoder has never seen.
all_intents = [
    intent
    for split_name in ("train", "validation", "test")
    for intent in dataset[split_name]["intent"]
]
encoder = LabelEncoder()
encoder.fit(all_intents)


def encode_labels(example):
    """Add a ``label`` entry holding the encoded ``intent`` value(s).

    Works both on a single example (``example["intent"]`` is a scalar) and on
    a batch as passed by ``map(..., batched=True)`` (``example["intent"]`` is
    a sequence). Encoding a whole batch with one ``encoder.transform`` call
    avoids paying the per-call overhead once per row.

    Args:
        example: Mapping with an ``"intent"`` entry (scalar or sequence).

    Returns:
        The same mapping with ``"label"`` set to the encoded value (single
        example) or to a list of encoded values (batch).
    """
    intents = example["intent"]
    if isinstance(intents, (list, tuple, np.ndarray)):
        # Batched call: encode every intent of the batch at once.
        example["label"] = encoder.transform(intents).tolist()
    else:
        # Single example: transform expects a 1-D sequence, so wrap and unwrap.
        example["label"] = encoder.transform([intents])[0]
    return example


# Encode labels for all splits, one batch at a time.
dataset = dataset.map(encode_labels, batched=True)

# Optional cap on the number of training examples (currently disabled, so the
# whole training split is used).
# dataset["train"] = dataset["train"].select(range(min(10000, len(dataset["train"]))))

# Work on a reproducibly shuffled view of the training split.
train_data = dataset["train"].shuffle(seed=42)

# The first 80% of the shuffled rows stay in training; the trailing 20% is
# held out for validation + test.
n_total = len(train_data)
train_size = int(0.8 * n_total)
temp_data = train_data.select(range(train_size, n_total))

# Cut the held-out rows in half: one half for validation, one half for test.
holdout_splits = temp_data.train_test_split(test_size=0.5, shuffle=True, seed=42)
valid_data = holdout_splits["train"]
test_data = holdout_splits["test"]

# Overwrite the dataset's train/validation/test entries with the new partition.
dataset["train"] = train_data.select(range(train_size))
dataset["validation"] = valid_data
dataset["test"] = test_data

# Show the encoder's class -> encoded id mapping.
encoded_ids = encoder.transform(encoder.classes_)
label_mapping = dict(zip(encoder.classes_, encoded_ids))
print("Label Mapping:", label_mapping)

# Tokenizer matching the pretrained checkpoint that is fine-tuned below.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch.

    Every sequence is truncated and padded to exactly 128 tokens.
    """
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )


tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Drop the raw columns that are no longer needed, skipping any that are
# not present in the training split.
present_columns = tokenized_datasets["train"].column_names
columns_to_remove = ["text", "intent"]
tokenized_datasets = tokenized_datasets.remove_columns(
    [name for name in columns_to_remove if name in present_columns]
)

# Check final dataset
print(tokenized_datasets)


Label Mapping: {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 67: 67, 68: 68, 69: 69, 70: 70, 71: 71, 72: 72, 73: 73, 74: 74, 75: 75, 76: 76, 77: 77, 78: 78, 79: 79, 80: 80, 81: 81, 82: 82, 83: 83, 84: 84, 85: 85, 86: 86, 87: 87, 88: 88, 89: 89, 90: 90, 91: 91, 92: 92, 93: 93, 94: 94, 95: 95, 96: 96, 97: 97, 98: 98, 99: 99, 100: 100, 101: 101, 102: 102, 103: 103, 104: 104, 105: 105, 106: 106, 107: 107, 108: 108, 109: 109, 110: 110, 111: 111, 112: 112, 113: 113, 114: 114, 115: 115, 116: 116, 117: 117, 118: 118, 119: 119, 120:

Map:   0%|          | 0/12200 [00:00<?, ? examples/s]

Map:   0%|          | 0/1525 [00:00<?, ? examples/s]

Map:   0%|          | 0/1525 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 12200
    })
    validation: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1525
    })
    test: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1525
    })
})


In [40]:
# The tokenized splits already live in `tokenized_datasets`, which is what the
# Trainer receives, so `dataset` is not tokenized and re-formatted a second
# time here: that extra pass over every split produced a result nothing used.

# One output class per entry of the label mapping.
num_labels = len(label_mapping)

# Pretrained BERT encoder with a freshly initialised classification head.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [41]:
def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``. The predicted class of each row
            is the argmax of ``logits`` along the last axis.

    Returns:
        dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predictions)
    # With many classes some labels may never be predicted (or never occur) in
    # an evaluation split. zero_division=0 scores those as 0, the same value
    # the default produces, without emitting an UndefinedMetricWarning on
    # every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='weighted', zero_division=0
    )
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

In [None]:
# Number of output classes, computed BEFORE the model is built so this cell
# does not rely on a `num_labels` value left behind by an earlier cell.
num_labels = len(label_mapping)

# Start from the pretrained encoder with a freshly initialised classification
# head, so re-running this cell always trains from the same starting point.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define training arguments: evaluate and checkpoint once per epoch, write
# checkpoints to Google Drive, and disable external experiment loggers.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Colab Notebooks/intent",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    report_to="none"
)

# Train on the tokenized training split; the validation split is scored with
# compute_metrics at the end of every epoch.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)
trainer.train()

# Epoch	Training Loss	Validation Loss	Accuracy	Precision	Recall	F1
# 1	0.679500	0.343185	0.948852	0.951513	0.948852	0.946804
# 2	0.117800	0.107977	0.979016	0.980854	0.979016	0.978865
# 3	0.036300	0.091681	0.979672	0.981326	0.979672	0.979593
# 4	0.017300	0.080347	0.982951	0.984648	0.982951	0.982997
# 5	0.009300	0.083100	0.981639	0.983798	0.981639	0.981764


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6795,0.343185,0.948852,0.951513,0.948852,0.946804
2,0.1178,0.107977,0.979016,0.980854,0.979016,0.978865
3,0.0363,0.091681,0.979672,0.981326,0.979672,0.979593
4,0.0173,0.080347,0.982951,0.984648,0.982951,0.982997
5,0.0093,0.0831,0.981639,0.983798,0.981639,0.981764


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=7625, training_loss=0.4547723792654569, metrics={'train_runtime': 1687.0417, 'train_samples_per_second': 36.158, 'train_steps_per_second': 4.52, 'total_flos': 4017811484928000.0, 'train_loss': 0.4547723792654569, 'epoch': 5.0})

In [None]:
import json

# Run the evaluation pass exactly once and reuse its result. A preceding bare
# trainer.evaluate() call would repeat the whole pass and discard the output.
metrics = trainer.evaluate()

# Show the metrics both as a raw dict and as indented JSON.
print(metrics)
print(json.dumps(metrics, indent=4))
# {'eval_loss': 0.0831003487110138, 'eval_accuracy': 0.981639344262295, 'eval_precision': 0.983798201155964, 'eval_recall': 0.981639344262295, 'eval_f1': 0.9817639314845429, 'eval_runtime': 13.0553, 'eval_samples_per_second': 116.811, 'eval_steps_per_second': 14.63, 'epoch': 5.0}
# {
#     "eval_loss": 0.0831003487110138,
#     "eval_accuracy": 0.981639344262295,
#     "eval_precision": 0.983798201155964,
#     "eval_recall": 0.981639344262295,
#     "eval_f1": 0.9817639314845429,
#     "eval_runtime": 13.0553,
#     "eval_samples_per_second": 116.811,
#     "eval_steps_per_second": 14.63,
#     "epoch": 5.0
# }

{'eval_loss': 0.0831003487110138, 'eval_accuracy': 0.981639344262295, 'eval_precision': 0.983798201155964, 'eval_recall': 0.981639344262295, 'eval_f1': 0.9817639314845429, 'eval_runtime': 13.0553, 'eval_samples_per_second': 116.811, 'eval_steps_per_second': 14.63, 'epoch': 5.0}
{
    "eval_loss": 0.0831003487110138,
    "eval_accuracy": 0.981639344262295,
    "eval_precision": 0.983798201155964,
    "eval_recall": 0.981639344262295,
    "eval_f1": 0.9817639314845429,
    "eval_runtime": 13.0553,
    "eval_samples_per_second": 116.811,
    "eval_steps_per_second": 14.63,
    "epoch": 5.0
}


In [45]:
# Write the fine-tuned model and its tokenizer into the same Drive folder so
# both can later be reloaded from that single path.
save_dir = "/content/drive/MyDrive/Colab Notebooks/mask_intent_classifier2"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
# predictions = trainer.predict(dataset["test"])

('/content/drive/MyDrive/Colab Notebooks/mask_intent_classifier2/tokenizer_config.json',
 '/content/drive/MyDrive/Colab Notebooks/mask_intent_classifier2/special_tokens_map.json',
 '/content/drive/MyDrive/Colab Notebooks/mask_intent_classifier2/vocab.txt',
 '/content/drive/MyDrive/Colab Notebooks/mask_intent_classifier2/added_tokens.json')

In [38]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import json

model_path = "/content/drive/MyDrive/Colab Notebooks/mask_intent_classifier2"
# label_mapping_path = "/content/label_mapping.json"

# Choose the inference device: GPU when available, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Reload the saved tokenizer and classifier from the same folder and place
# the model on the chosen device.
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path).to(device)

# Hard-coded mapping from predicted class id (int) to intent name (str).
# Nothing is loaded from disk here; predict_intent looks the argmax index up
# in this dict to produce a readable label.
# NOTE(review): presumably mirrors the `intent` label order of the dataset the
# saved model was trained on — confirm before relying on it.
label_mapping = {
    0: 'restaurant_reviews',
    1: 'nutrition_info',
    2: 'account_blocked',
    3: 'oil_change_how',
    4: 'time',
    5: 'weather',
    6: 'redeem_rewards',
    7: 'interest_rate',
    8: 'gas_type',
    9: 'accept_reservations',
    10: 'smart_home',
    11: 'user_name',
    12: 'report_lost_card',
    13: 'repeat',
    14: 'whisper_mode',
    15: 'what_are_your_hobbies',
    16: 'order',
    17: 'jump_start',
    18: 'schedule_meeting',
    19: 'meeting_schedule',
    20: 'freeze_account',
    21: 'what_song',
    22: 'meaning_of_life',
    23: 'restaurant_reservation',
    24: 'traffic',
    25: 'make_call',
    26: 'text',
    27: 'bill_balance',
    28: 'improve_credit_score',
    29: 'change_language',
    30: 'no',
    31: 'measurement_conversion',
    32: 'timer',
    33: 'flip_coin',
    34: 'do_you_have_pets',
    35: 'balance',
    36: 'tell_joke',
    37: 'last_maintenance',
    38: 'exchange_rate',
    39: 'uber',
    40: 'car_rental',
    41: 'credit_limit',
    42: 'oos',
    43: 'shopping_list',
    44: 'expiration_date',
    45: 'routing',
    46: 'meal_suggestion',
    47: 'tire_change',
    48: 'todo_list',
    49: 'card_declined',
    50: 'rewards_balance',
    51: 'change_accent',
    52: 'vaccines',
    53: 'reminder_update',
    54: 'food_last',
    55: 'change_ai_name',
    56: 'bill_due',
    57: 'who_do_you_work_for',
    58: 'share_location',
    59: 'international_visa',
    60: 'calendar',
    61: 'translate',
    62: 'carry_on',
    63: 'book_flight',
    64: 'insurance_change',
    65: 'todo_list_update',
    66: 'timezone',
    67: 'cancel_reservation',
    68: 'transactions',
    69: 'credit_score',
    70: 'report_fraud',
    71: 'spending_history',
    72: 'directions',
    73: 'spelling',
    74: 'insurance',
    75: 'what_is_your_name',
    76: 'reminder',
    77: 'where_are_you_from',
    78: 'distance',
    79: 'payday',
    80: 'flight_status',
    81: 'find_phone',
    82: 'greeting',
    83: 'alarm',
    84: 'order_status',
    85: 'confirm_reservation',
    86: 'cook_time',
    87: 'damaged_card',
    88: 'reset_settings',
    89: 'pin_change',
    90: 'replacement_card_duration',
    91: 'new_card',
    92: 'roll_dice',
    93: 'income',
    94: 'taxes',
    95: 'date',
    96: 'who_made_you',
    97: 'pto_request',
    98: 'tire_pressure',
    99: 'how_old_are_you',
    100: 'rollover_401k',
    101: 'pto_request_status',
    102: 'how_busy',
    103: 'application_status',
    104: 'recipe',
    105: 'calendar_update',
    106: 'play_music',
    107: 'yes',
    108: 'direct_deposit',
    109: 'credit_limit_change',
    110: 'gas',
    111: 'pay_bill',
    112: 'ingredients_list',
    113: 'lost_luggage',
    114: 'goodbye',
    115: 'what_can_i_ask_you',
    116: 'book_hotel',
    117: 'are_you_a_bot',
    118: 'next_song',
    119: 'change_speed',
    120: 'plug_type',
    121: 'maybe',
    122: 'w2',
    123: 'oil_change_when',
    124: 'thank_you',
    125: 'shopping_list_update',
    126: 'pto_balance',
    127: 'order_checks',
    128: 'travel_alert',
    129: 'fun_fact',
    130: 'sync_device',
    131: 'schedule_maintenance',
    132: 'apr',
    133: 'transfer',
    134: 'ingredient_substitution',
    135: 'calories',
    136: 'current_location',
    137: 'international_fees',
    138: 'calculator',
    139: 'definition',
    140: 'next_holiday',
    141: 'update_playlist',
    142: 'mpg',
    143: 'min_payment',
    144: 'change_user_name',
    145: 'restaurant_suggestion',
    146: 'travel_notification',
    147: 'cancel',
    148: 'pto_used',
    149: 'travel_suggestion',
    150: 'change_volume'
}



# Intent prediction function
def predict_intent(text):
    """Classify ``text`` and return the name of the predicted intent.

    The text is tokenized (truncated to at most 128 tokens), run through the
    model in evaluation mode without gradient tracking, and the index of the
    highest logit is looked up in ``label_mapping``.

    Args:
        text: The query to classify.

    Returns:
        The intent name for the predicted class id, or ``"Unknown Intent"``
        when that id has no entry in ``label_mapping``.
    """
    model.eval()

    # Tokenize and move every input tensor to the device the model lives on.
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        logits = model(**encoded).logits

    predicted_class = logits.argmax(dim=1).item()
    return label_mapping.get(predicted_class, "Unknown Intent")

# Take user input and predict intent.
# input() blocks until a query is typed, so this cell needs manual
# interaction. The query is echoed first, then classified with
# predict_intent, and finally the predicted intent name is printed.
user_input = input("Enter a query: ")
print(f"Query: {user_input}")
predicted_intent = predict_intent(user_input)
print(f"Predicted Intent: {predicted_intent}")



Enter a query: tell me fun fact about you
Query: tell me fun fact about you
Predicted Intent: fun_fact
