# Importing Model

In [18]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Tokenizer for the same checkpoint name that the classifier is later loaded from.
tokenizer = AutoTokenizer.from_pretrained(
    "google-bert/bert-base-uncased",
)

# Parsing JSON File

In [19]:
import json

# JSON is UTF-8 by specification; an explicit encoding keeps non-ASCII utterances
# from being mis-decoded on platforms whose default locale encoding differs.
with open('intents.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Split the examples into two parallel lists: the utterance and its intent name.
examples = data['rasa_nlu_data']['common_examples']
texts = [item['text'] for item in examples]
intents = [item['intent'] for item in examples]

# Sorting the distinct intent names makes the name <-> id assignment
# deterministic from one run to the next (set iteration order is not).
unique_intents = sorted(set(intents))
intent_to_id = {intent_name: i for i, intent_name in enumerate(unique_intents)}
id_to_intent = {i: intent_name for i, intent_name in enumerate(unique_intents)}

# Integer class label for every example, aligned index-by-index with `texts`.
labels = [intent_to_id[intent] for intent in intents]


# Tokenizing this thing

In [20]:
# Truncate each text to at most 512 tokens, but do not pad here: the
# DataCollatorWithPadding passed to the Trainer pads every batch to the length
# of its own longest sequence, so padding the whole corpus up front to the
# global maximum is redundant and only adds wasted computation.
encodings = tokenizer(texts, truncation=True, max_length=512)


# Converting the JSON to a PyTorch Dataset


In [21]:
import torch

class InvestmentDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer output with one label per example.

    Args:
        encodings: Mapping whose ``.items()`` yields ``(field_name, values)``
            pairs, where ``values[idx]`` is the data for example ``idx``.
        labels: Sequence holding one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``.

        The dict has one entry per field in ``encodings`` plus a ``'labels'``
        entry holding the example's label.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample
# Wrap the tokenized texts and their integer labels in the Dataset class above.
dataset = InvestmentDataset(
    encodings,
    labels,
)


# Making a data collator

In [22]:
from transformers import DataCollatorWithPadding

# Collator handed to the Trainer; it is built from the tokenizer so that it can
# pad the examples of each batch to a common length.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
)


# Setting up Trainer

In [23]:
# AutoModelForSequenceClassification must be imported here: the first cell only
# imports AutoModelForMaskedLM, so without this import the cell raises a
# NameError on a fresh kernel.
from transformers import (
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    set_seed,
)

# Seed before the model is built: the classification head is not part of the
# pretrained checkpoint and is randomly initialized, so without a fixed seed
# every run starts from different weights.
SEED = 42
set_seed(SEED)

# One output logit per distinct intent.
num_labels = len(unique_intents)

model = AutoModelForSequenceClassification.from_pretrained(
    "google-bert/bert-base-uncased",
    num_labels=num_labels,
)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=4,
    learning_rate=5e-5,
    logging_dir='./logs',
    logging_steps=10,   # report the training loss every 10 optimizer steps
    report_to="none",   # do not send logs to any external tracking service
    seed=SEED,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# *Making the model do hoola hoops*

There's a ValueError, so I need to jump through some hoola hoops to make it work now.

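The wrong model class was being called, hehe: the first cell imports `AutoModelForMaskedLM`, but intent classification needs `AutoModelForSequenceClassification`.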

In [24]:
# Fine-tune the classifier on the intent dataset.
trainer.train()

# Persist the fine-tuned model and the tokenizer side by side in one directory.
save_directory = "./trained_model_0.1"
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

print("Hoola Hoops are done")

# Store both label mappings next to the model so predicted ids can be decoded
# later. JSON object keys are always strings, so the integer keys of
# id_to_intent are written as "0", "1", ... and must be converted back to int
# (or looked up as strings) after loading.
label_mappings = {'intent_to_id': intent_to_id, 'id_to_intent': id_to_intent}
with open(f'{save_directory}/mappings.json', 'w') as mappings_file:
    json.dump(label_mappings, mappings_file)


Step,Training Loss
10,2.0076
20,1.7573
30,1.5518
40,1.3684
50,1.1171
60,0.9127
70,0.8399
80,0.8045


Hoola Hoops are done
