### Domain Classification

In [1]:
import torch
from torch import nn
from transformers import AutoModel, AutoTokenizer, AutoConfig
from huggingface_hub import PyTorchModelHubMixin
import json
import pandas as pd
from tqdm import tqdm
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class CustomModel(nn.Module, PyTorchModelHubMixin):
    """Pretrained encoder followed by dropout and a linear classification head.

    The encoder is loaded from ``config["base_model"]``; the head emits one
    score per entry of ``config["id2label"]``.
    """

    def __init__(self, config):
        """Build the encoder, the dropout layer and the classification head.

        Args:
            config: Mapping that provides ``"base_model"``, ``"fc_dropout"``
                and ``"id2label"``.
        """
        super().__init__()
        # Attribute names are kept as-is: they are the keys pretrained
        # weights are matched against.
        self.model = AutoModel.from_pretrained(config["base_model"])
        self.dropout = nn.Dropout(config["fc_dropout"])
        num_labels = len(config["id2label"])
        self.fc = nn.Linear(self.model.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return per-class probabilities for each sequence in the batch.

        The head is applied to every position of the encoder's last hidden
        state; only position 0 of the sequence axis is kept and normalised
        with a softmax over the class axis.
        """
        encoded = self.model(input_ids=input_ids, attention_mask=attention_mask)
        logits = self.fc(self.dropout(encoded.last_hidden_state))
        first_position_logits = logits[:, 0, :]
        return torch.softmax(first_position_logits, dim=1)

In [3]:
# Load the classifier's config, tokenizer and weights from one Hub repository.
MODEL_ID = "nvidia/domain-classifier"

config = AutoConfig.from_pretrained(MODEL_ID)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = CustomModel.from_pretrained(MODEL_ID)

# The model is left on its default device (no .to(...) call is made).
# eval() puts the module in evaluation mode and returns it, so the cell
# output is the module summary.
model.eval()

CustomModel(
  (model): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
              (dro

##### Sample Run

In [None]:
# Prepare and process inputs: tokenize two example sentences as one padded batch
text_samples = ["Sports is a popular domain", "Politics is a popular domain"]
inputs = tokenizer(text_samples, return_tensors="pt",
                   padding="longest", truncation=True)

# Inference only: disable gradient tracking so no autograd graph is built
# and kept alive alongside the output probabilities.
with torch.no_grad():
    outputs = model(inputs["input_ids"], inputs["attention_mask"])

In [6]:
# Take the highest-probability class per row, translate it to its label, and show it
predicted_classes = outputs.argmax(dim=1)
predicted_domains = [config.id2label[idx] for idx in predicted_classes.tolist()]
print(predicted_domains)

['Sports', 'News']


##### Extracting domains

In [4]:
# Load the sampled dataset; it is iterated below as a sequence of rows keyed
# by field name. JSON text is UTF-8, so decode it explicitly instead of
# relying on the platform's default encoding.
with open("../data/full_data_sampled_gpt2_with_subjects.json", "r", encoding="utf-8") as f:
    dataset = json.load(f)

# Whitespace-trimmed "target_new" value of every row
target_new = [row["target_new"].strip() for row in dataset]

In [5]:
def extract_domains(text_samples, batch_size):
    """Predict a domain label for every text, running the model in batches.

    Uses the notebook-level ``tokenizer``, ``model`` and ``config``.

    Args:
        text_samples: Sequence of strings to classify; must support ``len``
            and slicing.
        batch_size: Number of texts per forward pass; must be at least 1.

    Returns:
        A list with one ``config.id2label`` label per input text, in input
        order. Empty input yields an empty list.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Send inputs to wherever the model's weights live, so the function keeps
    # working if the model is moved off the CPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    predicted_domains = []
    for start in tqdm(range(0, len(text_samples), batch_size)):
        batch = text_samples[start:start + batch_size]
        inputs = tokenizer(batch, return_tensors="pt",
                           padding="longest", truncation=True)

        # Inference only: without no_grad every batch output would carry its
        # autograd graph, and memory would grow with each batch processed.
        with torch.no_grad():
            probs = model(inputs["input_ids"].to(device),
                          attention_mask=inputs["attention_mask"].to(device))

        # Reduce to class indices right away; only the labels are kept, not
        # the per-batch probability tensors.
        batch_classes = torch.argmax(probs, dim=1).tolist()
        predicted_domains.extend(config.id2label[idx] for idx in batch_classes)

    return predicted_domains

In [7]:
# Number of prompts sent through the model per forward pass
batch_size = 2

# Classify the base prompt of every dataset row
text_samples = [row["base_prompt"] for row in dataset]
predicted_domains = extract_domains(text_samples, batch_size)

  1%|          | 46/5000 [00:18<51:01,  1.62it/s]  

: 

In [29]:
# Show each prompt next to its predicted domain
for txt, dom in zip(text_samples, predicted_domains):
    print(f"{txt} - {dom}")

Toyota Camry XV30 is a product of - Autos_and_Vehicles
Chrysler RFE transmission, produced by - Autos_and_Vehicles
Seattle City Light is based in - Travel_and_Transportation
Chevrolet Constantia is produced by - Autos_and_Vehicles
Chrysler ecoVoyager, developed by - Autos_and_Vehicles
Toyota Sprinter Carib is produced by - Autos_and_Vehicles
Google Workspace, developed by - Computers_and_Electronics
Renault 18, created by - Autos_and_Vehicles
IBM 704, created by - Computers_and_Electronics
Intel Arc is owned by - Computers_and_Electronics
Airbus A318 is created by - Autos_and_Vehicles
Fiat Brevetti, developed by - Autos_and_Vehicles
The official language of South Africa is - Jobs_and_Education
Game Boy Color is produced by - Games
Renault Twingo, produced by - Autos_and_Vehicles
Masashi Kishimoto, a citizen of - Arts_and_Entertainment
Honda CB650SC is produced by - Autos_and_Vehicles
Isaac Newton works in the area of - Science
Metro Manila's capital, - Travel_and_Transportation
Koji Mu

In [None]:
# Sanity check: the two counts should match (one prediction per dataset row)
tuple(map(len, (predicted_domains, dataset)))

In [None]:
# Attach each prediction to its row, then save the enriched dataset.
# Check alignment first: with fewer predictions than rows the loop would fail
# midway and leave only some rows labelled; with more, the surplus would be
# dropped silently.
if len(predicted_domains) != len(dataset):
    raise ValueError(
        f"Expected one predicted domain per row, got {len(predicted_domains)} "
        f"predictions for {len(dataset)} rows"
    )

for idx, row in enumerate(dataset):
    row["domain"] = predicted_domains[idx]

with open("../data/full_data_sampled_gpt2_with_domains.json", "w", encoding="utf-8") as f:
    json.dump(dataset, f)