<a href="https://colab.research.google.com/github/abaranguer/lab-aina/blob/main/xatbot_catal%C3%A0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# XatBot Català - GPT-J + AINA

In [None]:
!pip install ctranslate2 pyonmttok

In [None]:
import ctranslate2
import pyonmttok
from huggingface_hub import snapshot_download
from transformers import GPTJForCausalLM, AutoTokenizer
import torch

In [None]:
# AINA
class Translators:
  """Catalan <-> English text translation using the Projecte AINA models.

  Creating an instance downloads both model snapshots from the Hugging Face
  Hub into /content/aina and keeps one tokenizer and one CTranslate2
  translator per direction.
  """

  def __init__(self):
    """Fetch the en-ca and ca-en snapshots and build tokenizers/translators."""
    en_ca_path = snapshot_download(
        repo_id="projecte-aina/aina-translator-en-ca",
        revision="main",
        local_dir="/content/aina/en-ca",
    )
    ca_en_path = snapshot_download(
        repo_id="projecte-aina/aina-translator-ca-en",
        revision="main",
        local_dir="/content/aina/ca-en",
    )

    # Each snapshot ships its own spm.model file next to the translation model.
    self.tokenizerEnCa = pyonmttok.Tokenizer(
        mode="none", sp_model_path=f"{en_ca_path}/spm.model")
    self.tokenizerCaEn = pyonmttok.Tokenizer(
        mode="none", sp_model_path=f"{ca_en_path}/spm.model")

    self.tranlatorCa2En = ctranslate2.Translator(ca_en_path)
    self.tranlatorEn2Ca = ctranslate2.Translator(en_ca_path)

  @staticmethod
  def _run(tokenizer, translator, text):
    """Tokenize `text`, translate it as a one-item batch and detokenize.

    The first element of the tokenizer's result is used as the source token
    sequence; the tokens of the first hypothesis of the first (only) batch
    result are turned back into a string.
    """
    source_tokens = tokenizer.tokenize(text)[0]
    results = translator.translate_batch([source_tokens])
    best_hypothesis = results[0][0]
    return tokenizer.detokenize(best_hypothesis['tokens'])

  def translateCa2En(self, text):
    """Return the English translation of the Catalan string `text`."""
    return self._run(self.tokenizerCaEn, self.tranlatorCa2En, text)

  def translateEn2Ca(self, text):
    """Return the Catalan translation of the English string `text`."""
    return self._run(self.tokenizerEnCa, self.tranlatorEn2Ca, text)

In [None]:
# GPT-J
class XatBotGPTJ_AINA:
  """Catalan chatbot that wraps an English GPT-J model with AINA translation.

  Each user message is translated Catalan -> English, answered by GPT-J using
  a "Q: ...\\nA:" prompt, and the answer is translated back to Catalan.
  """

  def __init__(self, isLocal):
    """Create the translators and load GPT-J with its tokenizer.

    Args:
      isLocal: if true, load model and tokenizer from /content/gpt-j-6B (the
        directory an earlier run with isLocal false wrote to). If false,
        download "EleutherAI/gpt-j-6B" (float16 revision) from the Hub and
        save a copy to that directory.
    """
    self.translators = Translators()
    self.device = "cuda"

    modelName = "EleutherAI/gpt-j-6B"
    model_save_dir = "/content/gpt-j-6B"

    if isLocal:
      # torch_dtype is passed explicitly: per the Transformers docs,
      # from_pretrained otherwise instantiates the weights in float32, which
      # doubles the memory needed for the saved float16 checkpoint.
      self.model = GPTJForCausalLM.from_pretrained(
        model_save_dir,
        torch_dtype=torch.float16,
      )
      self.tokenizer = AutoTokenizer.from_pretrained(model_save_dir)
    else:
      self.model = GPTJForCausalLM.from_pretrained(
        modelName,
        revision="float16",
        torch_dtype=torch.float16,
      )
      self.tokenizer = AutoTokenizer.from_pretrained(modelName)
      # Keep a local copy so later runs can pass isLocal=True.
      self.model.save_pretrained(model_save_dir)
      self.tokenizer.save_pretrained(model_save_dir)

    # Both branches must end with the model on the device that getAnswer
    # sends the input ids to; otherwise generate() fails on a device mismatch.
    self.model = self.model.to(self.device)

  def xatBotSession(self):
    """Run an interactive console chat until the user types 'Adéu'.

    Surrounding whitespace and letter case are ignored when checking for the
    farewell word, blank lines are skipped, and a closed stdin ends the
    session instead of raising.
    """
    print("Hola, sóc el Xatbot GptJ-Aina.")
    print("Per a acabar la sessió, escriu 'Adéu'.")
    print("Parlem del que vulguis.")

    while True:
      try:
        queryCat = input().strip()
      except EOFError:
        # No more input available: end the session quietly.
        break

      if not queryCat:
        # Nothing to translate or answer.
        continue

      if queryCat.casefold() == "adéu":
        print("Ha estat un plaer parlar amb tu. Adéu!")
        break

      query = self.translators.translateCa2En(queryCat)
      answerEn = self.getAnswer(query)
      answer = self.translators.translateEn2Ca(answerEn)
      print("XatBot:", answer)

  def getAnswer(self, query):
    """Return GPT-J's answer to the English question `query`.

    The model is prompted with "Q: <query>\\nA:" and sampled (temperature 0.9,
    at most 1024 tokens including the prompt). Only the text generated after
    the prompt is considered, and it is cut at the first "Q:" so that any
    follow-up question the model invents is discarded.

    Args:
      query: the question, in English.

    Returns:
      The stripped answer text (possibly empty).
    """
    prompt = "Q: " + query + "\nA:"
    input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids.to(self.device)

    gen_tokens = self.model.generate(
        input_ids,
        do_sample=True,
        temperature=0.9,
        max_length=1024)

    # generate() returns prompt + completion. Slice the prompt off by token
    # count rather than searching the decoded text for "A:", which picks the
    # wrong place when the question itself contains "A:".
    prompt_length = input_ids.shape[-1]
    completion_ids = gen_tokens[0][prompt_length:]
    completion = self.tokenizer.decode(completion_ids, skip_special_tokens=True)

    return completion.split("Q:")[0].strip()

In [None]:
if __name__ == "__main__":
  # isLocal=True: load GPT-J from /content/gpt-j-6B, the directory a previous
  # run with isLocal=False saved the model and tokenizer to.
  xatBot = XatBotGPTJ_AINA(isLocal=True)
  xatBot.xatBotSession()