In [1]:
from langchain.llms.base import LLM
from typing import Optional, List, Mapping, Any
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch 

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [7]:
class CustomLLM(LLM):
    
    model = 4
    tokenizer = 3
    def __init__(self):
        super(CustomLLM, self).__init__()
        self.model = T5ForConditionalGeneration.from_pretrained("t5-base").to(device)
        self.tokenizer = T5Tokenizer.from_pretrained("t5-base")
    
    @property
    def _llm_type(self) -> str:
        return "custom"
    
    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        if stop is not None:
            raise ValueError("stop kwargs are not permitted.")
            
        tokens = self.tokenizer.encode(prompt, return_tensors="pt")
        tokens = tokens.to(device)
        outputs = self.model.generate(tokens)
        out = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        return out
    


In [None]:
class CustomLLM(LLM):
    
    model = T5ForConditionalGeneration.from_pretrained("t5-base").to(device)
    tokenizer = T5Tokenizer.from_pretrained("t5-base")
    
    #model = AutoModelForSeq2SeqLM.from_pretrained("bigscience/T0pp", cache_dir="kuacc/users/bozyurt20/.cache/huggingface/hub").parallelize()
    #tokenizer = AutoTokenizer.from_pretrained("bigscience/T0pp")
    
    @property
    def _llm_type(self) -> str:
        return "custom"
    
    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        if stop is not None:
            raise ValueError("stop kwargs are not permitted.")
            
        tokens = self.tokenizer.encode(prompt, return_tensors="pt")
        tokens = tokens.to(device)
        outputs = self.model.generate(tokens)
        out = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        return out
    


In [8]:
llm = CustomLLM()

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [9]:
llm("When did the French Revolution happen?")



'Quand a eu lieu la révolution française?'