In [1]:
import os, sys
from threading import Thread
from dotenv import load_dotenv
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig
from transformers.generation.logits_process import LogitsProcessor, LogitsProcessorList

In [2]:
# Seed PyTorch's global RNG so any stochastic torch operation in this notebook is repeatable.
torch.manual_seed(42)

<torch._C.Generator at 0x7ff0fdc6f410>

In [3]:
from utils import log, prompt_template

In [5]:
# del Retriever
from contriever import Retriever

In [6]:
# Load environment variables from the project env file before anything reads os.environ.
# NOTE(review): presumably this file defines CHAT_MODEL, which is read below — confirm.
load_dotenv(dotenv_path='mydata-chatbot.env')

True

In [7]:
class EosLogitsProcessor(LogitsProcessor):
    """Progressively favour end-of-sequence once the model keeps predicting newlines.

    The processor stays inactive while the most likely next token (of the first
    batch row) is a newline and nothing else has been predicted yet. From the
    first non-newline prediction on it is "armed": every later step whose top
    prediction is a newline multiplies ``stop_rate`` by ``growth``, and the EOS
    score of every row is boosted by the current ``stop_rate``.

    The instance is stateful. Call :meth:`reset` (or build a new instance)
    before starting an unrelated generation, otherwise the accumulated
    ``stop_rate`` carries over.

    Args:
        newline_id: Token id treated as "newline". Default 13.
        eos_id: Token id whose score is boosted. Default 2.
            NOTE(review): 13 / 2 are presumably the newline / EOS ids of the
            tokenizer used in this notebook — confirm against the tokenizer.
        growth: Factor applied to ``stop_rate`` per newline prediction; must be > 0.

    Raises:
        ValueError: If ``growth`` is not strictly positive.
    """

    def __init__(self, newline_id: int = 13, eos_id: int = 2, growth: float = 1.1):
        if growth <= 0:
            raise ValueError(f"growth must be > 0, got {growth!r}")
        # None means "not armed yet" (only newlines have been predicted so far).
        self.stop_rate = None
        self.newline_id = newline_id
        self.eos_id = eos_id
        self.growth = growth

    def reset(self) -> None:
        """Forget the accumulated boost so the instance can serve a new generation."""
        self.stop_rate = None

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        """Boost the EOS column of ``scores`` in place and return ``scores``.

        Only the first batch row decides whether the step counts as a newline
        prediction; the boost itself is applied to every row.
        """
        # Evaluate the top prediction once, before any score is modified.
        top_is_newline = scores[0].argmax().item() == self.newline_id

        if self.stop_rate is None:
            if top_is_newline:
                # Still in the leading-newline phase: leave the scores untouched.
                return scores
            self.stop_rate = 1.0
        elif top_is_newline:
            self.stop_rate *= self.growth

        eos_scores = scores[:, self.eos_id]
        # A plain multiplication only raises positive scores; a negative score
        # (e.g. a log-probability) would be pushed further down, making EOS *less*
        # likely. Dividing negative values moves them towards zero instead, so the
        # boost always points in the intended direction.
        scores[:, self.eos_id] = torch.where(
            eos_scores > 0,
            eos_scores * self.stop_rate,
            eos_scores / self.stop_rate,
        )
        return scores

In [8]:
class CudaModel:
    """4-bit quantised causal language model with streaming text generation.

    Attributes:
        model_path: Path or hub id handed to ``from_pretrained``.
        model: The loaded ``AutoModelForCausalLM``.
        tokenizer: The matching tokenizer (pad token set to the EOS token).
        logits_processor: Processor list used by the most recent generation.
    """

    def __init__(self, model_path):
        self.model_path = model_path
        self.model, self.tokenizer = self.load_model()
        self.logits_processor = self._new_logits_processor()

    @staticmethod
    def _new_logits_processor():
        """Build a processor list holding one fresh (un-armed) EosLogitsProcessor."""
        return LogitsProcessorList([EosLogitsProcessor()])

    def load_model(self):
        """Load tokenizer and NF4-quantised model from ``self.model_path``.

        Returns:
            A ``(model, tokenizer)`` tuple.
        """
        tokenizer = AutoTokenizer.from_pretrained(self.model_path, model_max_length=8192)
        # Reuse EOS as the padding token.
        tokenizer.pad_token = tokenizer.eos_token

        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=False,
            bnb_4bit_quant_type='nf4',
            bnb_4bit_compute_dtype=torch.bfloat16
        )
        print('Loading model ...')
        model = AutoModelForCausalLM.from_pretrained(
            self.model_path,
            quantization_config=bnb_config,
            use_cache=True,
            device_map='auto'
        )
        return model, tokenizer

    def gen_output(self, input, text, max_new_tokens=500):
        """Stream the model's answer for a prompt built from ``text`` and ``input``.

        Args:
            input: Second value substituted into ``prompt_template``.
            text: First value substituted into ``prompt_template``.
                NOTE(review): presumably ``text`` is the retrieved context and
                ``input`` the user question — confirm against ``prompt_template``.
            max_new_tokens: Upper bound on generated tokens (default 500).

        Yields:
            Decoded text chunks as they are produced (the prompt is skipped).
        """
        prompt = prompt_template % (text, input)
        log.info('Tokenizing ...')
        # Put the inputs on the model's own device rather than assuming 'cuda'.
        encoded = self.tokenizer([prompt], return_tensors="pt", truncation=True).to(self.model.device)
        streamer = TextIteratorStreamer(self.tokenizer, skip_prompt=True)

        # EosLogitsProcessor accumulates state while generating; start every
        # generation with a fresh one so a previous answer cannot influence this one.
        self.logits_processor = self._new_logits_processor()

        generation_kwargs = dict(
            encoded,
            streamer=streamer,
            logits_processor=self.logits_processor,
            max_new_tokens=max_new_tokens,
            do_sample=False,
        )
        # generate() blocks, so it runs in a worker thread while this generator
        # drains the streamer.
        thread = Thread(target=self.model.generate, kwargs=generation_kwargs)
        thread.start()
        yield from streamer
        # The streamer is exhausted only once generation has ended; reap the worker.
        thread.join()

In [9]:
# Fail fast with a clear message instead of passing None into model loading
# when CHAT_MODEL is missing from the environment.
chat_model_path = os.getenv('CHAT_MODEL')
if not chat_model_path:
    raise RuntimeError(
        "CHAT_MODEL is not set; define it in the environment or in mydata-chatbot.env"
    )
model = CudaModel(chat_model_path)

Loading model ...


INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [10]:
# Build the retriever with its default configuration; it exposes get_texts(), .model and .index,
# which the cells below use.
retriever = Retriever()

INFO:utils.py:Loading embedding model /home/cdsw/models/all-mpnet-base-v2 ... 
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: /home/cdsw/models/all-mpnet-base-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device: cuda


In [11]:
# Sample question about Cloudera's 2019 revenue, in English and in Indonesian.
text_en = 'Do you have the revenue information of Cloudera in 2019?'
text_id = 'Berapa pendapatan Cloudera pada tahun 2019?'

In [12]:
# Retrieve the three best matches for the English question and preview each one.
texts = retriever.get_texts(text_en, top_k=3)
# Loop names deliberately avoid `id`: loop variables outlive the loop in a notebook,
# and rebinding `id` would shadow the builtin for every later cell.
for doc_id, title, body, score in texts:
    print(doc_id, score, title)
    print(body[:100])
    print('=' * 30)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

64230 0.651945531 BRIEF-Seven Stars Cloud Group Sees FY 2017 Revenue $125 Mln To $144 Mln
February 23, 2018 / 2:05 PM / in 9 minutes BRIEF-Seven Stars Cloud Group Sees FY 2017 Revenue $125 M
68315 0.617973 BRIEF-Coima Res Profit For The Period At End-2017 Of EUR 28.9 Million
Feb 22 (Reuters) - Coima Res Spa Siiq:
* PROFIT FOR THE PERIOD AT END-2017 OF EUR 28.9 MILLION
* SAY
29373 0.615841448 Cegedim: Release of Full-Year 2017 Revenue
Press Release Full year Financial Information at December 31, 2017
IFRS - Regulated Information - No


In [39]:
# Inspect the third hit (index 2). Each entry is an (id, title, text, score) tuple,
# so [1] is the title and [2] the article text.
print(texts[2][1], '\n')
# NOTE(review): 616 looks hand-picked so the preview ends with the first paragraph — confirm.
texts[2][2][:616]

Cegedim: Release of Full-Year 2017 Revenue 



"Press Release Full year Financial Information at December 31, 2017\nIFRS - Regulated Information - Not Audited\nCegedim: organic growth accelerated in 2017\nRevenues grew 5.9% like for like over the full year Outlook for consolidated 2017 EBITDA raised significantly Cegelease business sold\nDisclaimer: This press release is available in French and in English. In the event of any difference between the two versions, the original French version takes precedence. This press release may contain inside information. It was sent to Cegedim's authorized distributor on January 29, 2017, no earlier than 5:45 pm Paris time."

In [22]:
# Embed the English question; .tolist() turns the vector into a plain Python list
# so it can be passed as the `vector` argument of the index query.
query = retriever.model.encode([text_en])[0].tolist()

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [15]:
# Query the vector index directly with the question embedding: top 3 matches,
# with the stored vectors included in the response (include_values=True).
results = retriever.index.query(vector=query, top_k=3, include_values=True)

In [17]:
# Compact (id, score) view of the raw matches, for comparison with the get_texts() listing.
[(r['id'], r['score']) for r in results['matches']]

[('news-64230', 0.651945531),
 ('news-68315', 0.617973),
 ('news-29373', 0.615841448)]

In [18]:
import torch.nn.functional as F

In [43]:
# Sanity check: how similar is the question to a document that *does* mention Cloudera?
# Re-embed the question here so the cell does not depend on an earlier cell's `query`.
query = retriever.model.encode([text_en])[0].tolist()

# Two hand-made probe documents. `probe_body` is the third retrieved article's lead
# paragraph with the company names replaced by "Cloudera".
probe_headline = 'Cloudera: Release of Full-Year 2019 Revenue'
probe_body = "Press Release Full year Financial Information at December 31, 2017\nIFRS - Regulated Information - Not Audited\nCloudera: organic growth accelerated in 2017\nRevenues grew 5.9% like for like over the full year Outlook for consolidated 2017 EBITDA raised significantly Cloudera business sold\nDisclaimer: This press release is available in French and in English. In the event of any difference between the two versions, the original French version takes precedence. This press release may contain inside information. It was sent to Cloudera's authorized distributor on January 29, 2017, no earlier than 5:45 pm Paris time."

# Pick the probe to score; switch to `probe_headline` to test the headline alone.
doc_text = probe_body
doc = retriever.model.encode([doc_text])[0].tolist()

# Wrap each vector in a batch dimension: cosine_similarity compares along dim=1.
t1 = torch.tensor([doc])
t2 = torch.tensor([query])
F.cosine_similarity(t1, t2)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

tensor([0.6778])