In [1]:
import os, sys
from threading import Thread
import torch
import torch.nn.functional as F
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig
from transformers.generation.logits_process import LogitsProcessor, LogitsProcessorList

In [2]:
import pyarrow.parquet as pq
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer

In [3]:
from utils import log

In [4]:
# Seed PyTorch's global random number generator with a fixed value.
torch.manual_seed(42)

<torch._C.Generator at 0x7f69bc7df2b0>

In [5]:
from dotenv import load_dotenv
# Load settings from the local .env file into the process environment.
# NOTE(review): the os.getenv() lookups in Retriever presumably rely on this
# file defining EMB_MODEL, PINECONE_API_KEY, PINECONE_INDEX_NAME and DATA_FILE — confirm.
load_dotenv(dotenv_path='mydata-chatbot.env')

True

In [6]:
class Retriever:
    '''
    Semantic retriever: embeds a query with a SentenceTransformer model, looks up
    the nearest vectors in a Pinecone index and maps the hits back to rows of a
    table loaded from a parquet file.

    Configuration is read from the environment variables EMB_MODEL,
    PINECONE_API_KEY, PINECONE_INDEX_NAME and DATA_FILE.
    '''

    def __init__(self):
        model_path = os.getenv("EMB_MODEL")
        log.info('Loading embedding model %s ... ' % model_path)
        self.model = SentenceTransformer(model_path)
        pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))
        self.index = pc.Index(os.getenv('PINECONE_INDEX_NAME'))
        # Table with (at least) the 'title' and 'text' columns read by get_texts().
        self.df = pq.read_table(os.getenv('DATA_FILE')).to_pandas()

    def text_id(self, user_input, top_k=3):
        '''
        Returns a list of 2-tuples: [(id1, score1), (id2, score2), (id3, score3), ...]
        Element 1: ID (the integer after the first '-' of the Pinecone vector id).
        Element 2: Matching score.
        '''
        query = self.model.encode([user_input])[0].tolist()
        # Only the id and score of each match are used, so do not ask Pinecone
        # to ship the stored vector values back as well.
        matches = self.index.query(vector=query, top_k=top_k, include_values=False)['matches']
        return [(int(match['id'].split('-')[1]), match['score']) for match in matches]

    def get_texts(self, user_input, top_k=3):
        '''
        Returns a list of 4-tuples (id, title, text, score) for the top_k
        documents matching user_input, in the order returned by text_id().
        '''
        text_ids = self.text_id(user_input, top_k)
        titles, texts = self.df['title'], self.df['text']
        # doc_id rather than id, to avoid shadowing the builtin.
        return [(doc_id, titles[doc_id], texts[doc_id], score) for doc_id, score in text_ids]

In [7]:
# Shared retriever instance; constructing it loads the embedding model, opens
# the Pinecone index and reads the parquet data file.
retriever = Retriever()

INFO:utils.py:Loading embedding model /home/cdsw/models/all-mpnet-base-v2 ... 
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: /home/cdsw/models/all-mpnet-base-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device: cuda


In [8]:
# Sample questions about Cloudera's 2019 revenue: one in English, one in Indonesian.
text_en, text_id = (
    'Do you have the revenue information of Cloudera in 2019?',
    'Berapa pendapatan Cloudera pada tahun 2019?',
)

In [9]:
# Retrieve the three best matches for the English question and preview each one:
# id, score and title, then the first 100 characters of the body.
docs = retriever.get_texts(text_en, top_k=3)
# doc_id rather than id: a notebook-level loop variable named `id` would
# replace the builtin for the rest of the kernel session.
for doc_id, title, text, score in docs:
    print(doc_id, score, title)
    print(text[:100])
    print('=' * 30)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

64230 0.651945531 BRIEF-Seven Stars Cloud Group Sees FY 2017 Revenue $125 Mln To $144 Mln
February 23, 2018 / 2:05 PM / in 9 minutes BRIEF-Seven Stars Cloud Group Sees FY 2017 Revenue $125 M
68315 0.617973 BRIEF-Coima Res Profit For The Period At End-2017 Of EUR 28.9 Million
Feb 22 (Reuters) - Coima Res Spa Siiq:
* PROFIT FOR THE PERIOD AT END-2017 OF EUR 28.9 MILLION
* SAY
29373 0.615841389 Cegedim: Release of Full-Year 2017 Revenue
Press Release Full year Financial Information at December 31, 2017
IFRS - Regulated Information - No


In [10]:
# Inspect the third hit more closely: its title, then the first 616 characters
# of its body. Reads from `docs` (the retrieval result above); the previous
# name `texts` is not defined anywhere in the notebook.
print(docs[2][1], '\n')
docs[2][2][:616]

Cegedim: Release of Full-Year 2017 Revenue 



"Press Release Full year Financial Information at December 31, 2017\nIFRS - Regulated Information - Not Audited\nCegedim: organic growth accelerated in 2017\nRevenues grew 5.9% like for like over the full year Outlook for consolidated 2017 EBITDA raised significantly Cegelease business sold\nDisclaimer: This press release is available in French and in English. In the event of any difference between the two versions, the original French version takes precedence. This press release may contain inside information. It was sent to Cegedim's authorized distributor on January 29, 2017, no earlier than 5:45 pm Paris time."

In [None]:
# Raw Pinecone response for the current question. The query vector has to be
# built here: `query` only exists as a local variable inside functions.
# NOTE(review): assumes the question of interest is text_en — confirm.
query = retriever.model.encode([text_en])[0].tolist()
results = retriever.index.query(vector=query, top_k=3, include_values=True)

In [None]:
# Reduce the raw response to (vector id, score) pairs.
[(match['id'], match['score']) for match in results['matches']]

In [12]:
def similar_with_mpnet(text1, text2):
    '''
    Cosine similarity between two texts, both embedded with the retriever's
    SentenceTransformer model (retriever.model).

    Returns the result of F.cosine_similarity on the two single-row tensors.
    '''
    def embed(text):
        # Encode one text and wrap its vector as a batch of one.
        return torch.tensor([retriever.model.encode([text])[0].tolist()])

    query_vec = embed(text1)
    doc_vec = embed(text2)
    return F.cosine_similarity(doc_vec, query_vec)

In [13]:
from transformers import AutoModel
# Location of the Contriever checkpoint. It can be overridden with the
# CONTRIEVER_MODEL environment variable (same mechanism Retriever uses for
# EMB_MODEL); the default is the previously hard-coded path.
contriever_path = os.getenv('CONTRIEVER_MODEL', '/home/cdsw/models/contriever-msmarco')
tokenizer = AutoTokenizer.from_pretrained(contriever_path)
model = AutoModel.from_pretrained(contriever_path)


def mean_pooling(token_embeddings, mask):
    '''
    Average token embeddings along dim 1, ignoring masked-out positions.

    token_embeddings: tensor whose dim 1 is the token axis.
    mask: tensor shaped like token_embeddings without its last axis; zero
          entries mark positions to ignore (expected to be a 0/1 mask, since
          its sum along dim 1 is used as the divisor).

    Returns one pooled vector per row of the batch.
    '''
    keep = mask.unsqueeze(-1).bool()
    # Zero the ignored positions so they do not contribute to the sum.
    summed = token_embeddings.masked_fill(~keep, 0.).sum(dim=1)
    counts = mask.sum(dim=1).unsqueeze(-1)
    return summed / counts


def similar_with_contriever(text1, text2):
    '''
    Cosine similarity between two texts using the Contriever encoder
    (module-level `tokenizer` and `model`).

    Both texts are tokenized as one padded batch, passed through the model,
    mean-pooled over their unmasked tokens and compared.

    Returns the one-element tensor produced by F.cosine_similarity.
    '''
    inputs = tokenizer([text1, text2], padding=True, truncation=True, return_tensors='pt')
    # Inference only: disable autograd so no computation graph is built and
    # kept alive for every comparison.
    with torch.no_grad():
        outputs = model(**inputs)
    emb = mean_pooling(outputs[0], inputs['attention_mask'])
    # Split the batch back into one row per input text.
    t1, t2 = emb.split(1, dim=0)
    return F.cosine_similarity(t1, t2)

In [None]:
# Title-style probes that differ only in company name and/or year.
doc1 = 'Cloudera: Release of Full-Year 2019 Revenue'
doc1a = 'Cloudera: Release of Full-Year 2017 Revenue'
doc1b = 'Cegedim: Release of Full-Year 2017 Revenue'

# Press-release body with Cloudera as the company and 2017 as the year.
doc2 = '''Press Release Full year Financial Information at December 31, 2017
IFRS - Regulated Information - Not Audited
Cloudera: organic growth accelerated in 2017
Revenues grew 5.9% like for like over the full year Outlook for consolidated 2017 EBITDA raised significantly Cloudera business sold
Disclaimer: This press release is available in French and in English. In the event of any difference between the two versions, the original French version takes precedence. This press release may contain inside information. It was sent to Cloudera's authorized distributor on January 29, 2017, no earlier than 5:45 pm Paris time.'''

# doc2a..doc2e: the five individual lines of doc2.
doc2a, doc2b, doc2c, doc2d, doc2e = doc2.split('\n')

# doc3a..doc3e: the same lines with the year 2017 replaced by 2019.
doc3a, doc3b, doc3c, doc3d, doc3e = (
    part.replace('2017', '2019') for part in (doc2a, doc2b, doc2c, doc2d, doc2e)
)

# doc4a..doc4e: the 2017 lines with the company Cloudera replaced by Cegedim.
doc4a, doc4b, doc4c, doc4d, doc4e = (
    part.replace('Cloudera', 'Cegedim') for part in (doc2a, doc2b, doc2c, doc2d, doc2e)
)

In [None]:
# Score the three title probes against the question: all mpnet scores first,
# then all Contriever scores.
for score_fn in (similar_with_mpnet, similar_with_contriever):
    for candidate in (doc1, doc1a, doc1b):
        print(score_fn(text_en, candidate))

In [None]:
# Whole press release first (scores only), then each of its lines: the
# (mpnet, Contriever) score pair followed by the line itself.
print(similar_with_mpnet(text_en, doc2), similar_with_contriever(text_en, doc2))
for part in (doc2a, doc2b, doc2c, doc2d, doc2e):
    print(similar_with_mpnet(text_en, part), similar_with_contriever(text_en, part))
    print(part)

In [None]:
# 2019 variant of the press-release lines: the (mpnet, Contriever) score pair
# for each line, followed by the line itself.
for part in (doc3a, doc3b, doc3c, doc3d, doc3e):
    print(similar_with_mpnet(text_en, part), similar_with_contriever(text_en, part))
    print(part)

In [None]:
# Cegedim variant of the press-release lines: the (mpnet, Contriever) score
# pair for each line, followed by the line itself.
for part in (doc4a, doc4b, doc4c, doc4d, doc4e):
    print(similar_with_mpnet(text_en, part), similar_with_contriever(text_en, part))
    print(part)

In [None]:
# Contriever check on a small example: the first sentence is the question, the
# remaining two are candidate passages scored against it.
sentences = [
    "Where was Marie Curie born?",
    "Maria Sklodowska, later known as Marie Curie, was born on November 7, 1867.",
    "Born in Paris on 15 May 1859, Pierre Curie was the son of Eugène Curie, a doctor of French Catholic origin from Alsace."
]

tuple(similar_with_contriever(sentences[0], passage) for passage in sentences[1:])

In [18]:
# Switch to a different question. This rebinds text_en, so every later cell
# that reads text_en now compares against this question. Then retrieve 20
# candidate documents for it.
text_en = 'Do you have the revenue information of Nvidia in 2018?'
docs = retriever.get_texts(text_en, top_k=20)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [23]:
# Re-score every retrieved document with Contriever: first its title, then each
# line of its body, printing only the lines whose similarity to the question
# exceeds the threshold.
MIN_LINE_SIMILARITY = 0.6

for doc_id, title, text, retrieval_score in docs:
    print(similar_with_contriever(text_en, title), title)
    for line in text.split('\n'):
        # Own name for the per-line similarity so the retrieval score unpacked
        # by the outer loop is not overwritten.
        line_score = similar_with_contriever(text_en, line)
        if line_score > MIN_LINE_SIMILARITY:
            print(line_score, line)
    print()

tensor([0.7515], grad_fn=<SumBackward1>) Nvidia earnings Q4 2018
tensor([0.6266], grad_fn=<SumBackward1>) Nvidia shares jumped more than 10 percent on Thursday after the company reported better-than-expected fiscal earnings for the fourth quarter. 
tensor([0.6529], grad_fn=<SumBackward1>) Nvidia will hold a conference call to go over results with analysts at 5 p.m. Eastern time. Earnings: $1.78 per share, vs. $1.17 per share as expected by analysts, according to Thomson Reuters. Revenue: $2.91 billion, vs. $2.69 billion as expected by analysts, according to Thomson Reuters. 
tensor([0.7332], grad_fn=<SumBackward1>) Revenue for the quarter jumped 34 percent from a year earlier, and Nvidia's revenue of the full fiscal year of $9.71 billion was up 41 percent, according to a statement . 
tensor([0.6923], grad_fn=<SumBackward1>) Nvidia exceeded analysts' expectations in four of its five markets. The biggest category, gaming, produced $1.74 billion in revenue, above the FactSet estimate of $