## Install and Import Libraries

In [None]:
!pip install --upgrade openai


In [None]:
pip install --index-url https://download.pytorch.org/whl/nightly/cu118 --pre 'torch==2.1.0.dev20230703'

In [None]:
!pip install accelerate

In [None]:
!pip install transformers

In [None]:
!pip install pickle5

In [None]:
from openai import OpenAI

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import nltk
nltk.download('stopwords')
nltk.download('punkt')



[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer,AutoModel, pipeline
# import math
import torch
import re
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# pickle5 only backports pickle protocol 5 to Python < 3.8; newer interpreters
# already ship it in the standard library, so fall back to the stdlib module
# when the backport is not installed.
try:
    import pickle5 as pickle
except ImportError:
    import pickle

In [None]:
import os

# Prefer the key from the environment so the secret never has to be written
# into the notebook; the placeholder keeps the original behaviour when the
# variable is unset.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "Your_open_ai_key")
client = OpenAI(api_key=OPENAI_API_KEY)



# KG embeddings

In [None]:
# Load the knowledge-graph object from file.
# NOTE: unpickling can execute arbitrary code -- only load a KG.pickle that
# comes from a trusted source.
with open("KG.pickle", "rb") as kg_file:  # context manager closes the handle
    G_ex = pickle.load(kg_file)

In [None]:
# Load model from HuggingFace Hub

tokenizer_e = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

model_e = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')


In [None]:
#Mean Pooling - Take attention mask into account for correct averaging

def mean_pooling(model_output, attention_mask):
    """Average token embeddings, ignoring positions masked out by the attention mask.

    Args:
        model_output: Indexable model result whose first element holds the
            per-token embeddings.
        attention_mask: Tensor that is unsqueezed on its last axis and expanded
            to the embeddings' size; zero entries exclude a token.

    Returns:
        The mask-weighted sum over dimension 1 divided by the mask sum
        (clamped to at least 1e-9 to avoid division by zero).
    """
    per_token = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(per_token.size()).float()
    weighted_sum = torch.sum(per_token * mask, 1)
    token_counts = torch.clamp(mask.sum(1), min=1e-9)
    return weighted_sum / token_counts

In [None]:
def get_embedding_nodes(G_ex):
    """Embed every node of the graph with the module-level encoder.

    Args:
        G_ex: Graph object exposing a ``nodes`` iterable. Each node is passed
            straight to ``tokenizer_e`` (presumably a string label -- confirm).

    Returns:
        dict mapping each node to the tensor returned by ``mean_pooling``.
    """
    embeddings_by_node = {}
    for label in list(G_ex.nodes):
        tokens = tokenizer_e(label, padding=True, return_tensors='pt')
        # Forward pass only; no gradients are needed for inference.
        with torch.no_grad():
            output = model_e(**tokens)
        embeddings_by_node[label] = mean_pooling(output, tokens['attention_mask'])
    return embeddings_by_node

def get_embedding_query(query_entities):
    """Embed each query entity with the module-level encoder.

    Args:
        query_entities: Iterable of entities; each one is passed to
            ``tokenizer_e`` and used as a key of the result.

    Returns:
        dict mapping each entity to the tensor returned by ``mean_pooling``.
    """
    def _embed(entity):
        tokens = tokenizer_e(entity, padding=True, return_tensors='pt')
        # Forward pass only; no gradients are needed for inference.
        with torch.no_grad():
            output = model_e(**tokens)
        return mean_pooling(output, tokens['attention_mask'])

    return {entity: _embed(entity) for entity in query_entities}

In [None]:
def calculate_similarity(query_entities_embeddings, node_embeddings):
    """Score every (query entity, graph node) pair by cosine similarity.

    Args:
        query_entities_embeddings: dict of entity -> embedding.
        node_embeddings: dict of node -> embedding.

    Returns:
        dict keyed by ``(entity, node)`` whose value is the scalar taken from
        the 1x1 matrix returned by ``cosine_similarity``.
    """
    scores = {}
    for entity, entity_vec in query_entities_embeddings.items():
        for node, node_vec in node_embeddings.items():
            # Both embeddings are flattened to a single row before comparison.
            pair_matrix = cosine_similarity(
                entity_vec.reshape(1, -1), node_vec.reshape(1, -1)
            )
            scores[(entity, node)] = pair_matrix[0][0]
    return scores

def retrieve_most_similar_nodes(query_entities_embeddings, node_embeddings, top_k=5):
    """Return up to ``top_k`` distinct graph nodes, best match first.

    All (entity, node) pairs are scored with ``calculate_similarity`` and
    walked from highest to lowest score; a node is kept the first time it
    appears.

    Args:
        query_entities_embeddings: dict of entity -> embedding.
        node_embeddings: dict of node -> embedding.
        top_k: Maximum number of distinct nodes to return.

    Returns:
        list of nodes ordered by descending similarity (empty when
        ``top_k <= 0`` or when there is nothing to compare).
    """
    if top_k <= 0:
        return []

    similarities = calculate_similarity(query_entities_embeddings, node_embeddings)
    ranked_pairs = sorted(similarities.items(), key=lambda item: item[1], reverse=True)

    # A dict de-duplicates while remembering insertion order, so the returned
    # list keeps the similarity ranking (a set would return arbitrary order).
    ranked_nodes = {}
    for (_entity, node), _score in ranked_pairs:
        ranked_nodes.setdefault(node, None)
        if len(ranked_nodes) >= top_k:
            break

    return list(ranked_nodes)

def traverse_graph(G_ex, node, depth=2):
    """Collect (node, neighbor, edge_data) triplets reachable within ``depth`` hops.

    The walk is depth-first: each neighbor's triplet is immediately followed by
    the triplets found by recursing from that neighbor with ``depth - 1``.
    No visited set is kept, so the same edge may be reported more than once.

    Args:
        G_ex: Graph exposing ``neighbors(node)`` and ``get_edge_data(u, v)``.
        node: Starting node.
        depth: Number of hops to follow; ``0`` yields an empty list.

    Returns:
        list of ``(node, neighbor, edge_data)`` tuples.
    """
    if depth == 0:
        return []

    collected = []
    for nxt in list(G_ex.neighbors(node)):
        collected.append((node, nxt, G_ex.get_edge_data(node, nxt)))
        collected += traverse_graph(G_ex, nxt, depth - 1)
    return collected

In [None]:
def organize_context_triplets(context_triplets):
    """Group triplets by their first entity.

    NOTE: a later cell in this notebook defines another
    ``organize_context_triplets`` that returns a flat list; whichever cell ran
    last is the one in effect.

    Args:
        context_triplets: Iterable of ``(entity1, entity2, edge_data)`` where
            ``edge_data`` is a mapping whose first value has a ``"title"`` key.

    Returns:
        dict mapping ``entity1`` to a list of ``(entity2, edge_title)`` tuples.
    """
    grouped = {}
    for source, target, edge_data in context_triplets:
        # Only the first edge stored between the two nodes is considered.
        first_edge = list(edge_data.values())[0]
        title = first_edge["title"]
        grouped.setdefault(source, []).append((target, title))
    return grouped

In [None]:
def organize_context_triplets(context_triplets):
    """Flatten graph triplets into ``(entity1, entity2, edge_title)`` tuples.

    Args:
        context_triplets: Iterable of ``(entity1, entity2, edge_data)`` where
            ``edge_data`` is a mapping whose first value has a ``"title"`` key.

    Returns:
        list of ``(entity1, entity2, edge_title)`` in input order; the title is
        taken from the first edge stored in ``edge_data``.
    """
    return [
        (source, target, list(edge_data.values())[0]["title"])
        for source, target, edge_data in context_triplets
    ]



In [None]:
node_embeddings = get_embedding_nodes(G_ex)

In [None]:
# NOTE(review): `extract_entities_from_query` and `query` are only defined in
# later cells of this notebook, so this cell fails on a fresh top-to-bottom run.
query_entities = extract_entities_from_query(query)

In [None]:
query_entities_embeddings = get_embedding_query(query_entities)

In [None]:
query_entities_embeddings['Einstein die'].shape

In [None]:
# Step 3: Retrieve most similar nodes
most_similar_nodes = retrieve_most_similar_nodes(query_entities_embeddings, node_embeddings, top_k=3)

In [None]:
context_triplets = []
for node in most_similar_nodes:
    context_triplets.extend(traverse_graph(G_ex, node, depth=2))

In [None]:
# Organize context triplets
organized_triplets = organize_context_triplets(context_triplets)

In [None]:
len(organized_triplets)

# Prompt preparation

In [None]:
def divide_into_chunks(lst, chunk_size):
    """Split ``lst`` into consecutive slices of at most ``chunk_size`` items.

    Args:
        lst: Sliceable sequence to split.
        chunk_size: Step between slice starts; the last slice may be shorter.

    Returns:
        list of slices covering ``lst`` in order (empty for an empty input).
    """
    chunks = []
    for start in range(0, len(lst), chunk_size):
        chunks.append(lst[start:start + chunk_size])
    return chunks


chunk_size = 100
result = divide_into_chunks(organized_triplets, chunk_size)
len(result)

def strings_response(text):
    """Return everything after the first ``<|assistant|>`` marker in ``text``.

    Args:
        text: Generated text that may contain the assistant marker.

    Returns:
        The substring following the first marker, or ``""`` when the marker
        is absent.
    """
    _before, marker, after = text.partition("<|assistant|>")
    if not marker:
        return ""
    return after


# NOTE(review): `text` is first assigned much later in the notebook, so this
# fails on a fresh top-to-bottom run; it also rebinds `result`, discarding the
# chunk list computed above.
result = strings_response(text)
print(result)





In [None]:
def triplets_to_sentences(organized_triplets):
    """Render each triplet as the sentence "<first> <third> <second>".

    Args:
        organized_triplets: Iterable of indexable triplets; element 0 and 1 are
            the two entities and element 2 is placed between them.

    Returns:
        list of sentences, one per triplet, in input order.
    """
    return [f"{item[0]} {item[2]} {item[1]}" for item in organized_triplets]

In [None]:
# NOTE(review): `sentence_embedder` is defined in the next cell and `query` in a
# later one, so this cell only works after those cells have been run.
sentence_embedder(query).shape

In [None]:
def sentence_embedder(sentence):
    """Embed one sentence with the module-level tokenizer and encoder.

    Args:
        sentence: Text passed to ``tokenizer_e``.

    Returns:
        The tensor produced by ``mean_pooling`` over the encoder output.
    """
    tokens = tokenizer_e(sentence, padding=True, return_tensors='pt')
    # Inference only: both the forward pass and the pooling run without autograd.
    with torch.no_grad():
        output = model_e(**tokens)
        pooled = mean_pooling(output, tokens['attention_mask'])
    return pooled


def sentences_embeddings(sentences):
    """Embed each sentence and index the result by the sentence itself.

    Args:
        sentences: Iterable of sentences; duplicates collapse onto one key.

    Returns:
        dict mapping sentence -> value returned by ``sentence_embedder``.
    """
    return {text: sentence_embedder(text) for text in sentences}

def most_similar_sentences(sentences, query, top_k=50):
    """Return the ``top_k`` sentences most similar to ``query``.

    Args:
        sentences: Iterable of candidate sentences (duplicates are scored once).
        query: Query text embedded with ``sentence_embedder``.
        top_k: Maximum number of sentences to return.

    Returns:
        list of sentences ordered by descending cosine similarity to the query.
    """
    # Convert the query embedding once instead of on every loop iteration.
    query_vector = sentence_embedder(query).numpy()
    sentence_embeddings = sentences_embeddings(sentences)

    # cosine_similarity returns a 1x1 matrix here; keep the scalar so the sort
    # key is a plain number, as in calculate_similarity.
    scores = {
        sentence: cosine_similarity(embedding.numpy(), query_vector)[0][0]
        for sentence, embedding in sentence_embeddings.items()
    }

    ranked = sorted(scores, key=scores.get, reverse=True)
    return ranked[:top_k]




In [None]:
query = "tell something about data science and albert einstein"

In [None]:
def get_entity_from_query(query):
    """Ask the chat model for the entities mentioned in ``query``.

    The model is instructed to answer with a ``|``-separated list; the reply is
    split on ``|``, surrounding whitespace is trimmed and empty pieces are
    dropped.

    Args:
        query: User query text sent as the user message.

    Returns:
        list of entity strings (empty when the reply has no usable content).
    """
    prompt_content = "Given a text, extract entities that one can use to search for node entities in an already existing knowledge graph, your only return should be a list of entities contained in the query, formatted as a string representation of a list like: entity1|entity2|... "

    response = client.chat.completions.create(
        model="gpt-4-0125-preview",
        messages=[
            {"role": "system", "content": prompt_content},
            {"role": "user", "content": query},
        ],
    )
    # Guard against a missing message body so the split below cannot fail.
    result = response.choices[0].message.content or ""
    # The model may pad separators ("a | b") or leave a trailing "|".
    return [piece.strip() for piece in result.split('|') if piece.strip()]

In [None]:
entities = get_entity_from_query(query)
entities

['data science', 'Albert Einstein']

In [None]:
prompt_content = "respond to the queries of the user"
content = "Give me a list containing visually appealing colors names in python "

In [None]:
def answer_client(query, *, top_k_nodes=3, depth=2, top_k_sentences=50,
                  model="gpt-4-0125-preview"):
    """Answer ``query`` with the chat model, using knowledge-graph context.

    Steps: extract entities from the query, embed them, pick the most similar
    graph nodes, collect the triplets around those nodes, turn them into
    sentences, keep the sentences closest to the query and send them to the
    model as the system prompt.

    Reads the module-level ``node_embeddings``, ``G_ex`` and ``client``.

    Args:
        query: User question.
        top_k_nodes: Number of graph nodes used as traversal starting points.
        depth: Traversal depth passed to ``traverse_graph``.
        top_k_sentences: Number of context sentences placed in the prompt.
        model: Chat model name passed to the completion call.

    Returns:
        The content of the first choice returned by the chat completion.
    """
    query_entities = get_entity_from_query(query)
    query_entities_embeddings = get_embedding_query(query_entities)

    # Retrieve the graph nodes closest to the query entities.
    most_similar_nodes = retrieve_most_similar_nodes(
        query_entities_embeddings, node_embeddings, top_k=top_k_nodes
    )

    context_triplets = []
    for node in most_similar_nodes:
        context_triplets.extend(traverse_graph(G_ex, node, depth=depth))

    # Turn the raw triplets into sentences and keep the most relevant ones.
    organized_triplets = organize_context_triplets(context_triplets)
    sentences = triplets_to_sentences(organized_triplets)
    sentences = most_similar_sentences(sentences, query, top_k=top_k_sentences)

    # FINAL PROMPT
    prompt_content = (
    "Welcome! You are an AI chatbot designed to provide answers based on the provided context. "
    "The context consists of sentences generated from a knowledge graph that came from notes of the user. "
    "Think of this context as your second brain, a repository of information to draw from when answering questions. "
    "Your task is to answer user queries based on this context if they are needed in the answer.\n\n"
    "Here is your second brain, containing the accumulated knowledge:\n\n"
    f"{sentences}\n\n"
    "Feel free to start answering user queries."
    )

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": prompt_content},
            {"role": "user", "content": query},
        ],
    )
    return response.choices[0].message.content





In [None]:
#example0
import time  # used by the timing code in the example cells; not imported in any visible cell

st = time.time()
query = "Who are you ?"
response = answer_client(query)
et = time.time()

In [None]:
response

'I am an AI chatbot designed to assist you by providing answers based on a specific set of information given to me, which I refer to as my "second brain". This includes various facts and connections across different topics, such as foundations of data science, measures of central tendency, Einstein\'s mass-energy equivalence formula, and more. I\'m here to help answer your questions using this information.'

In [None]:
#example1
st = time.time()
query = "When did Albert Einstein die ?"
response = answer_client(query)
et = time.time()

In [None]:
print("Time elapsed",et - st)
print("Response : \n",response)

Time elapsed 12.647762537002563
Response : 
 Albert Einstein died on 1955-04-18.


In [None]:
# example2
st = time.time()
query = "Who wrote the cosmological paper"
response = answer_client(query)
et = time.time()

In [None]:
print("Time elapsed",et - st)
print("Response : \n",response)

Time elapsed 5.546705722808838
Response : 
 Albert Einstein wrote the cosmological paper. This is indicated by the multiple references to Einstein's contributions to the field, particularly his work on the general theory of relativity and its implications for the modeling of the structure and evolution of the universe. His influential work in 1905, often referred to as his "annus mirabilis" or miracle year, laid significant groundwork for modern cosmological theories and papers.


In [None]:
# example3
st = time.time()
query = "What open source libraries have i seen "
response = answer_client(query)
et = time.time()

In [None]:
print("Time elapsed",et - st)
print("Response : \n",response)

Time elapsed 8.505751848220825
Response : 
 Based on your notes, you've mentioned several open-source libraries, particularly in the context of data visualization. You have referenced the following open-source libraries:

- Matplotlib
- Seaborn
- Plotly
- ggplot

These libraries are related to open-source and open-source software libraries, and they provide rich functionalities for data visualization in Python.



# Extract entities without OpenAI

In [None]:
# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("Babelscape/rebel-large")
model = AutoModelForSeq2SeqLM.from_pretrained("Babelscape/rebel-large")

In [None]:
# from https://huggingface.co/Babelscape/rebel-large
def extract_relations_from_model_output(text):
    """Parse decoded model output into a list of relation dicts.

    The text is scanned token by token. ``<triplet>`` starts a new head,
    ``<subj>`` starts a new tail and ``<obj>`` starts a new relation type;
    ordinary tokens are appended to whichever part is currently open.
    ``<s>``, ``<pad>`` and ``</s>`` markers are removed before scanning.

    Args:
        text: Decoded sequence containing the marker tokens above.

    Returns:
        list of dicts with ``'head'``, ``'type'`` and ``'tail'`` string values.
    """
    found = []
    head, rel, tail = '', '', ''
    mode = 'x'  # no part is open until the first marker is seen

    cleaned = text.strip()
    for special in ("<s>", "<pad>", "</s>"):
        cleaned = cleaned.replace(special, "")

    def emit():
        # Record the relation assembled so far.
        found.append({
            'head': head.strip(),
            'type': rel.strip(),
            'tail': tail.strip()
        })

    for tok in cleaned.split():
        if tok == "<triplet>":
            mode = 't'
            # A pending relation is complete once a new triplet begins.
            if rel != '':
                emit()
                rel = ''
            head = ''
        elif tok == "<subj>":
            mode = 's'
            # Same head, new tail: flush the relation collected so far.
            if rel != '':
                emit()
            tail = ''
        elif tok == "<obj>":
            mode = 'o'
            rel = ''
        elif mode == 't':
            head += ' ' + tok
        elif mode == 's':
            tail += ' ' + tok
        elif mode == 'o':
            rel += ' ' + tok

    # Flush the last relation only when all three parts were filled in.
    if all((head, rel, tail)):
        emit()
    return found

In [None]:
# knowledge base class
class KB():
    """Minimal knowledge base: a de-duplicated list of relation dicts.

    Each relation is a mapping with ``"head"``, ``"type"`` and ``"tail"`` keys.
    """

    def __init__(self):
        # Relations in insertion order, without duplicates.
        self.relations = []

    def are_relations_equal(self, r1, r2):
        """Return True when both relations share the same head, type and tail."""
        for field in ["head", "type", "tail"]:
            if not r1[field] == r2[field]:
                return False
        return True

    def exists_relation(self, r1):
        """Return True when an equal relation is already stored."""
        for known in self.relations:
            if self.are_relations_equal(r1, known):
                return True
        return False

    def add_relation(self, r):
        """Store ``r`` unless an equal relation is already present."""
        if self.exists_relation(r):
            return
        self.relations.append(r)

    def print(self):
        """Print a header line followed by one indented line per relation."""
        print("Relations:")
        for relation in self.relations:
            print(f"  {relation}")

In [None]:
# build a knowledge base from text
def from_small_text_to_kb(text, verbose=False):
    """Build a ``KB`` from the relations the seq2seq model generates for ``text``.

    Args:
        text: Input text; it is tokenized with truncation at 512 tokens.
        verbose: When true, print the number of input tokens.

    Returns:
        A ``KB`` holding the distinct relations parsed from all returned
        sequences.
    """
    encoded = tokenizer(
        text, max_length=512, padding=True, truncation=True, return_tensors='pt'
    )
    if verbose:
        print(f"Num tokens: {len(encoded['input_ids'][0])}")

    # Beam search returning three candidate sequences for the input.
    sequences = model.generate(
        **encoded,
        max_length=216,
        length_penalty=0,
        num_beams=3,
        num_return_sequences=3,
    )
    # Special tokens are kept because the relation parser relies on them.
    decoded_sequences = tokenizer.batch_decode(sequences, skip_special_tokens=False)

    kb = KB()
    for decoded in decoded_sequences:
        for relation in extract_relations_from_model_output(decoded):
            kb.add_relation(relation)
    return kb

In [None]:
def remove_stopwords(text):
    """Drop English stop words from ``text``.

    Args:
        text: Text to tokenize with ``word_tokenize``.

    Returns:
        The remaining tokens joined by single spaces; matching against the
        stop-word list is case-insensitive.
    """
    english_stops = set(stopwords.words('english'))
    kept = []
    for token in word_tokenize(text):
        if token.lower() not in english_stops:
            kept.append(token)
    return ' '.join(kept)

text = "When did Albert Einstein die?"
filtered_text = remove_stopwords(text)
print(filtered_text)


In [None]:
def extract_entities_from_query(query):
    """Extract single-word entities from ``query`` using the relation model.

    Stop words are removed, relations are generated from the remaining text,
    and every word of each relation's head and tail becomes an entity.

    Args:
        query: Query text to analyse.

    Returns:
        set of individual words taken from the heads and tails.
    """
    # Use the argument itself; the previous version read the global `text`
    # and therefore ignored whatever query was passed in.
    filtered_text = remove_stopwords(query)
    kb = from_small_text_to_kb(filtered_text, verbose=True)

    entities = set()
    for relation in kb.relations:
        # str.split() never yields empty tokens, so no extra filtering is needed.
        entities.update(relation['head'].split())
        entities.update(relation['tail'].split())
    return entities


In [None]:
extract_entities_from_query(query)

In [None]:
def extract_entities_from_query(query):
    """Extract multi-word entities from ``query`` using the relation model.

    Stop words are removed and relations are generated from the remaining
    text. A relation contributes its head and tail only when the two share
    less than half of their words (word-level intersection over union < 0.5).

    Args:
        query: Query text to analyse.

    Returns:
        list of distinct, non-empty head/tail strings.
    """
    filtered_text = remove_stopwords(query)
    kb = from_small_text_to_kb(filtered_text, verbose=True)

    entity_set = set()
    for relation in kb.relations:
        head = relation['head']
        tail = relation['tail']
        head_words = set(head.split())
        tail_words = set(tail.split())

        union = head_words | tail_words
        if not union:
            # Both sides are blank: nothing to add, and the IOU below would
            # divide by zero.
            continue

        iou_score = len(head_words & tail_words) / len(union)

        # Keep the pair only when head and tail are sufficiently different.
        if iou_score < 0.5:
            entity_set.update(part for part in (head, tail) if part.strip())

    return list(entity_set)


In [None]:
extract_entities_from_query(query)

# Prompt (Zephyr)

In [None]:
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_flash_sdp(False)

In [None]:
# Prepare the prompt content with introduction and context
prompt_content = (
    "Introduction:\n\n"
    "You are a chatbot designed to generate responses based on context from a knowledge graph. "
    "The context is provided in the form of triplets (entity1, entity2, relation), representing "
    "relationships between entities in the knowledge graph.\n\n"
    "Context:\n\n"
)
for idx, triplet in enumerate(organized_triplets, start=1):
    entity1, entity2, edge = triplet
    prompt_content += f"{idx}. ({entity1}, {entity2}, {edge})\n"
prompt_content += "\nGiven this context, respond to the following query:\n\n"
prompt_content += query


In [None]:
# NOTE(review): neither `pipe` nor a module-level `messages` is defined in the
# visible cells (`messages` exists only as a local inside functions above), so
# this cell presumably relies on a pipeline/message setup cell that is missing.
prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

In [None]:
#zephyr
# Generate response
outputs = pipe(prompt, max_new_tokens=256, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
print(strings_response(outputs[0]["generated_text"]))

In [None]:
result

'With the help of your second brain, Albert Einstein died on 1955-04-18.'