<a href="https://colab.research.google.com/github/Yagth/SemanticSearch/blob/main/Copy_of_Slack_Semantic_Search_(Instructor_Embeddings_%2B_Chroma_DB_%2B_LLaMA2_7B).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Implementing Semantic Search on Exported Slack Data

This notebook implements semantic search over data exported from Slack.

*This notebook uses the following tools:*

1. **Pandas** - to load and extract the relevant information from the exported data
2. **Instructor embedding model** - to generate an embedding for each message
3. **Chroma** - to store and query the vector embeddings along with some metadata
4. **LLaMA 2 7B model** - to present the results in natural language


## 1. Installing dependencies

In [None]:
# Install gitpython to clone a github repo containing the exported slack data
!pip install gitpython

Collecting gitpython
  Downloading GitPython-3.1.32-py3-none-any.whl (188 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m188.5/188.5 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting gitdb<5,>=4.0.1 (from gitpython)
  Downloading gitdb-4.0.10-py3-none-any.whl (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting smmap<6,>=3.0.1 (from gitdb<5,>=4.0.1->gitpython)
  Downloading smmap-5.0.0-py3-none-any.whl (24 kB)
Installing collected packages: smmap, gitdb, gitpython
Successfully installed gitdb-4.0.10 gitpython-3.1.32 smmap-5.0.0


In [None]:

#Install dependencies for Llama 2
!pip -q install git+https://github.com/huggingface/transformers # need to install transformers from github
!pip install -q datasets loralib sentencepiece
!pip -q install bitsandbytes accelerate xformers einops

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m39.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m492.4/492.4 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━

In [None]:
# Install Instructor Embedding and Chroma
!pip -q install langchain chromadb sentence_transformers InstructorEmbedding

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m399.0/399.0 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.0/90.0 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.6/62.6 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.4/58.4 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Install ngrok to host an api endpoint from colab
!pip install pyngrok

Collecting pyngrok
  Downloading pyngrok-6.0.0.tar.gz (681 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m681.2/681.2 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyngrok
  Building wheel for pyngrok (setup.py) ... [?25l[?25hdone
  Created wheel for pyngrok: filename=pyngrok-6.0.0-py3-none-any.whl size=19866 sha256=1da7f1d870f1a9c44ba101957298a88ff3dd8eb0478b6c54288c7344ea4d58ad
  Stored in directory: /root/.cache/pip/wheels/5c/42/78/0c3d438d7f5730451a25f7ac6cbf4391759d22a67576ed7c2c
Successfully built pyngrok
Installing collected packages: pyngrok
Successfully installed pyngrok-6.0.0


In [None]:
# Install flask cors to enable cors for all domains.
!pip install -U flask-cors

## 2. Fetching the slack data

In [None]:
# The path where the exported slack data is stored in local storage
slack_data_path = "/content/slackdata/"

Run this if you haven't already cloned the GitHub repository

In [None]:
import os

import git

repo_url = "https://github.com/TollanBerhanu/MatterMost-LLM-test-Slack-export-Jun-19-2023---Jun-20-2023.git"
# slack_data_path = '/content/drive/MyDrive/Colab Notebooks/dataset/slack-data/'

# Cloning into a directory that already holds the repository raises an error,
# so only clone when the export has not been fetched yet. This keeps the cell
# safe to re-run (e.g. with "Run all").
if os.path.isdir(os.path.join(slack_data_path, '.git')):
  print('Slack export already cloned at ' + slack_data_path)
else:
  git.Repo.clone_from(repo_url, slack_data_path)

<git.repo.base.Repo '/content/slackdata/.git'>

In [None]:
import os
import pandas as pd

def get_all_channels(path):
  """Load the export's ``channels.json`` and list every channel.

  Args:
    path: Directory prefix of the Slack export. It is concatenated directly
      with ``'channels.json'``, so it has to end with a path separator.

  Returns:
    A DataFrame with the columns ``channel_id`` and ``channel_name``, one row
    per entry of ``channels.json``.
  """
  channel_table = pd.read_json(path + 'channels.json')

  return pd.DataFrame({
      'channel_id': list(channel_table['id']),
      'channel_name': list(channel_table['name']),
  })

channels = get_all_channels(slack_data_path)
channels

Unnamed: 0,channel_id,channel_name
0,C05D1SE01B7,random
1,C05D77W3N76,general
2,C05D7863DRA,test
3,C05ABCDE01,gptgenerated


In [None]:
import glob
import json

# Return the metadata of each message in the channel
def extract_channel_metadata(path, channel_name):
  """Collect every user-written message of one channel into a DataFrame.

  The export stores one JSON file per day inside a folder named after the
  channel; the file name (without ``.json``) is used as the message date.

  Args:
    path: Directory prefix of the Slack export (must end with a separator,
      because it is concatenated directly with ``channel_name``).
    channel_name: Name of the channel folder to read.

  Returns:
    A DataFrame with the columns ``message``, ``channel``, ``date``, ``time``,
    ``user_id`` and ``user_name``. The frame is empty (but still has these
    columns) when the channel folder is missing or holds no usable messages,
    so callers can always test ``.empty``.
  """
  columns = ['message', 'channel', 'date', 'time', 'user_id', 'user_name']

  # use glob to get all the json files in the folder; sort them so the row
  # order (and any row-index based IDs derived from it) is stable across runs
  daily_json_files = sorted(glob.glob(path + channel_name + '/*.json'))

  # Collect plain dicts and build the frame once at the end instead of growing
  # a DataFrame row by row.
  records = []

  # loop over the list of json files (each json file includes every message in that channel for a single day)
  for f in daily_json_files:
    with open(f, 'r', encoding='utf-8') as file:  # open the daily json file
      today_data = json.load(file)  # Parse the JSON data

    today_date = os.path.basename(f)  # 'f' is the full file path and file name
    print('Extracting...', today_date) # the file name is the date
    date = today_date.split(".json")[0]  # omit the file extension '.json'

    # iterate through all the messages of the day
    for msg_data in today_data:
      text = msg_data.get('text', '')

      # Skip if its a "channel_join" type message or if the actual message content is empty
      if ('subtype' in msg_data) or (text == "") or (msg_data.get('type') != 'message'):
        continue

      # TODO: filter out any links, stickers, and other junk
      # TODO: replace @Member references with their real names

      # Not every exported message is guaranteed to carry a 'user_profile';
      # fall back to the user id rather than failing on a missing key.
      profile = msg_data.get('user_profile') or {}
      user_id = msg_data.get('user', '')
      first_name = profile.get('first_name') or profile.get('real_name') or user_id
      real_name = profile.get('real_name') or first_name

      records.append({
          'message': first_name + ': ' + text,
          'channel': channel_name,
          'date': date,
          'time': msg_data.get('ts', ''),
          'user_id': user_id,
          'user_name': real_name,  # 'first_name' is the first name, 'real_name' the full name of the user
      })

  return pd.DataFrame(records, columns=columns)

# extract_channel_metadata(slack_data_path, 'test')

## 3. Generating Embeddings

In [None]:
from InstructorEmbedding import INSTRUCTOR
from langchain.embeddings import HuggingFaceInstructEmbeddings

  from tqdm.autonotebook import trange


In [None]:
# Load the embedding model
instructor_embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl",
                                                      model_kwargs={"device": "cuda"})

Downloading (…)7f436/.gitattributes:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

Downloading (…)/2_Dense/config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/3.15M [00:00<?, ?B/s]

Downloading (…)0daf57f436/README.md:   0%|          | 0.00/66.3k [00:00<?, ?B/s]

Downloading (…)af57f436/config.json:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)7f436/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.40k [00:00<?, ?B/s]

Downloading (…)f57f436/modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

load INSTRUCTOR_Transformer
max_seq_length  512


In [None]:
# Get the list of embeddings for all messages in a channel
def embed_channel_messages(channel_messages):
  """Embed every message of a channel with the Instructor model.

  Args:
    channel_messages: The messages to embed; the values are cast to ``str``
      and converted to a plain list before being embedded.

  Returns:
    Whatever ``instructor_embeddings.embed_documents`` returns for that list.
  """
  messages_as_text = channel_messages.astype(str)
  return instructor_embeddings.embed_documents(messages_as_text.tolist())

## 4. Storing the embeddings in Chroma DB


In [None]:
import chromadb

In [None]:
client = chromadb.PersistentClient(path="/content/chroma_db")

In [None]:
# Get a collection object from an existing collection, by name. If it doesn't exist, create one.
collection = client.get_or_create_collection(
      name= "slack_collection",
      metadata= {"hnsw:space": "cosine"},
      # embedding_function= instructor_embeddings       # The default embedding model is 'all-MiniLM-L6-V2'
    )

In [None]:
# Warning: Delete a collection and all associated embeddings, documents, and metadata. ⚠️ This is destructive and not reversible :(
# client.delete_collection(name="slack_collection")

In [None]:
def upsert_channel_embeddings(channel_name, channel_embeddings, channel_metadata):
  """Upsert one channel's embeddings and metadata into the Chroma collection.

  Args:
    channel_name: Channel the embeddings belong to; used as the ID prefix.
    channel_embeddings: One embedding per message, in the same order as the
      rows of ``channel_metadata``.
    channel_metadata: DataFrame with one row of metadata per message.

  Prints a peek at the collection and its item count after the upsert.
  """
  # Nothing to store for an empty channel.
  if len(channel_embeddings) == 0:
    return

  # parse the channel metadata to json
  parsed_channel_metadata = json.loads(channel_metadata.to_json(orient="records"))

  # create IDs for the embeddings ... [channelname_0 -> ... -> channelname_N]
  # Only the position is used: formatting the (index, embedding) pair produced
  # by enumerate() would put the whole vector into the ID string.
  ids = [f"{channel_name}_{position}" for position in range(len(channel_embeddings))]

  # upsert the embeddings along with their metadata, into a Chroma collection
  collection.upsert(
    ids = ids,
    embeddings = channel_embeddings,
    metadatas = parsed_channel_metadata,
    # documents = channel_metadata['channel'].astype(str).tolist()
  )

  print(collection.peek()) # returns a list of the first 10 items in the collection
  print(collection.count()) # returns the number of items in the collection

In [None]:
# Upsert every channel's data into the vector db
def upsert_all_channels():
  """Embed and upsert the messages of every channel listed in ``channels``.

  Each channel is handled by ``upsert_one_channel`` so that both entry points
  share a single implementation of the extract -> embed -> upsert steps.
  """
  for channel_name in channels['channel_name'].tolist():
    upsert_one_channel(channel_name)

# Upsert just one channel's data
def upsert_one_channel(channel_name):
  """Extract, embed and upsert the messages of a single channel.

  Args:
    channel_name: Name of the channel (and of its folder in the export).

  Channels without any exported messages are skipped.
  """
  print('Upserting ' + channel_name + ' ... ')

  channel_metadata = extract_channel_metadata(slack_data_path, channel_name)

  # The extractor may hand back None (no exported files for the channel) or an
  # empty frame (no usable messages); in both cases there is nothing to embed.
  if channel_metadata is None or channel_metadata.empty:
    return

  channel_embeddings = embed_channel_messages(channel_metadata['message'])

  upsert_channel_embeddings(channel_name, channel_embeddings, channel_metadata)

In [None]:
upsert_all_channels()
# upsert_one_channel('random')  # general, random, gptgenerated

Upserting random ... 
Extracting... 2023-06-19.json
{'ids': ['random(0, [0.0017252354882657528, 0.040115442126989365, 0.0002869960153475404, -0.06856098771095276, 0.006918115075677633, -0.05214540287852287, -0.04898548498749733, 0.018706267699599266, -0.05557114630937576, -0.0010259138653054833, 0.03610115870833397, 0.032283391803503036, -0.02439330890774727, -0.06376130133867264, 0.07873164117336273, -0.04071155562996864, 0.005797932855784893, -0.035491809248924255, -0.009002001024782658, 0.0037501154001802206, 0.011249453760683537, 0.019387975335121155, -0.03564482182264328, -0.04927555099129677, -0.02614939957857132, -0.02608407288789749, 0.009292623959481716, -0.01854153908789158, -0.0021517120767384768, -0.0018777212826535106, 0.051978643983602524, 0.009742006659507751, 0.03750687465071678, -0.05334830656647682, 0.06791172176599503, -0.026460129767656326, 0.027553599327802658, -0.01604863815009594, 0.02472526766359806, -0.013731139712035656, -0.06403998285531998, 0.076018750667572

In [None]:
# Load the persisted database from disk, and use it as normal.
# vectordb = Chroma(persist_directory= '/content/chroma_db')

## 5. Querying the messages from Chroma

In [None]:
def get_data_from_chroma(query, n_results=5, where=None):
  """Retrieve the stored messages that are closest to ``query``.

  Args:
    query: Natural-language question to search for.
    n_results: Number of nearest messages to return (defaults to 5).
    where: Optional Chroma metadata filter, for example
      ``{"channel": {"$eq": "general"}}`` or
      ``{"user_id": {"$in": ["U05D1SQDNSH", "U05DHDPL4FK"]}}``.
      No filter is applied when it is ``None`` or empty.

  Returns:
    A dict with
      ``context``: the matched messages joined into one string, one per line;
      ``metadata``: the metadata dict of every match, each extended with a
      ``score`` entry computed as ``1 - distance``.
  """
  # Generate embeddings for the query
  embedded_query = instructor_embeddings.embed_query(query)

  query_kwargs = {
      'query_embeddings': embedded_query,
      'n_results': n_results,
  }
  # Only forward a filter that actually constrains something: some Chroma
  # releases reject an empty ``where`` dict instead of treating it as "no filter".
  if where:
    query_kwargs['where'] = where
  # A document filter can be added the same way: where_document={"$contains": "search_string"}

  query_response = collection.query(**query_kwargs)

  # Results are nested per query embedding; a single query was sent, so use index 0.
  scores = query_response['distances'][0]
  metadatas = query_response['metadatas'][0]

  context = ''

  for metadata, distance in zip(metadatas, scores):
    context += metadata['message'] + '\n'
    metadata['score'] = 1 - distance  # turn the distance into a similarity-style score

  return {'context': context, 'metadata': metadatas}

# get_data_from_chroma("Why was it good work?")
get_data_from_chroma("What did Tollan say was good work?")
# get_data_from_chroma("What are some models that are comparable to GPT 3?")

{'context': "Tollan: Good work.. now we don't have to worry about exporting data.\nTollan: Also, welcome <@U05D4M7RGQ3>\nTollan: It's best if we just post random topics here to test the semantic search.\nTollan: And we should also post some stickers... :grinning: :smile: :grin:\nAlice: I agree with John. BERT has a strong track record and has been extensively tested in various applications.\n",
 'metadata': [{'channel': 'general',
   'date': '2023-06-19',
   'message': "Tollan: Good work.. now we don't have to worry about exporting data.",
   'time': '1687166814.786429',
   'user_id': 'U05CQ93C3FZ',
   'user_name': 'Tollan',
   'score': 0.7589913989888546},
  {'channel': 'general',
   'date': '2023-06-19',
   'message': 'Tollan: Also, welcome <@U05D4M7RGQ3>',
   'time': '1687169174.199539',
   'user_id': 'U05CQ93C3FZ',
   'user_name': 'Tollan',
   'score': 0.7190510819354768},
  {'channel': 'general',
   'date': '2023-06-19',
   'message': "Tollan: It's best if we just post random topi

## 6. Getting the response from LLaMA 2 - 7B

In [None]:
!nvidia-smi

Sat Jul 29 18:16:30 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P0    74W /  70W |   6307MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf",
                                          use_auth_token=True,)

model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf",
                                             device_map='auto',
                                             torch_dtype=torch.float16,
                                             use_auth_token=True,
                                            #  load_in_8bit=True,
                                             load_in_4bit=True
                                             )



OSError: ignored

In [None]:
# Use a pipeline for later
from transformers import pipeline

pipe = pipeline("text-generation",
                model=model,
                tokenizer= tokenizer,
                torch_dtype=torch.float16,
                device_map="auto",
                do_sample=True,
                top_k=30,
                num_return_sequences=1,
                eos_token_id=tokenizer.eos_token_id
                )

In [None]:
import json
import textwrap

'''
  You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful,
  unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

  If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question,
  please don't share false information. Don't provide any information you weren't asked to provide.
'''

B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
DEFAULT_SYSTEM_PROMPT = """\
  You will be given a sequence of chat messages related to a certain topic. Write a response that answers the question based on what is discussed in the chat messages.
  Do not mention anything outside of what is discussed. Don't answer anything outside the context you are provided.
  If there isn't enough context, simply reply "This topic was not discussed previously"
  """

SYSTEM_PROMPT = B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS

def get_prompt(instruction):
    """Wrap ``instruction`` in the instruction markers and the system prompt.

    The result is ``B_INST``, ``SYSTEM_PROMPT``, the instruction and ``E_INST``
    concatenated in that order.
    """
    return "".join((B_INST, SYSTEM_PROMPT, instruction, E_INST))

## Helper function to format the response
def cut_off_text(text, prompt):
    """Return ``text`` truncated just before the first occurrence of ``prompt``.

    When ``prompt`` does not occur in ``text`` the text is returned unchanged.
    """
    position = text.find(prompt)
    return text if position == -1 else text[:position]

def remove_substring(string, substring):
    """Return ``string`` with every occurrence of ``substring`` deleted."""
    replacement = ""
    return string.replace(substring, replacement)

def generate(text):
    """Run the chat model on ``text`` and return only the newly generated answer.

    Args:
        text: The instruction (context plus question) to send to the model; it
            is wrapped with ``get_prompt`` before tokenization.

    Returns:
        The decoded text of the tokens produced after the prompt, with special
        tokens removed.
    """
    prompt = get_prompt(text)
    with torch.autocast('cuda', dtype=torch.float16):
        inputs = tokenizer(prompt, return_tensors="pt").to('cuda')
        outputs = model.generate(**inputs,
                                 max_new_tokens=512,
                                 eos_token_id=tokenizer.eos_token_id,
                                 pad_token_id=tokenizer.eos_token_id,
                                 )

    # The generated sequence starts with the prompt tokens. Drop them by
    # position instead of string-replacing the prompt in the decoded text:
    # decoding is not guaranteed to reproduce the prompt character for
    # character, in which case the replace silently leaves the prompt in.
    prompt_length = inputs["input_ids"].shape[-1]
    generated_tokens = outputs[0][prompt_length:]

    final_outputs = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    # Defensive: cut at a literal end-of-sequence marker should one survive decoding.
    final_outputs = cut_off_text(final_outputs, '</s>')

    return final_outputs#, outputs

def parse_text(text):
    """Re-flow ``text`` so that no line is longer than 100 characters."""
    wrapped_lines = textwrap.wrap(text, width=100)
    return "\n".join(wrapped_lines)


In [None]:
# Format the prompt to include the query and some context
def format_prompt(query, context):
  """Build the user part of the prompt from the retrieved messages and the question.

  Args:
    query: The question being asked.
    context: The retrieved chat messages to answer from.

  Returns:
    A string with a "### Messages:" section followed by a "### Question:" section.
  """
  template = '''
    ### Messages:
    {context}

    ### Question:
    {query}
    '''
  return template.format(query=query, context=context)

In [None]:
def semantic_search(query):
  """Answer ``query`` from the stored messages.

  Retrieves the closest messages from Chroma, builds a prompt from them, lets
  the language model generate an answer and wraps the answer text.

  Returns:
    A dict with the wrapped answer under ``response`` and the metadata of the
    retrieved messages under ``metadata``.
  """
  retrieved = get_data_from_chroma(query)
  context, metadata = retrieved['context'], retrieved['metadata']

  answer = parse_text(generate(format_prompt(query, context)))

  return {'response': answer, 'metadata': metadata}

In [None]:
%%time
# semantic_search("What are some models that are comparable to GPT 3?")
# semantic_search("How can I make some pancakes?")
semantic_search("What did Tollan say about semantic search?")

## 7. Creating an API Endpoint

In [None]:
import os
from getpass import getpass

from flask import Flask, request
from flask_cors import CORS
from pyngrok import ngrok

port_no = 5000

app = Flask(__name__)
CORS(app)

# Never store the ngrok auth token in the notebook: read it from the
# NGROK_AUTH_TOKEN environment variable, or prompt for it when it is not set.
# NOTE(review): a token used to be hard-coded in this cell; treat that token as
# leaked and revoke it in the ngrok dashboard.
ngrok_auth_token = os.environ.get("NGROK_AUTH_TOKEN") or getpass("ngrok auth token: ")
ngrok.set_auth_token(ngrok_auth_token)
public_url =  ngrok.connect(port_no).public_url

@app.route("/", methods=['GET', 'POST'])
def semantic_search_query():
  """Answer a semantic-search request.

  The query is read from the ``query`` URL parameter for GET requests and from
  the ``query`` field of the JSON body for POST requests. A request without a
  non-empty query string is rejected with HTTP 400 instead of failing inside
  the search pipeline.
  """
  if request.method == 'GET':
    query = request.args.get('query')
  else:
    # silent=True yields None for a missing or malformed JSON body
    payload = request.get_json(silent=True)
    query = payload.get('query') if isinstance(payload, dict) else None

  if not isinstance(query, str) or not query.strip():
    return {'error': "A non-empty 'query' is required."}, 400

  return semantic_search(query)


print(f"Public url for the API... {public_url}")

app.run(port=port_no)