In [1]:
#pip install -U -q google.generativeai

In [2]:
#pip install -q chromadb

In [3]:
import textwrap
import chromadb
import numpy as np
import pandas as pd

import google.generativeai as genai
import google.ai.generativelanguage as glm

import os
from IPython.display import Markdown
from chromadb import Documents, EmbeddingFunction, Embeddings
from openai import OpenAI
import logging



In [4]:
# genai.configure(api_key='AIzaSyC1xnPaYuc0BXq-NOl9cGXKdzwLVRu4w-k')


client = OpenAI(api_key="api_key")

def generate_embedding(text):
    try:
        response = client.embeddings.create(
            input=text,
            model="text-embedding-ada-002"
        )
        # Extracting the embedding from the response
        embedding = response.data[0].embedding
        return embedding
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

text = "Example text"
embedding = generate_embedding(text)
print(embedding)

[-0.010369681753218174, 0.011489822529256344, 0.006858344189822674, -0.006271444261074066, -0.011335551738739014, 0.030505377799272537, -0.013146556913852692, -0.012710574083030224, -0.02016252465546131, -0.020175939425826073, 0.00676779355853796, 0.04011041298508644, -0.008786729536950588, -0.010872739367187023, 0.00585222989320755, 0.009323323145508766, 0.03723963350057602, -0.003873538924381137, 0.023985758423805237, -0.010758712887763977, -0.013273998163640499, -0.005765033420175314, -0.0014236513525247574, 0.006854990031570196, 0.0026041585952043533, -0.020122280344367027, 0.0082970866933465, -0.015145369805395603, -0.0034710934851318598, -0.01237520296126604, 0.01965276151895523, -0.01813688315451145, -0.012482521124184132, -0.0173051618039608, -0.008411113172769547, 0.00818976853042841, -0.01277094054967165, -0.008692825213074684, 0.03324200585484505, -0.017680777236819267, 0.006761086173355579, 0.013361194171011448, 0.010054432787001133, -0.014742923900485039, -0.02178572304546

In [5]:
def initialize_chroma_db_connection():
    client = chromadb.Client()
    return client

In [6]:
def chunk_text(text, max_tokens=1000):
    """
    Splits text into chunks where each chunk has a maximum number of tokens.
    """
    words = text.split()
    chunks = []
    current_chunk = []
    current_token_count = 0
    
    for word in words:
        word_token_count = 1 
        if current_token_count + word_token_count > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = [word]
            current_token_count = word_token_count
        else:
            current_chunk.append(word)
            current_token_count += word_token_count
    
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    
    return chunks

In [7]:
def add_document_to_chroma_db(document_text, embedding, db, collection_name="documents"):
    try:
        if collection_name not in db.list_collections():
            db.create_collection(name=collection_name)
    except:
    # except UniqueConstraintError:
        # If the collection already exists, we ignore the error and proceed
        pass

    collection = db.get_collection(collection_name)
    
    # Generate a unique ID for the document
    document_id = generate_unique_id_for_document(document_text)
    
    # Add the document to the collection with the correct parameters
    collection.add(
        ids=[document_id],  # list of unique identifiers
        embeddings=[embedding],  # list of embeddings
        documents=[document_text]  # list of document texts
    )

def generate_unique_id_for_document(document_text):
    return str(hash(document_text))

In [8]:
def retrieve_relevant_passages(query, db, num_results=4, collection_name="documents"):
    collection = db.get_collection(collection_name)
    # using same embedding generator as we did for the transcripts
    query_embedding = generate_embedding(query) 
    
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=num_results
    )
    
    return [result for result in results['documents']]

In [9]:
def process_and_store_documents(base_path, db):
    file_count = 1
    for root, dirs, files in os.walk(base_path):
        for file in files:
            if file.endswith('.txt'):
                file_path = os.path.join(root, file)
                with open(file_path, 'r', encoding='latin-1') as f:
                    print(f"Processing file {file_count}")
                    content = f.read()
                    chunks = chunk_text(content)
                    for chunk in chunks:
                        embedding = generate_embedding(chunk)
                        add_document_to_chroma_db(chunk, embedding, db)
                    file_count += 1

In [10]:
def query_with_gpt4(query, context_chunks):
    # prompt = f"Answer the following question based on the provided context: {query}\n\nContext: {' '.join(context_chunks)}"
    prompt_instructions = """
    You are a helpful and informative bot that answers questions primarily using information from the reference passages provided. 
    Please note that the reference passages might have some typos and incorrect grammar. 
    Focus on the information provided in the reference passages, but when needed, you can use your own knowledge too. 
    Your audience may be non-technical, so try to break down complicated concepts and use analogies where possible. 
    In your answer, do not mention referring to any passages or a database.
    If the question is unclear or there are multiple possible interpretations, ask the user for clarification.
    """

    context_chunks = [str(chunk) for chunk in context_chunks]
    context_chunks_string = ' '.join(context_chunks)

    completion = client.chat.completions.create(
    model="gpt-4",
    messages=[
        {"role": "system", "content": prompt_instructions},
        {"role": "user", "content": f"Query: {query}\n\nReference Passages: {context_chunks_string}"}
    ]
    )
    return completion.choices[0].message.content
    # print(completion.choices[0].message.content)    

# def query_with_gpt4(query, context_chunks):
#     context_chunks = [str(chunk) for chunk in context_chunks]
#     context_chunks_string = ' '.join(context_chunks)

#     prompt = f"Answer the following question based on the provided context: {query}\n\nContext: {context_chunks_string}"
#     response = openai.Completion.create(
#         engine="gpt-4",
#         prompt=prompt,
#         max_tokens=150,
#         temperature=0.7,
#         top_p=1.0,
#         frequency_penalty=0.0,
#         presence_penalty=0.0
#     )
#     return response.choices[0].text.strip()

In [11]:
dataset_path = './transcripts'

db = initialize_chroma_db_connection()

# setting logging level to WARNING only to not be bombarded with informational messages
logging.basicConfig(level=logging.WARNING)

process_and_store_documents(dataset_path, db)

Processing file 1
Processing file 2
Processing file 3
Processing file 4
Processing file 5
Processing file 6
Processing file 7
Processing file 8
Processing file 9
Processing file 10
Processing file 11
Processing file 12
Processing file 13
Processing file 14
Processing file 15
Processing file 16
Processing file 17
Processing file 18
Processing file 19
Processing file 20
Processing file 21
Processing file 22
Processing file 23
Processing file 24
Processing file 25
Processing file 26
Processing file 27
Processing file 28
Processing file 29
Processing file 30
Processing file 31
Processing file 32
Processing file 33
Processing file 34
Processing file 35
Processing file 36
Processing file 37
Processing file 38
Processing file 39
Processing file 40
Processing file 41
Processing file 42
Processing file 43
Processing file 44
Processing file 45


In [12]:
# just some testing

text = "Example text to test embedding generation."
embedding = generate_embedding(text)
print(f"Dimensionality of generated embedding: {len(embedding)}")

Dimensionality of generated embedding: 1536


In [15]:
query = "Who is Travis Stevens?"

relevant_passages = retrieve_relevant_passages(query, db, num_results=4)
for passage in relevant_passages: 
    print(passage)

answer = query_with_gpt4(query, relevant_passages)
Markdown(answer)

["(Lex flipping) - That was intense. This video is a Judo training session with Travis Stevens from a while back. He's a 2016 Olympic silver medalist in Judo, and one of the greatest American judoka ever. I've trained judo and jujitsu for many years, and took some time off on the judo side, focusing more on submission grappling than throwing. But I'm hoping to get back into it with Jimmy Pedro, the legendary judo coach out in Boston. I love judo too much. I miss it. Big thank you to Craig Jones and the B Team for letting us use their gym for this judo session. And I should also say that Austin, in general, is amazing for martial arts. We've got 10th Planet with Gabe Tuttle, New Wave with John Danaher, Gordon Ryan, Nicholas Meregali, and others. B Team with Craig Jones, Nikki Rod, Nikki Ryan, and others, and many more. I highly recommend you check out these places, and who knows? Maybe you'll see me there, and we get a chance to train. And now, dear friends, here's the 2016 Olympic silv

Travis Stevens is a renowned judoka who won a silver medal in the 2016 Olympics. He is considered one of the greatest American judokas ever. In addition to his achievements in Judo, Stevens has also trained in jujitsu. He is known for his effective judo training sessions and his deep understanding of the sport, including its various techniques and strategies.