In [None]:
# Import Dependencies
import json
import os
import warnings
from ast import literal_eval

import clickhouse_connect
import openai
import pandas as pd
import tiktoken
from tqdm.auto import tqdm

# Warning Suppression
warnings.filterwarnings(action="ignore", message="unclosed", category=ResourceWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [None]:
# Configurations
EMBEDDING_MODEL = "text-embedding-ada-002"
GPT_MODEL = "gpt-3.5-turbo"
BATCH_SIZE = 2000  # Number of text chunks sent per embedding request
characters_per_index = 32750 // 16 # Characters per API Limit

# Clickhouse Connection
# Connection settings are read from the environment so that no secret lives in
# the notebook. Host, port and username fall back to the values used so far;
# the password has no fallback and raises KeyError when MYSCALE_PASSWORD is unset.
# NOTE: the password and API key that used to be written here in plain text
# should be treated as compromised and rotated.
client = clickhouse_connect.get_client(
    host=os.environ.get("MYSCALE_HOST", 'msc-37984436.us-east-1.aws.myscale.com'),
    port=int(os.environ.get("MYSCALE_PORT", 8443)),
    username=os.environ.get("MYSCALE_USERNAME", 'lcai99'),
    password=os.environ["MYSCALE_PASSWORD"],
)

# Keys
# Raises KeyError when OPENAI_API_KEY is unset, instead of shipping a key in the file.
openai.api_key = os.environ["OPENAI_API_KEY"]

# File Paths
filename = 'prompt-completion-pairs-combined.json' # File containing prompt-completion pairs for training
SAVE_PATH = "combinedeigth.csv"

In [None]:
def split_string(string: str, chunk_size: int) -> list[str]:
    """
    Splits a string into consecutive chunks of at most chunk_size characters

    Args:
        string (str): String to split
        chunk_size (int): Maximum size of each chunk; must be a positive integer

    Returns:
        list[str]: Chunks in their original order. The last chunk may be shorter
            than chunk_size, and an empty string yields an empty list.

    Raises:
        ValueError: If chunk_size is not positive
    """
    # A negative step makes range() empty, which would silently drop the whole
    # input; a zero step fails inside range() with a less helpful message.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    return [string[start:start + chunk_size] for start in range(0, len(string), chunk_size)]

In [None]:
def create_string_list(filename: str, characters_per_index: int) -> list:
    """
    Reads a whole text file and cuts its content into fixed-size strings

    Args:
        filename (str): Name of file to read
        characters_per_index (int): Number of characters per string

    Returns:
        list: The file content as returned by split_string, in file order
    """
    # Decode explicitly as UTF-8: without an encoding argument, open() uses the
    # platform's locale encoding, so the same file could decode differently
    # (or fail to decode) from one machine to another.
    with open(filename, 'r', encoding='utf-8') as file:
        content = file.read()

    return split_string(content, characters_per_index)

In [None]:
# Load Data
all_chunks = create_string_list(filename, characters_per_index)

# Split Data into Segments
NUM_SEGMENTS = 15  # Number of roughly equal parts the chunk list is divided into
SEGMENT_INDEX = 0  # Which of those parts this run embeds

# The integer boundaries i * n // NUM_SEGMENTS partition the list exactly: the
# last boundary equals n, so the final segment already contains the remainder.
# Extending it with the tail again would store those chunks twice.
segmented_data = [
    all_chunks[i * len(all_chunks) // NUM_SEGMENTS: (i + 1) * len(all_chunks) // NUM_SEGMENTS]
    for i in range(NUM_SEGMENTS)
]

# Store Segments Into Data
data = segmented_data[SEGMENT_INDEX]

In [None]:
# Create Embeddings
embeddings = []

for batch_start in range(0, len(data), BATCH_SIZE):
    # Clamp to the data length so the final, shorter batch reports its real last index
    batch_end = min(batch_start + BATCH_SIZE, len(data))
    batch = data[batch_start:batch_end]

    # Progress Indicator
    print(f"Batch {batch_start} to {batch_end-1}")

    # Create Embeddings using OpenAI API
    response = openai.Embedding.create(model=EMBEDDING_MODEL, input=batch)
    returned = response["data"]

    # Ensure Embeddings are in Same Order as Input
    # Raised explicitly (not asserted) so the checks still run under `python -O`,
    # and a short response is caught here rather than when the dataframe is built.
    if len(returned) != len(batch):
        raise ValueError(
            f"Expected {len(batch)} embeddings for batch starting at {batch_start}, got {len(returned)}"
        )
    for position, item in enumerate(returned):
        if item["index"] != position:
            raise ValueError(
                f"Embedding at position {position} has index {item['index']} "
                f"in batch starting at {batch_start}"
            )

    # Extract Embeddings and Append to List
    embeddings.extend(item["embedding"] for item in returned)

# Dataframe of Embeddings
df = pd.DataFrame({"text": data, "embedding": embeddings})

In [None]:
# Save Embeddings to CSV
# Write the dataframe to SAVE_PATH without the index column.
df.to_csv(path_or_buf=SAVE_PATH, index=False)

In [None]:
# Read Embeddings from CSV to Dataframe
# Use the configured SAVE_PATH so the file read here is always the one the
# notebook wrote, even if the path is changed in the configuration cell.
df = pd.read_csv(SAVE_PATH)

# The CSV stores each embedding as the text of a Python list. literal_eval
# parses it back into a list of floats; stripping brackets and splitting on ','
# would leave a list of strings.
df['embedding'] = df['embedding'].apply(literal_eval)

In [None]:
### Store Embeddings in ClickHouse ###
# Positional access so this does not depend on the dataframe's index labels
embedding_len = len(df['embedding'].iloc[0]) # 1536

# ClickHouse SQL Query to Create Table
client.command(f"""
CREATE TABLE IF NOT EXISTS default.hopkins_art
(
    id        UInt32,
    text      String,
    embedding Array(Float32),
    CONSTRAINT cons_embedding_len CHECK length(embedding) = {embedding_len},
    VECTOR INDEX article_content_index embedding TYPE HNSWFLAT('metric_type=Cosine')
)
ENGINE = MergeTree ORDER BY id
""")

# Insert Embeddings into ClickHouse in Batches
batch_size = 100
total_records = len(df)
# First row to insert. 0 loads every record; raise it only to resume a load that
# was interrupted part-way (a fixed non-zero offset skips rows on a fresh run).
insert_start = 0

# Convert Dataframe Rows to a List of Tuples
# Kept under its own name so the list of text chunks in `data` is not overwritten.
records = df.to_records(index=False).tolist()
column_names = df.columns.tolist()

# NOTE(review): column_names comes from the dataframe. If it has no `id` column,
# the table's `id` is left to ClickHouse's column default - confirm this is intended.

# Batch Insertion
for i in tqdm(range(insert_start, total_records, batch_size)):
    i_end = min(i + batch_size, total_records)
    client.insert("default.hopkins_art", records[i:i_end], column_names=column_names)

# Verify
print(f"articles count: {client.command('SELECT count(*) FROM default.hopkins_art')}") # Check Count of Data
get_index_status="SELECT status FROM system.vector_indices WHERE name='article_content_index'"
print(f"index build status: {client.command(get_index_status)}") # Check Index Status (Vector Index is Built)

In [None]:
def strings_ranked_by_relatedness(query: str, top_k: int = 10) -> list[str]:
    """
    Returns the stored texts closest to the given query, nearest first

    Args:
        query (str): Query string
        top_k (int): Maximum number of texts to return (default 10)

    Returns:
        list[str]: The `text` column of the matching rows, ordered by ascending
            distance between the stored embedding and the query embedding
    """

    # Creates Embedding Vector from Query
    # Uses the configured EMBEDDING_MODEL rather than a repeated literal, so the
    # query is always embedded with the model named in the configuration cell.
    embed = openai.Embedding.create(
        input=query,
        model=EMBEDDING_MODEL,
    )["data"][0]["embedding"]

    # Query for Top K Similar Cases
    # The list of floats is interpolated as an array literal; top_k is coerced
    # to int so only a number can reach the LIMIT clause.
    results = client.query(f"""
        SELECT id, text, distance(embedding, {embed}) as dist
        FROM default.hopkins_art
        ORDER BY dist
        LIMIT {int(top_k)}
    """)

    # Top K Results
    # Return only the text of each row: callers format the items straight into a
    # prompt, where a whole row dict would show up as its repr (id, text, dist).
    return [row["text"] for row in results.named_results()]

In [None]:
def num_tokens(text: str, model: str = GPT_MODEL) -> int:
    """
    Count the tokens a string encodes to under a model's tiktoken encoding

    Args:
        text (str): String to tokenize
        model (str): Model name used to look up the encoding (defaults to GPT_MODEL)

    Returns:
        int: Length of the encoded token sequence
    """
    token_ids = tiktoken.encoding_for_model(model).encode(text)
    return len(token_ids)

In [None]:
def query_message(query: str, model: str, token_budget: int) -> str:
    """
    Build the GPT prompt: an instruction header, as many related article
    sections as fit in the token budget, and finally the question itself.

    Args:
        query (str): Query string
        model (str): Model name passed to num_tokens when measuring the prompt
        token_budget (int): Maximum token count allowed for the finished prompt

    Returns:
        str: Message for GPT
    """
    # Prompt (TODO: Hallucinate for better prompts)
    header = 'Use the below website information below to answer questions about the Johns Hopkins University. If the website information does not specify enough information, use previous knowledge to answer the question.'
    footer = f"\n\nQuestion: {query}"

    prompt = header
    # Candidates arrive ranked by relatedness; stop at the first one that would
    # push header + sections + question over the budget.
    for candidate in strings_ranked_by_relatedness(query):
        section = f'\n\nJohns Hopkins article section:\n"""\n{candidate}\n"""'
        if num_tokens(prompt + section + footer, model=model) > token_budget:
            break
        prompt += section

    return prompt + footer

In [None]:
def ask(query: str, model: str = GPT_MODEL, token_budget: int = 4096 - 500, print_message: bool = True,) -> str:
    """
    Answer a query with a chat model, using a prompt assembled by query_message.

    Args:
        query (str): Query string
        model (str): GPT model to use
        token_budget (int): Token limit handed to query_message for the prompt
        print_message (bool): When True, print the assembled prompt before sending it

    Returns:
        str: Content of the first choice in the chat completion response
    """
    prompt = query_message(query, model=model, token_budget=token_budget)
    if print_message:
        print(prompt)

    system_turn = {"role": "system", "content": "You answer questions about student affairs at the Johns Hopkins University"}
    user_turn = {"role": "user", "content": prompt}

    # temperature=0 is passed through unchanged from the original call
    completion = openai.ChatCompletion.create(
        model=model,
        messages=[system_turn, user_turn],
        temperature=0
    )
    return completion["choices"][0]["message"]["content"]