In [None]:
import os
import time
import pandas as pd
import openai
import re
import requests
import sys
from num2words import num2words
import numpy as np
from openai.embeddings_utils import get_embedding, cosine_similarity
import tiktoken

# Credentials are read from environment variables rather than hard-coded;
# os.getenv returns None when a variable is not set.
openai.api_key = os.getenv("OPENAI_API_KEY") 
openai.organization = os.getenv("OPENAI_ORGANIZATION") 

start_time = time.time()
path = 'c:\\path_to_your_directory_with_files_to_ingest'

########### This takes care of removing metadata: lines are skipped until two
########### lines containing the delimiter below have been seen.
search_string = "---"
metadata_counter = 0
############
d = []      # one {'FILE NAME', 'CONTENT'} record per markdown file
text = ""   # kept defined for any later cell that still refers to it

for root, directories, files in os.walk(path, topdown=False):
    for file in files:
        if not file.lower().endswith(".md"):
            continue

        name = os.path.join(root, file)
        body_lines = []
        # `with` guarantees the handle is closed even if reading raises.
        with open(name, "r", encoding="utf-8") as f:
            for line in f:
                # Count delimiter lines until the metadata block is closed.
                if search_string in line and metadata_counter != 2:
                    metadata_counter += 1
                # After the second delimiter, keep every line that does not
                # itself start with the delimiter.
                if not line.startswith(search_string) and metadata_counter == 2:
                    body_lines.append(line)

        # NOTE: a file with fewer than two delimiter lines yields empty CONTENT,
        # because the counter never reaches 2.
        d.append({'FILE NAME': file, 'CONTENT': "".join(body_lines)})
        metadata_counter = 0

end_time = time.time()
duration = end_time - start_time

print ("Script Execution: ", duration)

In [None]:
# Build the working frame from the list of per-file records in `d`,
# then show it as the cell output.
df = pd.DataFrame(d)
df

In [None]:
def normalize_text(s, sep_token = " \n "):
    """Clean up document text before it is tokenized and embedded.

    Args:
        s: The input text.
        sep_token: Unused; kept so existing callers that pass it keep working.

    Returns:
        The text with whitespace runs collapsed to single spaces, literal
        ". ," sequences removed, doubled periods reduced to one, "#"
        characters removed, and leading/trailing whitespace stripped.
    """
    # Collapse every run of whitespace (spaces, tabs, newlines) to one space.
    # No newline survives this step, so no separate newline removal is needed.
    s = re.sub(r'\s+', ' ', s).strip()
    # Remove literal ". ," sequences. The dot is escaped: an unescaped "."
    # matches any character and would also delete the letter before " ,".
    s = re.sub(r"\. ,", "", s)
    s = s.replace("..", ".")
    s = s.replace(". .", ".")
    # Drop markdown heading markers.
    s = s.replace("#", "")

    return s.strip()

# Normalize every document's text in place of the raw content.
df['CONTENT'] = df["CONTENT"].apply(normalize_text)

| GENERATION |TOKENIZER    | MAX INPUT TOKENS| KNOWLEDGE CUTOFF|
|------------|-------------|-----------------|-----------------|
| V2         | cl100k_base | 8191            | Sep 2021        |
| V1         | GPT-2/GPT-3 | 2046            | Aug 2020        |


https://beta.openai.com/docs/guides/embeddings/what-are-embeddings

https://openai.com/blog/new-and-improved-embedding-model/

In [None]:
# cl100k_base is the tokenizer listed for the V2 embedding generation.
tokenizer = tiktoken.get_encoding("cl100k_base")


def _count_tokens(content):
    """Return the number of tokens `tokenizer` produces for `content`."""
    return len(tokenizer.encode(content))


df['n_tokens'] = df["CONTENT"].apply(_count_tokens)
df

In [None]:
# Based on https://openai.com/api/pricing/ on 01/29/2023
# If you were using this for approximating pricing with Azure OpenAI adjust the values below with: https://azure.microsoft.com/pricing/details/cognitive-services/openai-service/

#MODEL	USAGE
#Ada     v1	$0.0040 / 1K tokens
#Babbage v1	$0.0050 / 1K tokens
#Curie   v1	$0.0200 / 1K tokens
#Davinci v1	$0.2000 / 1K tokens

#MODEL	USAGE
#Ada     v2	$0.0004 / 1K tokens
#This Ada model, text-embedding-ada-002, is a better and lower cost replacement for our older embedding models. 

n_tokens_sum = df['n_tokens'].sum()


def _embedding_cost(total_tokens, usd_per_1k_tokens):
    """Return the USD cost of embedding `total_tokens` at a per-1K-token price."""
    return (total_tokens / 1000) * usd_per_1k_tokens


ada_v1_embeddings_cost = _embedding_cost(n_tokens_sum, .0040)
babbage_v1_embeddings_cost = _embedding_cost(n_tokens_sum, .0050)
curie_v1_embeddings_cost = _embedding_cost(n_tokens_sum, .02)
davinci_v1_embeddings_cost = _embedding_cost(n_tokens_sum, .2)

ada_v2_embeddings_cost = _embedding_cost(n_tokens_sum, .0004)

print("Number of tokens: " + str(n_tokens_sum) + "\n")

print("MODEL        VERSION    COST")
print("-----------------------------------")
# Costs are printed with fixed-point formatting. Truncating str(cost) to eight
# characters can cut the exponent off a value rendered in scientific notation
# (e.g. "1.3200000000000001e-05" -> "1.320000"), misreporting the cost.
for model_name, version, cost in (
    ("Ada", "v1", ada_v1_embeddings_cost),
    ("Babbage", "v1", babbage_v1_embeddings_cost),
    ("Curie", "v1", curie_v1_embeddings_cost),
    ("Davinci", "v1", davinci_v1_embeddings_cost),
    ("Ada", "v2", ada_v2_embeddings_cost),
):
    print(model_name + "\t\t" + version + "\t$" + '%.6f' % cost)

In [None]:
# Number of documents currently in the frame.
len(df)

In [None]:
# Documents at or above this token count are dropped before embedding.
MAX_TOKENS_PER_DOC = 3500

# .copy() makes the filtered frame independent of the original, so columns
# assigned to `df` later are not written to a slice (which would raise
# pandas' SettingWithCopyWarning).
df = df[df.n_tokens < MAX_TOKENS_PER_DOC].copy()

len(df)

# OpenAI Text & Embedding Rate Limits?

Rate limits are enforced at the **organization level, not user level**, based on the specific endpoint used as well as the type of account you have. 

Rate limits are measured in two ways: **RPM (requests per minute)** and **TPM (tokens per minute)**. 

## TEXT & EMBEDDING

- Free trial users: 20 RPM, 150,000 TPM

- Pay-as-you-go users (first 48 hours): 60 RPM, 250,000 TPM

- Pay-as-you-go users (after 48 hours): 3,000 RPM, 250,000 TPM

https://beta.openai.com/docs/guides/rate-limits/overview

In [None]:
import time
from IPython.display import clear_output

# Module-level state read and updated by generate_embeddings.
# Requests sent since the counter was last reset to zero.
request_counter = 0
# Running total of requests sent; never reset.
total_requests_sent = 0
# Once request_counter reaches this value, generate_embeddings sleeps for 60 seconds.
rate_limit= 60

def generate_embeddings(text, model="text-embedding-ada-002"):
    """Request an embedding for `text`, pausing periodically to throttle requests.

    Uses the module-level `request_counter`, `total_requests_sent` and
    `rate_limit`. Each call bumps both counters and prints them; when
    `request_counter` reaches `rate_limit` the call sleeps 60 seconds and
    resets `request_counter` before sending its request.

    Args:
        text: The text to embed. An empty string is replaced with "blank".
        model: Name of the embedding model passed to the API.

    Returns:
        The 'embedding' entry of the first item in the API response's 'data'.
    """
    global request_counter, rate_limit, total_requests_sent

    # Replace the previous call's progress printout instead of appending to it.
    clear_output(wait=True)

    if text == "":
        print("Blank content field detected")
        text = "blank"

    if request_counter < rate_limit:
        request_counter += 1
        total_requests_sent += 1
        print("Request counter: ", request_counter)
        print("Total requests sent: ", total_requests_sent)

    if request_counter == rate_limit:
        print("Sleeping for 60 seconds")
        time.sleep(60)
        request_counter = 0

    api_response = openai.Embedding.create(input=[text], model=model)
    first_item = api_response['data'][0]
    return first_item['embedding']
 
# Embed every document; one API request is made per row.
df['ada_v2_embedding'] = df['CONTENT'].apply(
    lambda content: generate_embeddings(content, model='text-embedding-ada-002')
)

In [None]:
# Display the frame as the cell output.
df

In [None]:
# search embedded docs based on cosine similarity

def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding the API produces for `text` with `model`.

    Note: this definition shadows the `get_embedding` imported from
    `openai.embeddings_utils` at the top of the notebook.
    """
    api_response = openai.Embedding.create(input=[text], model=model)
    return api_response['data'][0]['embedding']

def search_docs(df, user_query, top_n=3, to_print=True):
    """Rank the rows of `df` by cosine similarity to `user_query`.

    Side effect: writes a "similarities" column onto the `df` passed in.

    Args:
        df: Frame with an `ada_v2_embedding` column.
        user_query: The query text to embed and compare against.
        top_n: How many of the best-scoring rows to return.
        to_print: When true, display the result before returning it.

    Returns:
        The `top_n` rows with the highest similarity, best first.
    """
    query_vector = get_embedding(user_query, model="text-embedding-ada-002")
    df["similarities"] = df.ada_v2_embedding.apply(
        lambda doc_vector: cosine_similarity(doc_vector, query_vector)
    )

    ranked = df.sort_values("similarities", ascending=False)
    top_matches = ranked.head(top_n)

    if to_print:
        display(top_matches)
    return top_matches

# Read the query text from the user.
question = input("How can I help you?\n\n")

# Rank the embedded documents against the query and keep the three best matches.
res = search_docs(df, question, top_n=3)

In [None]:
# Text of the first (highest-ranked) row in `res`; raises IndexError if `res` is empty.
res.CONTENT.values[0]

In [None]:
def search_docs(df, user_query, top_n=3, to_print=True):
    """Return the `top_n` rows of `df` most similar to `user_query`.

    This version does not display anything: `to_print` is accepted for
    signature compatibility but is not used. Side effect: writes a
    "similarities" column onto the `df` passed in.

    Args:
        df: Frame with an `ada_v2_embedding` column.
        user_query: The query text to embed and compare against.
        top_n: How many of the best-scoring rows to return.
        to_print: Unused.

    Returns:
        The `top_n` rows with the highest similarity, best first.
    """
    query_embedding = get_embedding(user_query, model="text-embedding-ada-002")

    def _score(doc_embedding):
        return cosine_similarity(doc_embedding, query_embedding)

    df["similarities"] = df.ada_v2_embedding.apply(_score)
    return df.sort_values("similarities", ascending=False).head(top_n)

# Ask first, then retrieve context for the question that will actually be
# answered (searching with an earlier cell's `question` would pair the new
# question with unrelated context).
ai_question = input("How can I help you?\n\n")
res = search_docs(df, ai_question, top_n=1)

context = res.CONTENT.values
completion_model = 'text-davinci-003'

initial_prompt = "The following is a conversation with an AI assistant. The assistant is helpful, creative, clever, and very friendly."

# Join the retrieved document text itself. str(context) would insert the
# array's repr (brackets and quotes) into the prompt.
context_text = "\n".join(str(document) for document in context)

# Blank lines separate the instructions, the context and the question.
combined_prompt = initial_prompt + "\n\n" + context_text + "\n\nQ: " + ai_question
response = openai.Completion.create(model=completion_model, prompt=combined_prompt, max_tokens=100)
ai_response = response['choices'][0]['text'].replace('\n', '').replace(' .', '.').strip()

print("\n" + ai_response)