In [None]:
# https://pypi.org/project/pypandoc/
# pip install pypandoc_binary 

import pypandoc
import os
import time
import sys

start_time=time.time()
path ='c:\\temp_storage\\openai\\all-docs'

for root, directories, files in os.walk(path , topdown=False):
    for file in files:
        if file.lower().endswith(".docx"):
            name =(os.path.join(root,file))
            pypandoc.convert_file(name, 'plain', outputfile=name[:-5] +".txt")
end_time = time.time()
duration = end_time - start_time

print ("Cell Run Time: ", duration)

In [8]:
import os
import time
import pandas as pd
import openai
import re
import requests
import sys
from num2words import num2words
import numpy as np
from openai.embeddings_utils import get_embedding, cosine_similarity
import tiktoken

openai.api_key = os.getenv("OPENAI_API_KEY") 
openai.organization = os.getenv("OPENAI_ORGANIZATION") 

start_time=time.time()
path ='c:\\temp_storage\\openai\\docs' #example: 'c:\\openai\\test'

d = []
text=""

for root, directories, files in os.walk(path , topdown=False):
    for file in files:
        if file.lower().endswith(".txt"):
            name =(os.path.join(root,file))
            f = open(name, "r",encoding="utf-8")
            for line in f:
                text +=line
            f.close()
            d.append({'FILE NAME': file ,'CONTENT': text})
            pd.DataFrame(d)
            metadata_counter = 0
            text=""
end_time = time.time()
duration = end_time - start_time

In [9]:
df = pd.DataFrame(d)
df

Unnamed: 0,FILE NAME,CONTENT
0,DIJKSTRA EWD498 How Do We Tell Truths that Mig...,EWD498 How Do We Tell Truths that Might Hurt? ...
1,Bentley 1988 More Programming Pearls.txt,PART I: PROGRAMMING TECHNIQUES I don't have th...
2,The Art of Computer Programming v1.txt,Data usually has much more structural informat...
3,Tucker 2004 Computer Science.txt,Computer Science: The Discipline and its Impac...
4,Chaitin Thinking about Godel and Turing.txt,On the difficulty of computations Two practica...
5,NPHard Problems.txt,Approximation algorithms have developed in res...
6,On Numbers and Games Second Edition.txt,All Numbers Great and Small Whatever is notfor...
7,Peters 2015 Game Theory.txt,1.3.1 Zero-Sum Games The first example is base...


In [10]:
# s is input text
def normalize_text(s, sep_token = " \n "):
    s = re.sub(r'\s+',  ' ', s).strip()
    s = re.sub(r". ,","",s)
    # remove all instances of multiple spaces
    s = s.replace("..",".")
    s = s.replace(". .",".")
    s = s.replace("\n", "")
    s = s.replace("#","")
    s = s.strip()
    
    return s

df['CONTENT'] = df["CONTENT"].apply(lambda x : normalize_text(x))

| GENERATION |TOKENIZER    | MAX INPUT TOKENS| KNOWLEDGE CUTOFF|
|------------|-------------|-----------------|-----------------|
| V2         | cl100k_base | 8191            | Sep 2021        |
| V1         | GPT-2/GPT-3 | 2046            | Aug 2020        |


https://beta.openai.com/docs/guides/embeddings/what-are-embeddings

https://openai.com/blog/new-and-improved-embedding-model/

In [11]:
tokenizer = tiktoken.get_encoding("cl100k_base")
df['n_tokens'] = df["CONTENT"].apply(lambda x: len(tokenizer.encode(x)))
df

Unnamed: 0,FILE NAME,CONTENT,n_tokens
0,DIJKSTRA EWD498 How Do We Tell Truths that Mig...,EWD498 How Do We Tell Truths that Might Hurt? ...,954
1,Bentley 1988 More Programming Pearls.txt,PART I: PROGRAMMING TECHNIQUES I don't have th...,2187
2,The Art of Computer Programming v1.txt,Data usually has much more structural informat...,639
3,Tucker 2004 Computer Science.txt,Computer Science: The Discipline and its Impac...,3262
4,Chaitin Thinking about Godel and Turing.txt,On the difficulty of computations Two practica...,3812
5,NPHard Problems.txt,Approximation algorithms have developed in res...,2484
6,On Numbers and Games Second Edition.txt,All Numbers Great and Small Whatever is notfor...,2331
7,Peters 2015 Game Theory.txt,1.3.1 Zero-Sum Games The first example is base...,3287


In [12]:
# Based on https://openai.com/api/pricing/ on 01/29/2023
# If you were using this for approximating pricing with Azure OpenAI adjust the values below with: https://azure.microsoft.com/pricing/details/cognitive-services/openai-service/

#MODEL	USAGE
#Ada     v1	$0.0040 / 1K tokens
#Babbage v1	$0.0050 / 1K tokens
#Curie   v1	$0.0200 / 1K tokens
#Davinci v1	$0.2000 / 1K tokens

#MODEL	USAGE
#Ada     v2	$0.0004 / 1K tokens
#This Ada model, text-embedding-ada-002, is a better and lower cost replacement for our older embedding models. 

n_tokens_sum = df['n_tokens'].sum()

ada_v1_embeddings_cost = (n_tokens_sum/1000) *.0040
babbage_v1_embeddings_cost = (n_tokens_sum/1000) *.0050
curie_v1_embeddings_cost = (n_tokens_sum/1000) *.02
davinci_v1_embeddings_cost = (n_tokens_sum/1000) *.2

ada_v2_embeddings_cost = (n_tokens_sum/1000) *.0004

print("Number of tokens: " + str(n_tokens_sum) + "\n")

print("MODEL        VERSION    COST")
print("-----------------------------------")
print("Ada" + "\t\t" + "v1" + "\t$" + '%.8s' % str(ada_v1_embeddings_cost))
print("Babbage" + "\t\t" + "v1" + "\t$" + '%.8s' % str(babbage_v1_embeddings_cost))
print("Curie" + "\t\t" + "v1" + "\t$" + '%.8s' % str(curie_v1_embeddings_cost))
print("Davinci" + "\t\t" + "v1" + "\t$" + '%.8s' % str(davinci_v1_embeddings_cost))
print("Ada" + "\t\t" + "v2" + "\t$" + '%.8s' %str(ada_v2_embeddings_cost))

Number of tokens: 18956

MODEL        VERSION    COST
-----------------------------------
Ada		v1	$0.075824
Babbage		v1	$0.09478
Curie		v1	$0.37912
Davinci		v1	$3.7912
Ada		v2	$0.007582


In [13]:
def generate_embeddings(text, model="text-embedding-ada-002"):
    return openai.Embedding.create(input = [text], model=model)['data'][0]['embedding']
 
df['ada_v2_embedding'] = df.CONTENT.apply(lambda x: generate_embeddings(x, model='text-embedding-ada-002'))

In [14]:
df

Unnamed: 0,FILE NAME,CONTENT,n_tokens,ada_v2_embedding
0,DIJKSTRA EWD498 How Do We Tell Truths that Mig...,EWD498 How Do We Tell Truths that Might Hurt? ...,954,"[0.0077162147499620914, -0.0008686786168254912..."
1,Bentley 1988 More Programming Pearls.txt,PART I: PROGRAMMING TECHNIQUES I don't have th...,2187,"[0.0070382943376898766, 0.0012975081335753202,..."
2,The Art of Computer Programming v1.txt,Data usually has much more structural informat...,639,"[-0.0035451934672892094, 0.030073734000325203,..."
3,Tucker 2004 Computer Science.txt,Computer Science: The Discipline and its Impac...,3262,"[0.016959402710199356, -0.010407204739749432, ..."
4,Chaitin Thinking about Godel and Turing.txt,On the difficulty of computations Two practica...,3812,"[-0.026883335784077644, -0.024646427482366562,..."
5,NPHard Problems.txt,Approximation algorithms have developed in res...,2484,"[0.015040451660752296, 0.007307712454348803, -..."
6,On Numbers and Games Second Edition.txt,All Numbers Great and Small Whatever is notfor...,2331,"[-0.001979722874239087, -0.0022908940445631742..."
7,Peters 2015 Game Theory.txt,1.3.1 Zero-Sum Games The first example is base...,3287,"[-0.023110942915081978, -0.02227030321955681, ..."


In [15]:
# search embedded docs based on cosine similarity

def get_embedding(text, model="text-embedding-ada-002"):
   return openai.Embedding.create(input = [text], model=model)['data'][0]['embedding']

def search_docs(df, user_query, top_n=3, to_print=True):
    embedding = get_embedding(
        user_query,
        model="text-embedding-ada-002"
    )
    df["similarities"] = df.ada_v2_embedding.apply(lambda x: cosine_similarity(x, embedding))

    res = (
        df.sort_values("similarities", ascending=False)
        .head(top_n)
    )
    if to_print:
        display(res)
    return res

question = input("How can I help you?\n\n")

res = search_docs(df, question, top_n=3)

How can I help you?

 Dangers of BASIC?


Unnamed: 0,FILE NAME,CONTENT,n_tokens,ada_v2_embedding,similarities
0,DIJKSTRA EWD498 How Do We Tell Truths that Mig...,EWD498 How Do We Tell Truths that Might Hurt? ...,954,"[0.0077162147499620914, -0.0008686786168254912...",0.786783
1,Bentley 1988 More Programming Pearls.txt,PART I: PROGRAMMING TECHNIQUES I don't have th...,2187,"[0.0070382943376898766, 0.0012975081335753202,...",0.76415
3,Tucker 2004 Computer Science.txt,Computer Science: The Discipline and its Impac...,3262,"[0.016959402710199356, -0.010407204739749432, ...",0.747116


In [16]:
res.CONTENT.values[0]

'EWD498 How Do We Tell Truths that Might Hurt? Sometimes we discover unpleasant truths. Whenever we do so, we are in difficulties: suppressing them is scientifically dishonest, so we must tell them, but telling them, however, will fire back on uso If the truths are sufficiently unpalatable, our audience is psycbically incapable of accepting them and we will be written off as totally unrealistic, hopelessly idealistic, dangerously revolutionary, foolishly gullible or what have you. (Besides that, telling such truths is a sure way of making oneself unpopular in many circles, and, as such, it is an act that, in general, is not without personal risks. Vide Galileo Galilei .. ) Computing Science seems to suffer severely from tbis conflict. On the whole, it remains silent and tries to escape tbis conflict by sbifting its attention. (For instance: with respect to COBOL you can really do only one of two tbings: fight the disease or pretend that it does not exist. Most Computer Science Departme

In [17]:
def search_docs(df, user_query, top_n=3, to_print=True):
    embedding = get_embedding(
        user_query,
        model="text-embedding-ada-002"
    )
    df["similarities"] = df.ada_v2_embedding.apply(lambda x: cosine_similarity(x, embedding))

    res = (
        df.sort_values("similarities", ascending=False)
        .head(top_n)
    )
    return res

res = search_docs(df, question, top_n=1)

ai_question = input("How can I help you?\n\n")


context= res.CONTENT.values
completion_model='text-davinci-003'

initial_prompt = "The following is a conversation with an AI assistant. The assistant is helpful, creative, clever, and very friendly."

combined_prompt = initial_prompt + str(context) + "Q: " + ai_question
response = openai.Completion.create(model=completion_model, prompt=combined_prompt, max_tokens=100)
ai_response = response['choices'][0]['text'].replace('\n', '').replace(' .', '.').strip()

print("\n"+ ai_response)

How can I help you?

 Dangers of BASIC?



A: BASIC can lead to the mental mutilation of potential programmers. Its teaching should be regarded as a criminal offense. It is a language that promotes bad programming habits and limits one's thinking abilities. Additionally, using BASIC can be inefficient and expensive in the long run.


In [18]:
print(combined_prompt)

The following is a conversation with an AI assistant. The assistant is helpful, creative, clever, and very friendly.['EWD498 How Do We Tell Truths that Might Hurt? Sometimes we discover unpleasant truths. Whenever we do so, we are in difficulties: suppressing them is scientifically dishonest, so we must tell them, but telling them, however, will fire back on uso If the truths are sufficiently unpalatable, our audience is psycbically incapable of accepting them and we will be written off as totally unrealistic, hopelessly idealistic, dangerously revolutionary, foolishly gullible or what have you. (Besides that, telling such truths is a sure way of making oneself unpopular in many circles, and, as such, it is an act that, in general, is not without personal risks. Vide Galileo Galilei .. ) Computing Science seems to suffer severely from tbis conflict. On the whole, it remains silent and tries to escape tbis conflict by sbifting its attention. (For instance: with respect to COBOL you can 