In [48]:
# !pip3 install pinecone-client --upgrade
# !pip3 install openai
# !pip3 install tiktoken
# !pip3 install protobuf
# !pip3 install --upgrade langchain
# !pip3 install "langchain[docarray]"
# !pip3 install utils
# !pip3 install -U setuptools
# !pip3 install python-dotenv
# !pip3 install pinecone-client google-api-python-client grpcio
# !pip3 install grpcio-tools
# !pip3 uninstall protobuf


In [49]:
import os
import json
from openai import OpenAI
import tiktoken
from IPython.display import display, Markdown
from dotenv import load_dotenv

from pinecone import Pinecone, ServerlessSpec

##### Setting up OpenAI

In [50]:
# Load variables from a .env file (python-dotenv) into the process environment.
load_dotenv()

# OpenAI credentials; each value is None when its variable is not set.
openai_api = os.environ.get('OPENAI_API_KEY')
openai_org = os.environ.get('OPENAI_API_ORG')

# OpenAI client shared by the embedding and chat-completion cells.
client = OpenAI(api_key=openai_api, organization=openai_org)

##### Setting up Pinecone

In [51]:
# Pinecone client, authenticated with the API key read from the environment
# (None when PINECONE_API_KEY is not set).
pinecone_api = os.environ.get('PINECONE_API_KEY')
pc = Pinecone(api_key=pinecone_api)

##### Indexing Pinecone

In [52]:
# Pinecone index and namespace that hold the per-course embeddings.
index_name = "course-index"
namespace = 'all-courses-namespace'

# Create the serverless index only when it does not exist yet; later runs reuse it.
index_already_exists = index_name in pc.list_indexes().names()
if not index_already_exists:
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        # NOTE(review): presumably the output size of the embedding model used
        # by get_embedding — confirm.
        dimension=1536,
        metric="cosine",
        spec=serverless_spec,
    )

# Handle used by the upsert and query cells.
index = pc.Index(index_name)

In [53]:
def _load_json(path):
    """Parse the JSON file at ``path`` and return the resulting object.

    The file is decoded explicitly as UTF-8 so that the result does not depend
    on the platform's default locale encoding.
    """
    with open(path, "r", encoding="utf-8") as json_file:
        return json.load(json_file)


# Course data files, resolved relative to the notebook's working directory.
all_courses = _load_json('all_courses_latest.json')
spring_courses = _load_json('spring.json')
fall_courses = _load_json('fall.json')
course_info = _load_json('course_info.json')

In [54]:
# OpenAI embedding model that get_embedding passes to client.embeddings.create().
# NOTE(review): the Pinecone indexes in this notebook are created with
# dimension=1536 — confirm that matches this model's output size.
model_name = "text-embedding-3-small"

##### Functions for getting Embeddings and Upsert

In [55]:
def get_embedding(text):
    """Embed ``text`` with the OpenAI model named by the module-level ``model_name``.

    Returns the ``embedding`` attribute of the first item in the response's
    ``data`` list.
    """
    response = client.embeddings.create(input=text, model=model_name)
    first_item = response.data[0]
    return first_item.embedding

def num_tokens_from_string(string: str, encoding_name: str):
    """Count the tokens the tiktoken encoding ``encoding_name`` produces for ``string``."""
    tokens = tiktoken.get_encoding(encoding_name).encode(string)
    return len(tokens)

def upsert_vectors(vec, namespace, batch_size=100):
    """Upsert ``vec`` into the module-level Pinecone ``index`` in fixed-size batches.

    Args:
        vec: Sequence of vectors in a form accepted by ``index.upsert``; it must
            support ``len()`` and slicing.
        namespace: Namespace the vectors are written to.
        batch_size: Maximum number of vectors sent per ``index.upsert`` call.
            Defaults to 100, the value that was previously hard-coded.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.

    An empty ``vec`` results in no ``index.upsert`` call; only the final
    success message is printed.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    n = len(vec)
    n_batches = (n + batch_size - 1) // batch_size  # ceiling division

    # Step through the start offset of every batch; the slice clamps the end
    # of the last (possibly shorter) batch automatically.
    for batch_number, start_idx in enumerate(range(0, n, batch_size), start=1):
        batch_vectors = vec[start_idx:start_idx + batch_size]
        index.upsert(batch_vectors, namespace=namespace)
        print(f"Upserted batch {batch_number}/{n_batches}")
    print("Successfully upserted!!")

def documents_already_indexed(index, namespace):
    """Return True when ``namespace`` of ``index`` reports at least one vector.

    The count is read from ``index.describe_index_stats()``; a namespace that
    is absent from the stats, or that has no ``vector_count`` entry, counts as
    empty.
    """
    namespace_summaries = index.describe_index_stats()['namespaces']
    summary = namespace_summaries.get(namespace, {})
    vector_count = summary.get('vector_count', 0)
    return vector_count > 0

##### Upserting into Pinecone

###### NOTE: skip this if vectors are already upserted

In [56]:
# Embed and upsert every course only when the namespace holds no vectors yet,
# so re-running the notebook does not embed and upsert everything again.
if not documents_already_indexed(index, namespace):
    vectors = []

    for i, course in enumerate(all_courses):
        pk = course["pk"]
        embedding = get_embedding(course_info[pk]["text"])
        # Tuple layout: (vector id, values, metadata). The id is the course's
        # position in all_courses; the metadata keeps "pk" so query matches can
        # be mapped back to course_info entries.
        vectors.append((str(i), embedding, {"pk": pk}))

    upsert_vectors(vectors, namespace)
    print("Documents have been upserted successfully.")
else:
    print("Documents are already indexed.")


Documents are already indexed.


#### Querying with Manual retrieval 

##### Querying Pinecone

In [60]:
query = "What courses are taught by Prof Zoran Tiganj?"

try:
    query_vector = get_embedding(query)
except Exception as e:
    print("Error calling OpenAI Embedding API: ", e)
    # Stop here: without a fresh vector the search below would either fail with
    # a NameError (fresh kernel) or silently reuse a vector left over from an
    # earlier run of this cell.
    raise

# Retrieve the 20 nearest course vectors, including their stored metadata.
search_response = index.query(
    top_k=20,
    vector=query_vector,
    include_metadata=True,
    namespace='all-courses-namespace')

##### Instructions for GPT

In [61]:
# Map every match back to its course text through the "pk" stored in metadata.
pks = [match['metadata']['pk'] for match in search_response['matches']]
texts = [course_info[pk]['text'] for pk in pks]

# Label every retrieved text with "Context: ". Putting the label in the join
# separator left the first text unlabeled.
context_block = "\n\n---\n\n".join(f"Context: {text}" for text in texts)
augmented_query = context_block + "\n\n-----\n\n" + query

# System prompt for the chat completion (plain string: it has no placeholders).
primer = """You are Q&A bot. A highly intelligent system that answers
user questions based on the information provided by the user above
each question. If the information can not be found in the information
provided by the user you truthfully say "I don't know".
"""

##### Querying GPT

In [62]:
try:
    # The system message carries the instructions; the user message carries the
    # retrieved context followed by the question.
    chat_messages = [
        {"role": "system", "content": primer},
        {"role": "user", "content": augmented_query},
    ]
    res = client.chat.completions.create(model="gpt-4o", messages=chat_messages)
    answer = res.choices[0].message.content
    display(Markdown(f"\n{answer}"))
except Exception as e:
    print(f"Error with OpenAI Completion: {e}")


The following courses are taught by Prof. Zoran Tiganj:

1. **Course ID: CSCI-P556, Title: APPLIED MACHINE LEARNING**
   - Regular academic session, 3 course credits
   - Spring semester
   - In person LEC class
   - Meets on Tuesday, Thursday from 4:45 p.m.–6:00 p.m. at LU 1001
   - Notes: Total number of seats available for this component is 100, with 8 seats currently available and 0 seats on the waitlist.

In [63]:
# %pip install --upgrade --quiet  \
#     langchain-pinecone \
#     langchain-openai \
#     langchain \
#     langchain-community \
#     pinecone-notebooks

#### Querying with Automatic doc retrieval

In [64]:
# Course texts for the LangChain pipeline. The file is decoded explicitly as
# UTF-8 so the result does not depend on the platform's locale encoding.
with open('course_text.json', "r", encoding="utf-8") as json_file:
    course_texts = json.load(json_file)

##### Converting to Langchain Document supported by Pinecone

In [65]:
from langchain.docstore.document import Document

# Wrap the "page_content" of every entry in a LangChain Document.
# If metadata is needed later, also pass metadata=item["metadata"] to Document.
course_docs = []
for item in course_texts:
    course_docs.append(Document(page_content=item["page_content"]))

# Peek at the first two documents.
print(course_docs[:2])

[Document(page_content='The course with the course ID ILS-Z640, titled "SEMINAR IN INTELLECTUAL FREEDOM", is offered by the Department of Computer Science and Informatics at Indiana University. The course is part of the regular academic session with 3 course credits. This course is taught by the professors Rosenbaum H.\nThe course has the following classes offered in spring semester:\n1. The LEC class is a in person type class. This class meets on Thursday from 12:40 p.m.–3:10 p.m. at I 232. The class is taught by the professors Howard Rosenbaum (shorthand name: Rosenbaum H)and is currently open. The total number of seats available for this component is 15, with 7 seats currently available and 0 seats on the waitlist.\nNotes: Above class meets with ILS-Z 764.\n\n'), Document(page_content='The course with the course ID INFO-I301, titled "PRESENTATIONS FOR IT PROFESSIONALS", is offered by the Department of Computer Science and Informatics at Indiana University. The course is part of the 

In [66]:
from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec

# NOTE: this cell rebinds `namespace`, `pc`, `index_name` and `index`, which the
# manual-retrieval cells above also use. Re-running those cells afterwards will
# target this index instead of "course-index".
namespace = "all_course_texts"
embeddings = OpenAIEmbeddings()

pc = Pinecone(api_key=pinecone_api)

index_name = "langchain-index"
existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

# Create the index first: a handle can only be obtained for an index that
# already exists.
if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

# Obtained only after the create-if-missing step above, so the first run on a
# project without this index does not fail.
index = pc.Index(index_name)


if not documents_already_indexed(index, namespace):
    # Empty namespace: embed and upsert all course documents.
    docsearch = PineconeVectorStore.from_documents(documents=course_docs,
                                                index_name=index_name,
                                                embedding=embeddings,
                                                namespace=namespace,
                                                )
    print("Documents have been upserted successfully.")

else:
    # Vectors already present: attach to the existing index without re-embedding.
    docsearch = PineconeVectorStore(
        index_name=index_name,
        embedding=embeddings,
        namespace=namespace
    )
    print("Documents are already indexed.")


Documents are already indexed.


In [67]:
# Display the statistics of the current `index` (per-namespace vector counts).
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'all_course_texts': {'vector_count': 560}},
 'total_vector_count': 560}

##### Querying GPT

In [70]:
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA

# Chat model used both inside the retrieval chain and on its own.
llm = ChatOpenAI(openai_api_key=openai_api, model_name="gpt-4o", temperature=0.0)

# Question-answering chain that fetches its documents through the Pinecone
# vector store's retriever.
course_retriever = docsearch.as_retriever()
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=course_retriever)

query1 = "What are the courses taught by Professor Damir Cavar"

# Ask the same question with retrieval (qa) and without it (bare llm) to compare.
query1_with_knowledge = qa.invoke(query1)
query1_without_knowledge = llm.invoke(query1)

display(Markdown(query1_with_knowledge['result']))
display(query1_without_knowledge.content)

Professor Damir Cavar teaches the following courses at Indiana University:

1. **CSCI-B659: TOPICS ARTIFICIAL INTELLIGENCE**
   - **Class 1**: Meets on Monday, Wednesday from 4:45 p.m.–6:00 p.m. at BH 343. This class is currently closed with 0 seats available and 16 seats on the waitlist.
   - **Class 3**: Meets on Monday, Wednesday from 1:15 p.m.–2:30 p.m. at IF 0119. This class is currently closed with 0 seats available and 2 seats on the waitlist.

'As of my last update in October 2023, Professor Damir Cavar is known for his work in the fields of linguistics, computational linguistics, and natural language processing. However, the specific courses he teaches can vary by semester and institution. To get the most accurate and up-to-date information on the courses he is currently teaching, you should check the official website of the institution where he is employed or contact the relevant department directly.\n\nIf you provide the name of the institution where Professor Cavar is currently teaching, I might be able to give you more specific guidance on where to look for his course listings.'