# Setup

## Importing Libraries

In [1]:
from dotenv import load_dotenv
import os
from langchain_community.document_loaders import JSONLoader
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.storage import InMemoryByteStore
from langchain.embeddings import CacheBackedEmbeddings
from langchain_community.vectorstores import Qdrant

## Importing ENV Variables

In [19]:
# Read the project-level .env file (one directory above this notebook).
# The call is left as the last expression so the cell displays its return value.
load_dotenv(dotenv_path='../.env')

True

In [21]:
# Fail fast with an actionable message when the key is missing or empty.
# The previous form, os.environ[k] = os.getenv(k), was a no-op when the key
# existed and raised an opaque "TypeError: str expected, not NoneType" when it
# did not. The key is only checked, never bound to a notebook variable, so it
# cannot leak through namespace listings or cell output.
if not os.getenv('OPENAI_API_KEY'):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to ../.env or export it in the "
        "environment before running this notebook."
    )

## Setting Up Embeddings and In-Memory Cache

In [22]:
# In-memory byte store that backs the embedding cache.
store = InMemoryByteStore()

# Embedding model that the cache wraps.
underlying_embeddings = OpenAIEmbeddings()

# Cache-backed wrapper around the model. The model name is used as the cache
# namespace so that entries produced by different models are kept apart.
cached_embedder = CacheBackedEmbeddings.from_bytes_store(
    underlying_embeddings,
    store,
    namespace=underlying_embeddings.model,
)

In [23]:
# Materialise the keys currently held in the cache store so the cell shows them.
[*store.yield_keys()]

[]

## Loading JSON Data

In [24]:
# Build a loader over the Winter 2024 course file. The jq expression '.[]'
# selects each element of the top-level JSON array as a separate record.
# text_content=False lets the loader accept records that are not plain strings.
loader = JSONLoader(
    jq_schema='.[]',
    text_content=False,
    file_path='../data/Winter2024.json',
)

# Read the file and produce one Document per selected record.
json_data = loader.load()

In [25]:
# Number of Document objects returned by the loader.
len(json_data)

8779

In [26]:
# Print the first loaded Document to inspect its page_content and metadata.
print(json_data[0])

page_content='{"course_name": "CULTURAL ANTHROPOLOGY", "course_crn": "30106", "course_id": "ANTH 001", "course_section": "001", "course_pre": "", "course_description": "4 Units# Lecture# 3 hours; discussion# 1 hour. Explores the basic contributions of anthropology to the understanding of human behavior and culture and the explanation of similarities and differences among human societies. Addresses the relevance of materials drawn from tribal and peasant culture to problems of the modern world. Stresses the application of anthropological methods to research problems. Credit is awarded for one of the following  ANTH 001#  ANTH 001H# or ANTH 001W.", "course_type": "Lecture", "course_time": "2:00 pm - 3:20 pm", "course_days": "MW", "course_location": "Online", "course_dates": "Jan 08 2024 - Mar 15 2024", "course_type2": "Lecture", "course_professor": "Worku Nida", "null": null}' metadata={'source': '/Users/andrew/Desktop/Projects/CourseGPT/data/Winter2024.json', 'seq_num': 1}


## Splitting Data

In [27]:
# Character-based text splitter built with the library defaults (no arguments passed).
splitter = CharacterTextSplitter()

In [28]:
# Run every loaded Document through the splitter; the resulting chunks are what gets indexed.
text_chunks = splitter.split_documents(json_data)

## Creating Database

In [31]:
# Build the Chroma index through the cache-backed embedder instead of the raw
# OpenAI model. Passing `underlying_embeddings` here bypassed the cache set up
# earlier, so `store` was never populated and every rebuild of the index
# re-embedded all chunks through the API.
db = Chroma.from_documents(text_chunks, cached_embedder)

## Testing Similarity Search

In [36]:
# Smoke-test retrieval: run a natural-language question against the index
# and show the raw content of the top-ranked document.
query = "When is CS100 offered"
docs = db.similarity_search(query)

best_match = docs[0]
print(best_match.page_content)

{"course_name": "CONCURRENT ANALYTICAL STUDIES IN MEDIA AND CULTURAL STUDIES", "course_crn": "37152", "course_id": "MCS 292", "course_section": "S41", "course_pre": "", "course_description": "1 to 4 Units# Research# 3 to 12 hours. Prerequisite(s): graduate standing; consent of instructor and graduate advisor. To be taken concurrently with a 100-series course# but on an individual basis. Limited to research# criticism# and written work.  Normally graded Satisfactory (S) or No Credit (NC)# but students may petition the instructor for a letter grade if specialized topics are studied. Course is repeatable.", "course_type": "Research", "course_time": "TBA", "course_days": "", "course_location": "TBA", "course_dates": "Jan 08 2024 - Mar 15 2024", "course_type2": "Research", "course_professor": "Andrea L. Smith", "null": null}
