In [9]:
import os
import glob
from dotenv import load_dotenv
import gradio as gr
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.vectorstores import FAISS
import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objects as go
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

In [10]:
# Chat model identifier; passed to ChatOpenAI when the LLM is created below.
MODEL = "gpt-4o-mini"
# Name for the FAISS vector store.
# NOTE(review): not referenced by any visible cell — presumably meant as an
# on-disk save/load path; confirm or remove.
db_name = "vector_db_FAISS"

In [11]:
# Pull variables from the .env file; override=True lets values in the file win
# over any already present in the process environment.
load_dotenv(override=True)

# Report whether a key was found. Only the first 8 characters are echoed so
# the full secret never lands in the notebook output.
openai_api_key = os.environ.get('OPENAI_API_KEY')
print(
    f"OpenAI API Key exists and begins {openai_api_key[:8]}"
    if openai_api_key
    else "OpenAI API Key not set"
)

OpenAI API Key exists and begins sk-proj-


### Read in documents using LangChain's loaders

In [12]:
# Each sub-folder of HII_Data/ is one document category; its name becomes the
# `doc_type` metadata attached to every document loaded from it.
#
# Only real directories are kept: the "*" pattern also matches plain files
# sitting directly in HII_Data/ (a README, an archive, ...), and handing one of
# those to DirectoryLoader makes the load fail.
# Sorting makes the folder order — and therefore the document/chunk order —
# the same on every run, since glob does not guarantee any ordering.
folders = sorted(path for path in glob.glob("HII_Data/*") if os.path.isdir(path))
text_loader_kwargs = {'encoding': 'utf-8'}

documents = []
for folder in folders:
    doc_type = os.path.basename(folder)
    # Recursively pick up every Markdown file in this category folder,
    # reading each one as UTF-8 text.
    loader = DirectoryLoader(folder, glob="**/*.md", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)
    folder_docs = loader.load()
    for doc in folder_docs:
        # Tag the document with its category so chunks can be grouped later.
        doc.metadata["doc_type"] = doc_type
        documents.append(doc)

In [13]:
# Break the loaded documents into chunks for embedding: target size 1000 with
# an overlap of 200 between neighbouring chunks. Metadata set on the documents
# (e.g. `doc_type`) is read back from the chunks in a later cell.
text_splitter = CharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
chunks = text_splitter.split_documents(documents=documents)

In [14]:
# Sanity check: how many chunks the splitter produced (shown as the cell output).
len(chunks)

8

In [15]:
# Collect the distinct categories present in the chunk metadata.
# A set is unordered, so the printed order may differ between runs.
doc_types = {piece.metadata['doc_type'] for piece in chunks}
print(f"Document types found: {', '.join(doc_types)}")

Document types found: Careers, ProgrammsandCourses, About, Student_Life, Profiles


In [16]:
# Embedding client with default settings; handed to FAISS below to vectorise the chunks.
# NOTE(review): presumably picks up OPENAI_API_KEY from the environment loaded earlier — confirm.
embeddings = OpenAIEmbeddings()

### Create Vector DB with FAISS

In [17]:
# Embed every chunk and build a FAISS-backed vector store from the results.
vectorstore = FAISS.from_documents(chunks, embedding=embeddings)

# Report the size of the underlying index: vector count and vector width.
total_vectors, dimensions = vectorstore.index.ntotal, vectorstore.index.d

print(f"There are {total_vectors} vectors with {dimensions:,} dimensions in the vector store")

There are 8 vectors with 1,536 dimensions in the vector store


### Chat Application (RAG pipeline with LangChain)

In [18]:
## Model: the chat LLM that writes the answers
llm = ChatOpenAI(model_name=MODEL, temperature=0.7)

## Memory: conversation history exposed to the chain under 'chat_history',
## returned as message objects rather than one flattened string
## NOTE(review): the cell output shows a warning pointing at this constructor —
## likely a deprecation notice; confirm before upgrading LangChain.
memory = ConversationBufferMemory(return_messages=True, memory_key='chat_history')

## Retriever: the vector store wrapped in the retriever interface the chain expects
retriever = vectorstore.as_retriever()

## Chain: ties the LLM, the retriever and the memory together into one RAG pipeline
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
)

  memory = ConversationBufferMemory(


In [19]:
# Smoke test: put one question through the chain and show the answer.
# NOTE(review): the chain was built with a memory object, so this exchange is
# presumably kept in the history seen by later calls — confirm.
query = "Can you describe HII in a few sentences"
result = conversation_chain.invoke(dict(question=query))
print(result["answer"])

Horizon International Institute (HII) is a globally recognized higher education center established in 1995, dedicated to providing transformative education through innovation, research, and strong industry partnerships. With a mission to deliver high-quality, accessible education and foster research and innovation, HII empowers students with knowledge, creativity, and leadership skills. The institute is accredited by the National Board of Higher Education (NBHE) and has international partnerships with over 35 universities across Europe, Asia, and North America.


In [20]:
def chat(message, history):
    """Answer one user message with the module-level RAG conversation chain.

    Args:
        message: The user's latest message; sent to the chain as "question".
        history: Conversation history supplied by the caller (the Gradio chat
            UI). It is not used here; nothing from it is passed to the chain.

    Returns:
        The value stored under "answer" in the chain's result.
    """
    return conversation_chain.invoke({"question": message})["answer"]

In [21]:
# Build the chat UI around `chat` (history in "messages" format) and start it,
# opening a browser tab on launch.
# NOTE(review): `view` receives whatever launch() returns, not the
# ChatInterface object itself — confirm that is what later code expects.
view = gr.ChatInterface(
    fn=chat,
    type="messages",
).launch(inbrowser=True)

* Running on local URL:  http://127.0.0.1:7862
* To create a public link, set `share=True` in `launch()`.
