In [1]:
import json
import os

import numpy as np
import openai
import requests
import xmltodict
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from langchain.text_splitter import CharacterTextSplitter
from openai.embeddings_utils import get_embedding
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load environment variables from .env file
load_dotenv()

# Set up OpenAI API key: read it from the environment and hand it to the openai module.
# os.getenv returns None when OPENAI_API_KEY is unset, so a missing key is not reported here.
api_key = os.getenv("OPENAI_API_KEY")
openai.api_key = api_key


In [3]:
def extract_text_from(url, timeout=30):
    """Download a web page and return its text with blank lines removed.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before `requests` gives up.
            Without a timeout a single unresponsive host would block the
            whole crawl indefinitely.

    Returns:
        The text extracted by BeautifulSoup's ``get_text()``, with every line
        stripped of surrounding whitespace and empty lines dropped, joined
        with ``'\\n'``.
    """
    html = requests.get(url, timeout=timeout).text
    soup = BeautifulSoup(html, features="html.parser")
    stripped_lines = (line.strip() for line in soup.get_text().splitlines())
    return '\n'.join(line for line in stripped_lines if line)

def fetch_sitemap(url, timeout=30):
    """Download an XML sitemap and return it parsed by ``xmltodict``.

    Args:
        url: Address of the sitemap.
        timeout: Seconds to wait for the server before `requests` gives up.

    Returns:
        The structure produced by ``xmltodict.parse`` for the response body.

    Raises:
        requests.HTTPError: If the server answers with an error status, so an
            HTML error page is never handed to the XML parser.
    """
    response = requests.get(url, timeout=timeout)
    # Fail here rather than later with a confusing parse/KeyError on an error page.
    response.raise_for_status()
    return xmltodict.parse(response.text)

def get_relevant_pages(sitemap, keyword):
    """Fetch the text of every sitemap URL that contains ``keyword``.

    Args:
        sitemap: Parsed sitemap with a top-level ``'urlset'`` entry whose
            ``'url'`` items each carry a ``'loc'`` address.
        keyword: Substring a URL must contain to be fetched.

    Returns:
        A list of ``{'text': ..., 'source': url}`` dicts, one per matching URL,
        in sitemap order. Empty when the sitemap lists no URLs.

    Raises:
        KeyError: If ``sitemap`` has no ``'urlset'`` key.
    """
    entries = (sitemap['urlset'] or {}).get('url') or []
    # xmltodict returns a single dict (not a one-item list) when an element
    # occurs only once; iterating that dict would walk its keys instead.
    if isinstance(entries, dict):
        entries = [entries]

    pages = []
    for info in entries:
        url = info['loc']
        if keyword in url:
            pages.append({'text': extract_text_from(url), 'source': url})
    return pages


In [None]:
# Example usage: download and parse the sitemap, then fetch the text of every
# page whose URL contains 'international-students'.
sitemap_url = "https://www.rgu.ac.uk/index.php?option=com_jmap&view=sitemap&format=xml"
sitemap = fetch_sitemap(sitemap_url)
pages = get_relevant_pages(sitemap, 'international-students')

In [None]:
# pages[0]

In [None]:
def preprocess_text(pages, chunk_size=1500, separator="\n"):
    """Split each page's text into chunks and pair every chunk with its source.

    Args:
        pages: Iterable of dicts with ``'text'`` and ``'source'`` keys.
        chunk_size: ``chunk_size`` passed to ``CharacterTextSplitter``.
        separator: ``separator`` passed to ``CharacterTextSplitter``.

    Returns:
        ``(docs, metadatas)``: two lists of equal length where
        ``metadatas[i]`` is ``{"source": ...}`` for the page that produced
        ``docs[i]``.
    """
    text_splitter = CharacterTextSplitter(chunk_size=chunk_size, separator=separator)
    docs, metadatas = [], []
    for page in pages:
        splits = text_splitter.split_text(page['text'])
        docs.extend(splits)
        # Build one dict per chunk: `[d] * n` would repeat the *same* dict n
        # times, so editing one chunk's metadata would change all of them.
        metadatas.extend({"source": page['source']} for _ in splits)
    return docs, metadatas

# Example usage: chunk the fetched pages; docs[i] and metadatas[i] describe the same chunk.
docs, metadatas = preprocess_text(pages)

In [None]:
# docs

In [None]:
def generate_embeddings(docs):
    """Embed every document with the 'paraphrase-MiniLM-L6-v2' model.

    Args:
        docs: Iterable of text chunks.

    Returns:
        A list with one embedding per document, in input order. Empty when
        ``docs`` is empty (the model is not loaded in that case).
    """
    docs = list(docs)
    if not docs:
        return []
    model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
    # A single batched encode call replaces one forward pass per document;
    # iterating the result yields one vector per input, as before.
    return list(model.encode(docs))

# Embed every chunk; embeddings[i] is the vector for docs[i].
embeddings = generate_embeddings(docs)


In [None]:
# In-memory vector store: three parallel lists holding the chunks, their embeddings and their source metadata.
vector_store = {"documents": docs, "embeddings": embeddings, "metadatas": metadatas}

In [None]:
# Path of the JSON file holding a previously saved vector store.
# NOTE(review): 've' looks like a truncated file name — confirm the intended path.
file_path = 've'

# Read the JSON file and convert it back to a dictionary. This replaces any
# `vector_store` already in memory. The encoding is pinned to UTF-8 so the
# scraped text is decoded the same way on every platform.
with open(file_path, 'r', encoding='utf-8') as file:
    vector_store = json.load(file)


In [None]:
# Initialize the SentenceTransformer model once; the embedding helpers defined
# below in this cell read it as a module-level global.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

def generate_embeddings(docs):
    """Embed every document with the module-level ``model``.

    Args:
        docs: Iterable of text chunks.

    Returns:
        A list with one embedding per document, in input order. Empty when
        ``docs`` is empty.
    """
    docs = list(docs)
    if not docs:
        return []
    # A single batched encode call replaces one forward pass per document;
    # iterating the result yields one vector per input, as before.
    return list(model.encode(docs))

def generate_query_embedding(query):
    """Encode ``query`` with the module-level ``model`` and return the result."""
    query_vector = model.encode(query)
    return query_vector

def find_similar(query, vector_store, model, top_n=5):
    """Return the stored documents most similar to ``query``.

    Args:
        query: Text to search for.
        vector_store: Mapping with 'documents', 'embeddings' and 'metadatas'
            keys; ``embeddings[i]`` is the vector for ``documents[i]``.
        model: Encoder exposing ``encode(text)``; used to embed the query.
        top_n: Maximum number of documents to return.

    Returns:
        Up to ``top_n`` document strings ordered from most to least similar
        (cosine similarity). An empty list when ``top_n`` is not positive or
        the store has no embeddings. Any error is printed and also results in
        an empty list.
    """
    try:
        # Ensure necessary keys are present in vector_store
        required_keys = ('documents', 'embeddings', 'metadatas')
        if not all(key in vector_store for key in required_keys):
            raise ValueError("Vector store must contain 'documents', 'embeddings', and 'metadatas' keys.")

        embeddings = vector_store['embeddings']

        # `[-0:]` selects the whole array, so a non-positive top_n must be
        # handled before slicing; an empty store has nothing to rank.
        if top_n <= 0 or len(embeddings) == 0:
            return []

        # Encode the query with the model that was passed in
        query_embedding = model.encode(query)

        # Explicit check instead of `assert`, which is skipped under `python -O`
        if len(query_embedding) != len(embeddings[0]):
            raise ValueError("Embedding dimensions do not match.")

        # Compute similarity of the query against every stored embedding
        similarities = cosine_similarity([query_embedding], embeddings)[0]

        # Indices of the top N most similar documents, best first
        similar_indices = np.argsort(similarities)[-top_n:][::-1]

        return [vector_store['documents'][i] for i in similar_indices]

    except Exception as e:
        print(f"An error occurred: {e}")
        return []

def generate_answer_gpt4(relevant_documents, question,
                         model_name="gpt-4o-mini-2024-07-18",
                         max_tokens=150, temperature=0.7):
    """Ask an OpenAI chat model to answer ``question`` from the given documents.

    Args:
        relevant_documents: Iterable of document strings used as context.
        question: The user's question.
        model_name: Chat model identifier sent to the API.
        max_tokens: Upper bound on the length of the generated answer.
        temperature: Sampling temperature sent to the API.

    Returns:
        The content of the first returned choice, stripped of surrounding
        whitespace.
    """
    # Combine relevant documents into a single context
    context = "\n\n".join(relevant_documents)

    # Create a prompt for OpenAI
    prompt = f"Based on the following documents, answer the question:\n\n{context}\n\nQuestion: {question}\nAnswer:"

    # Generate the response from OpenAI using the chat endpoint
    response = openai.ChatCompletion.create(
        model=model_name,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=max_tokens,
        temperature=temperature
    )

    return response.choices[0].message['content'].strip()


def answer_question(query, vector_store, model, top_n=1):
    """Retrieve the ``top_n`` closest documents for ``query`` and return the
    answer generated from them."""
    context_docs = find_similar(query, vector_store, model, top_n)
    return generate_answer_gpt4(context_docs, query)

In [None]:
# Example usage: a single question run through retrieval and answer generation
query = "What are the visa requirements for international students?"

# Get the answer (answer_question's default top_n=1 is used) and show it
answer = answer_question(query, vector_store, model)
print(answer)

In [None]:
# def main():
#    while True:
#         # Prompt user for input
#         query = input("Enter your query (or type 'exit' to quit): ")
        
#         if query.lower() == 'exit':
#             print("Exiting the program.")
#             break
        
#         # Get the answer
#         answer = answer_question(query, vector_store, model)
#         print(f"Answer: {answer}")

# if __name__ == "__main__":
#     main()

### RGU CHAT

In [None]:
# Initialize the SentenceTransformer model (rebinds the module-level `model`
# that the embedding helpers in this cell read).
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')


def generate_embeddings(docs):
    """Embed every document with the module-level ``model``.

    Args:
        docs: Iterable of text chunks.

    Returns:
        A list with one embedding per document, in input order. Empty when
        ``docs`` is empty.
    """
    docs = list(docs)
    if not docs:
        return []
    # A single batched encode call replaces one forward pass per document;
    # iterating the result yields one vector per input, as before.
    return list(model.encode(docs))

def generate_query_embedding(query):
    """Encode ``query`` with the module-level ``model`` and return the result."""
    encoded_query = model.encode(query)
    return encoded_query

def find_similar(query, vector_store, model, top_n=1):
    """Return the stored documents most similar to ``query``.

    Args:
        query: Text to search for.
        vector_store: Mapping with 'documents', 'embeddings' and 'metadatas'
            keys; ``embeddings[i]`` is the vector for ``documents[i]``.
        model: Encoder exposing ``encode(text)``; used to embed the query.
        top_n: Maximum number of documents to return.

    Returns:
        Up to ``top_n`` document strings ordered from most to least similar
        (cosine similarity). An empty list when ``top_n`` is not positive or
        the store has no embeddings. Any error is printed and also results in
        an empty list.
    """
    try:
        # Ensure necessary keys are present in vector_store
        required_keys = ('documents', 'embeddings', 'metadatas')
        if not all(key in vector_store for key in required_keys):
            raise ValueError("Vector store must contain 'documents', 'embeddings', and 'metadatas' keys.")

        embeddings = vector_store['embeddings']

        # `[-0:]` selects the whole array, so a non-positive top_n must be
        # handled before slicing; an empty store has nothing to rank.
        if top_n <= 0 or len(embeddings) == 0:
            return []

        # Encode the query with the model that was passed in
        query_embedding = model.encode(query)

        # Explicit check instead of `assert`, which is skipped under `python -O`
        if len(query_embedding) != len(embeddings[0]):
            raise ValueError("Embedding dimensions do not match.")

        # Compute similarity of the query against every stored embedding
        similarities = cosine_similarity([query_embedding], embeddings)[0]

        # Indices of the top N most similar documents, best first
        similar_indices = np.argsort(similarities)[-top_n:][::-1]

        return [vector_store['documents'][i] for i in similar_indices]

    except Exception as e:
        print(f"An error occurred: {e}")
        return []

def generate_answer_gpt4(relevant_documents, question,
                         model_name="gpt-4o-mini",
                         max_tokens=150, temperature=0.7):
    """Ask an OpenAI chat model to answer ``question`` from the given documents.

    Args:
        relevant_documents: Iterable of document strings used as context.
        question: The user's question.
        model_name: Chat model identifier sent to the API.
        max_tokens: Upper bound on the length of the generated answer.
        temperature: Sampling temperature sent to the API.

    Returns:
        The content of the first returned choice, stripped of surrounding
        whitespace.
    """
    # Combine relevant documents into a single context
    context = "\n\n".join(relevant_documents)

    # Create a prompt for OpenAI
    prompt = f"Based on the following documents, answer the question:\n\n{context}\n\nQuestion: {question}\nAnswer:"

    # Generate the response from OpenAI using the chat endpoint
    response = openai.ChatCompletion.create(
        model=model_name,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=max_tokens,
        temperature=temperature
    )

    return response.choices[0].message['content'].strip()

def answer_question(query, vector_store, model, top_n=1):
    """Answer ``query``, replying with a fixed welcome message to greetings.

    Args:
        query: The user's text.
        vector_store: Store passed through to ``find_similar``.
        model: Encoder passed through to ``find_similar``.
        top_n: Number of documents to retrieve as context.

    Returns:
        The welcome message when the query contains a greeting word,
        otherwise the answer generated from the retrieved documents.
    """
    # Define custom responses for specific queries
    greetings = {"hello", "hi", "greetings", "hey", "welcome"}

    # Match whole words only: a substring test treats "which"/"this" as "hi"
    # and "they" as "hey", so ordinary questions got the canned greeting.
    words = ''.join(ch if ch.isalnum() else ' ' for ch in query.lower()).split()
    if any(word in greetings for word in words):
        return "Welcome to Robert Gordon University! How can I assist you today?"

    # Find similar documents
    similar_docs = find_similar(query, vector_store, model, top_n)

    # Generate and return an answer
    return generate_answer_gpt4(similar_docs, query)

In [None]:
def main():
    """Run an interactive question/answer loop on the console.

    Reads queries with ``input`` until the user types 'exit' (case and
    surrounding whitespace ignored), the input stream ends, or the user
    interrupts. Blank input is ignored. Each query is answered with
    ``answer_question`` using the module-level ``vector_store`` and ``model``.
    """
    while True:
        try:
            # Prompt user for input
            query = input("Enter your query (or type 'exit' to quit): ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupt ends the session like 'exit' does,
            # instead of surfacing a traceback.
            print("Exiting the program.")
            break

        # Nothing to answer; ask again rather than sending an empty prompt
        if not query:
            continue

        if query.lower() == 'exit':
            print("Exiting the program.")
            break

        # Get the answer
        answer = answer_question(query, vector_store, model)
        print(f"Answer: {answer}")

# Start the interactive loop only when this code runs as the main module
if __name__ == "__main__":
    main()
