In [110]:
import os
import openai
import requests
import numpy as np
import xmltodict
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from openai.embeddings_utils import get_embedding
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import CharacterTextSplitter
from sklearn.metrics.pairwise import cosine_similarity

In [124]:
# Load environment variables from .env file
load_dotenv()

# Set up OpenAI API key: read it from the environment and hand it to the openai module.
# NOTE(review): os.getenv returns None when OPENAI_API_KEY is unset and nothing here
# checks for that, so a missing key is not reported by this cell.
api_key = os.getenv("OPENAI_API_KEY")
openai.api_key = api_key


In [2]:
def extract_text_from(url, timeout=30):
    """Download a web page and return its text with blank lines removed.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on an unresponsive host.

    Returns:
        The text extracted from the HTML, each line stripped of surrounding
        whitespace, empty lines dropped, joined with newlines.

    Raises:
        requests.RequestException: On a network failure, a timeout, or a
            4xx/5xx response.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error responses instead of returning the text of an
    # error page as if it were real content.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, features="html.parser")
    stripped_lines = (line.strip() for line in soup.get_text().splitlines())
    return '\n'.join(line for line in stripped_lines if line)

def fetch_sitemap(url, timeout=30):
    """Download an XML sitemap and parse it into nested dictionaries.

    Args:
        url: Address of the sitemap.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on an unresponsive host.

    Returns:
        The structure produced by ``xmltodict.parse`` for the response body.

    Raises:
        requests.RequestException: On a network failure, a timeout, or a
            4xx/5xx response.
    """
    response = requests.get(url, timeout=timeout)
    # Stop here on an error response; otherwise the body of an error page
    # would be handed to the XML parser.
    response.raise_for_status()
    return xmltodict.parse(response.text)

def get_relevant_pages(sitemap, keyword):
    """Fetch the text of every sitemap page whose URL contains ``keyword``.

    Args:
        sitemap: Parsed sitemap (as returned by ``fetch_sitemap``): a mapping
            whose ``'urlset'`` entry holds the ``'url'`` records.
        keyword: Substring a URL must contain for the page to be fetched.

    Returns:
        A list of ``{'text': ..., 'source': ...}`` dicts, one per matching
        URL, in sitemap order. Empty when the sitemap has no entries or no
        URL matches.

    Raises:
        KeyError: If ``sitemap`` has no ``'urlset'`` entry.
    """
    # An empty <urlset/> parses to None rather than a mapping.
    urlset = sitemap['urlset'] or {}
    entries = urlset.get('url') or []
    # xmltodict yields a single dict (not a list) when the sitemap holds
    # exactly one <url> element; normalise so the loop always sees records.
    if isinstance(entries, dict):
        entries = [entries]

    pages = []
    for info in entries:
        url = info['loc']
        if keyword in url:
            pages.append({'text': extract_text_from(url), 'source': url})
    return pages


In [3]:
# Example usage: download the sitemap, then fetch the text of every page
# whose URL contains 'international-students'.
sitemap_url = "https://www.rgu.ac.uk/index.php?option=com_jmap&view=sitemap&format=xml"
sitemap = fetch_sitemap(sitemap_url)
pages = get_relevant_pages(sitemap, 'international-students')

In [108]:
# pages[0]

In [109]:
def preprocess_text(pages, chunk_size=1500, separator="\n"):
    """Split each page's text into chunks and pair every chunk with its source.

    Args:
        pages: Iterable of dicts with ``'text'`` and ``'source'`` keys.
        chunk_size: Chunk size handed to ``CharacterTextSplitter``.
        separator: String the splitter breaks the text on.

    Returns:
        ``(docs, metadatas)``: two parallel lists where ``metadatas[i]`` is a
        ``{"source": ...}`` dict naming the page that ``docs[i]`` came from.
    """
    text_splitter = CharacterTextSplitter(chunk_size=chunk_size, separator=separator)
    docs, metadatas = [], []
    for page in pages:
        splits = text_splitter.split_text(page['text'])
        docs.extend(splits)
        # Build one dict per chunk: ``[d] * n`` would repeat a reference to a
        # single dict, so editing one chunk's metadata would change them all.
        metadatas.extend({"source": page['source']} for _ in splits)
    return docs, metadatas

# Example usage: chunk the fetched pages; docs[i] and metadatas[i] describe the same chunk
docs, metadatas = preprocess_text(pages)

In [106]:
# docs

In [128]:
def generate_embeddings(docs, model_name='paraphrase-MiniLM-L6-v2'):
    """Embed a collection of text chunks with a SentenceTransformer model.

    Args:
        docs: Iterable of strings to embed.
        model_name: Name of the SentenceTransformer checkpoint to load.

    Returns:
        A list holding one embedding vector per document, in input order.
        Empty input yields an empty list without loading the model.
    """
    docs = list(docs)
    if not docs:
        return []
    model = SentenceTransformer(model_name)
    # Encode the whole list in one call so the library can batch the work
    # instead of running a separate pass per document.
    return list(model.encode(docs))

# Embed every chunk; embeddings[i] is the vector for docs[i]
embeddings = generate_embeddings(docs)


In [131]:
# In-memory store used for retrieval: the three lists are parallel, i.e. entry i of
# "documents", "embeddings" and "metadatas" all describe the same text chunk.
vector_store = {"documents": docs, "embeddings": embeddings, "metadatas": metadatas}

In [165]:
# Initialize the SentenceTransformer model once at notebook level; the helper
# functions defined below read this global `model` to encode text.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

def generate_embeddings(docs):
    """Embed a collection of text chunks with the notebook-level ``model``.

    Args:
        docs: Iterable of strings to embed.

    Returns:
        A list holding one embedding vector per document, in input order.
        Empty input yields an empty list without calling the model.
    """
    docs = list(docs)
    if not docs:
        return []
    # Encode the whole list in one call so the library can batch the work
    # instead of running a separate pass per document.
    return list(model.encode(docs))

def generate_query_embedding(query):
    """Encode ``query`` with the notebook-level ``model`` and return the result."""
    query_vector = model.encode(query)
    return query_vector

def find_similar(query, vector_store, model, top_n=5):
    """Return the ``top_n`` stored documents most similar to ``query``.

    Args:
        query: Text to search for.
        vector_store: Mapping with ``'documents'``, ``'embeddings'`` and
            ``'metadatas'`` entries; documents and embeddings are parallel.
        model: Encoder exposing ``encode(text)``, used to embed the query.
            When ``None``, the notebook-level ``generate_query_embedding``
            helper is used instead.
        top_n: Maximum number of documents to return.

    Returns:
        Documents ordered from most to least similar by cosine similarity.
        An empty list is returned when ``top_n`` is not positive, when the
        store holds no embeddings, or when any error occurs (the error is
        printed rather than raised).
    """
    try:
        # Ensure necessary keys are present in vector_store
        required_keys = ('documents', 'embeddings', 'metadatas')
        if not all(key in vector_store for key in required_keys):
            raise ValueError("Vector store must contain 'documents', 'embeddings', and 'metadatas' keys.")

        stored_embeddings = vector_store['embeddings']

        # A slice of [-0:] selects *every* element, so a request for zero
        # (or fewer) documents, or an empty store, is answered up front.
        if top_n <= 0 or len(stored_embeddings) == 0:
            return []

        # Encode the query with the model the caller passed in
        if model is not None:
            query_embedding = model.encode(query)
        else:
            query_embedding = generate_query_embedding(query)

        # Explicit check rather than `assert`, which is stripped under `python -O`
        if len(query_embedding) != len(stored_embeddings[0]):
            raise ValueError("Embedding dimensions do not match.")

        # Compute similarity of the query against every stored embedding
        similarities = cosine_similarity([query_embedding], stored_embeddings)[0]

        # Indices of the top N scores, highest first
        similar_indices = np.argsort(similarities)[-top_n:][::-1]

        # Collect the most similar documents
        return [vector_store['documents'][i] for i in similar_indices]

    except Exception as e:
        # Best-effort contract: report the problem and return no documents.
        print(f"An error occurred: {e}")
        return []

def generate_answer_gpt4(relevant_documents, question, model_name="gpt-4o-mini-2024-07-18",
                         max_tokens=150, temperature=0.7):
    """Ask the OpenAI chat endpoint to answer ``question`` from the given documents.

    Args:
        relevant_documents: List of text chunks to use as context.
        question: The user's question.
        model_name: Chat model identifier sent to the API.
        max_tokens: Upper bound on the length of the generated answer.
        temperature: Sampling temperature sent to the API.

    Returns:
        The answer text with surrounding whitespace removed. When no
        documents are supplied, a fixed "no information" message is returned
        and the API is not called.
    """
    if not relevant_documents:
        # With no context the model could only guess; say so instead of
        # spending an API call on an ungrounded answer.
        return "I couldn't find any relevant information to answer that question."

    # Combine relevant documents into a single context
    context = "\n\n".join(relevant_documents)

    # Create a prompt for OpenAI
    prompt = f"Based on the following documents, answer the question:\n\n{context}\n\nQuestion: {question}\nAnswer:"

    # Generate the response from OpenAI using the chat endpoint
    response = openai.ChatCompletion.create(
        model=model_name,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=max_tokens,
        temperature=temperature
    )

    return response.choices[0].message['content'].strip()


def answer_question(query, vector_store, model, top_n=1):
    """Retrieve the documents closest to ``query`` and return the chat model's answer."""
    # Retrieval step: pick the top_n most similar stored documents
    context_docs = find_similar(query, vector_store, model, top_n)
    # Generation step: let the chat model answer from that context
    return generate_answer_gpt4(context_docs, query)

In [166]:
# Example usage
query = "What are the visa requirements for international students?"

# Get the answer
answer = answer_question(query, vector_store, model)
print(answer)

International students applying for a Graduate visa in the UK do not have to meet specific financial requirements or demonstrate a certain amount of money, nor do they need to provide evidence of English language proficiency, as these were already satisfied during their previous Tier 4 or Student visa application. Additionally, after the initial 2-year period on the Graduate visa, students have the option to switch to a Skilled Worker visa if they secure a job that meets the necessary criteria for that visa category.


In [168]:
# def main():
#    while True:
#         # Prompt user for input
#         query = input("Enter your query (or type 'exit' to quit): ")
        
#         if query.lower() == 'exit':
#             print("Exiting the program.")
#             break
        
#         # Get the answer
#         answer = answer_question(query, vector_store, model)
#         print(f"Answer: {answer}")

# if __name__ == "__main__":
#     main()

### RGU CHAT

In [173]:
# Initialize the SentenceTransformer model once at notebook level; the helper
# functions defined below read this global `model` to encode text.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')


def generate_embeddings(docs):
    """Embed a collection of text chunks with the notebook-level ``model``.

    Args:
        docs: Iterable of strings to embed.

    Returns:
        A list holding one embedding vector per document, in input order.
        Empty input yields an empty list without calling the model.
    """
    docs = list(docs)
    if not docs:
        return []
    # Encode the whole list in one call so the library can batch the work
    # instead of running a separate pass per document.
    return list(model.encode(docs))

def generate_query_embedding(query):
    """Encode ``query`` with the notebook-level ``model`` and return the result."""
    query_vector = model.encode(query)
    return query_vector

def find_similar(query, vector_store, model, top_n=1):
    """Return the ``top_n`` stored documents most similar to ``query``.

    Args:
        query: Text to search for.
        vector_store: Mapping with ``'documents'``, ``'embeddings'`` and
            ``'metadatas'`` entries; documents and embeddings are parallel.
        model: Encoder exposing ``encode(text)``, used to embed the query.
            When ``None``, the notebook-level ``generate_query_embedding``
            helper is used instead.
        top_n: Maximum number of documents to return.

    Returns:
        Documents ordered from most to least similar by cosine similarity.
        An empty list is returned when ``top_n`` is not positive, when the
        store holds no embeddings, or when any error occurs (the error is
        printed rather than raised).
    """
    try:
        # Ensure necessary keys are present in vector_store
        required_keys = ('documents', 'embeddings', 'metadatas')
        if not all(key in vector_store for key in required_keys):
            raise ValueError("Vector store must contain 'documents', 'embeddings', and 'metadatas' keys.")

        stored_embeddings = vector_store['embeddings']

        # A slice of [-0:] selects *every* element, so a request for zero
        # (or fewer) documents, or an empty store, is answered up front.
        if top_n <= 0 or len(stored_embeddings) == 0:
            return []

        # Encode the query with the model the caller passed in
        if model is not None:
            query_embedding = model.encode(query)
        else:
            query_embedding = generate_query_embedding(query)

        # Explicit check rather than `assert`, which is stripped under `python -O`
        if len(query_embedding) != len(stored_embeddings[0]):
            raise ValueError("Embedding dimensions do not match.")

        # Compute similarity of the query against every stored embedding
        similarities = cosine_similarity([query_embedding], stored_embeddings)[0]

        # Indices of the top N scores, highest first
        similar_indices = np.argsort(similarities)[-top_n:][::-1]

        # Collect the most similar documents
        return [vector_store['documents'][i] for i in similar_indices]

    except Exception as e:
        # Best-effort contract: report the problem and return no documents.
        print(f"An error occurred: {e}")
        return []

def generate_answer_gpt4(relevant_documents, question, model_name="gpt-4o-mini",
                         max_tokens=150, temperature=0.7):
    """Ask the OpenAI chat endpoint to answer ``question`` from the given documents.

    Args:
        relevant_documents: List of text chunks to use as context.
        question: The user's question.
        model_name: Chat model identifier sent to the API.
        max_tokens: Upper bound on the length of the generated answer.
        temperature: Sampling temperature sent to the API.

    Returns:
        The answer text with surrounding whitespace removed. When no
        documents are supplied, a fixed "no information" message is returned
        and the API is not called.
    """
    if not relevant_documents:
        # With no context the model could only guess; say so instead of
        # spending an API call on an ungrounded answer.
        return "I couldn't find any relevant information to answer that question."

    # Combine relevant documents into a single context
    context = "\n\n".join(relevant_documents)

    # Create a prompt for OpenAI
    prompt = f"Based on the following documents, answer the question:\n\n{context}\n\nQuestion: {question}\nAnswer:"

    # Generate the response from OpenAI using the chat endpoint
    response = openai.ChatCompletion.create(
        model=model_name,  # Using the chat model
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=max_tokens,
        temperature=temperature
    )

    return response.choices[0].message['content'].strip()

def answer_question(query, vector_store, model, top_n=1):
    """Answer a user query, replying with a fixed welcome to plain greetings.

    A message counts as a greeting only when every word in it is a greeting
    word ("hello", "Hi!", "hey hey", "hellooo"). Anything else, including a
    greeting followed by a real question, goes through retrieval and the
    chat model.

    Args:
        query: The user's message.
        vector_store: Store passed through to ``find_similar``.
        model: Encoder passed through to ``find_similar``.
        top_n: Number of documents to retrieve as context.

    Returns:
        The welcome message for a greeting, otherwise the generated answer.
    """
    # Define custom responses for specific queries
    greetings = {"hello", "hi", "greetings", "hey", "welcome"}

    # Split into alphabetic words so punctuation ("Hi!") does not matter.
    words = ''.join(ch if ch.isalpha() else ' ' for ch in query.lower()).split()
    # Collapse a stretched final letter ("hellooo" -> "hello") before comparing.
    normalised = [word.rstrip(word[-1]) + word[-1] for word in words]
    # Whole-word comparison: a substring test would treat "which", "this" or
    # "scholarship" as greetings because each of them contains "hi".
    if normalised and all(word in greetings for word in normalised):
        return "Welcome to Robert Gordon University! How can I assist you today?"

    # Find similar documents
    similar_docs = find_similar(query, vector_store, model, top_n)

    # Generate and return an answer
    return generate_answer_gpt4(similar_docs, query)

In [174]:
def main():
    """Run an interactive question/answer loop on standard input.

    Each entered line is answered with ``answer_question`` using the
    notebook-level ``vector_store`` and ``model``. The loop ends when the
    user types ``exit`` (any letter case, surrounding whitespace ignored),
    when the input stream is closed, or when the prompt is interrupted.
    Blank lines are ignored.
    """
    while True:
        try:
            # Prompt user for input
            query = input("Enter your query (or type 'exit' to quit): ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted prompt: leave cleanly instead of
            # ending the session with a traceback.
            print("Exiting the program.")
            break

        if query.lower() == 'exit':
            print("Exiting the program.")
            break

        # Nothing to answer: skip retrieval and the API call, prompt again.
        if not query:
            continue

        # Get the answer
        answer = answer_question(query, vector_store, model)
        print(f"Answer: {answer}")

# Start the interactive loop when this code is executed as the main module
# (the captured output below this cell shows the loop running when the cell was executed).
if __name__ == "__main__":
    main()


Enter your query (or type 'exit' to quit):  hello


Answer: Welcome to Robert Gordon University! How can I assist you today?


Enter your query (or type 'exit' to quit):  hellooo


Answer: Welcome to Robert Gordon University! How can I assist you today?


Enter your query (or type 'exit' to quit):  hi


Answer: Welcome to Robert Gordon University! How can I assist you today?


Enter your query (or type 'exit' to quit):  whatsaup


Answer: It seems like you're asking for a casual greeting or an informal update. If you're looking for information or have a specific question about the documents or related topics, feel free to ask!


Enter your query (or type 'exit' to quit):  clear


Answer: It seems that you have provided a prompt, but I need more context or specific documents to generate a relevant answer. Could you please provide additional details or clarify your question?


Enter your query (or type 'exit' to quit):  sponsor


Answer: A sponsor is an individual or entity that provides financial support for a person's course fees and living costs, particularly in the context of visa applications. This can include official financial sponsors such as the UK government, your national government, the British Council, international organizations, international companies, universities, and independent schools. To demonstrate sponsorship, you need a letter from the sponsor that includes your name, their contact details, the length of the sponsorship, and the amount of money they will contribute, or a statement confirming that all fees and costs will be covered. This letter must be dated within the last six months when applying for a visa.


Enter your query (or type 'exit' to quit):  exit


Exiting the program.
