In [1]:
# IMPORTS
import time
import psutil
import GPUtil
from docx import Document
import pandas as pd
from sentence_transformers import SentenceTransformer
import torch.nn.functional as F
from torch import Tensor
from transformers import AutoTokenizer, AutoModel

# Load data and preprocess
docx_file_path = 'NewFacultyOrientationResearchFAQs2021.docx'
txt_file_path = 'EditedNewFacultyOrientationResearch.txt'

# Create a text file from the Word document
doc = Document(docx_file_path)
with open(txt_file_path, 'w', encoding='utf-8') as txt_file:
    for paragraph in doc.paragraphs:
        txt_file.write(paragraph.text + '\n')

# Making lists for the different entries
questions = []
answers = []
tags = []

# Read the text file and iterate through non-empty lines
with open(txt_file_path, 'r', encoding='utf-8') as txt_file:
    lines = [line.strip() for line in txt_file.readlines() if line.strip()]
    
    # Iterate through lines in sets of 3
    for i in range(0, len(lines), 3):
        # Check if there are enough lines left to extract a set
        if i + 2 < len(lines):
            questions.append(lines[i])
            answers.append(lines[i + 1])
            tags.append(lines[i + 2])
        else:
            # Not enough lines, skip
            print(f"Skipping incomplete set at index {i}.")

# Load pre-trained models
larger_model_name = "thenlper/gte-base"
larger_model_tokenizer = AutoTokenizer.from_pretrained(larger_model_name)
larger_model = AutoModel.from_pretrained(larger_model_name)

# Define functions for tag assignment and question selection
def assign_tags(question):
    if 'proposal' in question.lower():
        return 'proposal'
    elif 'routing sheet' in question.lower():
        return 'routing sheet'
    elif 'deadlines' in question.lower():
        return 'deadlines'
    elif 'research admin' in question.lower():
        return 'research admin'
    elif 'sponsored projects' in question.lower():
        return 'sponsored projects'
    elif 'bind' in question.lower():
        return 'bind'
    elif 'non-disclosure agreement (NDA)' in question.lower():
        return 'non-disclosure agreement (NDA)'
    else:
        return 'other'

def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

# Function to get GPU information
def get_gpu_info():
    try:
        gpu_info = GPUtil.getGPUs()[0]
        return f"GPU Usage: {gpu_info.load * 100:.2f}%"
    except Exception as e:
        return f"Error getting GPU information: {e}"

# Function to get CPU information
def get_cpu_info():
    cpu_info = psutil.cpu_percent(interval=1)
    return f"CPU Usage: {cpu_info:.2f}%"

# Function to get memory information
def get_memory_info():
    memory_info = psutil.virtual_memory().percent
    return f"Memory Usage: {memory_info:.2f}%"
# User Interface with Feedback Loop
def main():
    while True:
        user_question = input("Enter your question: ")

        # Record the start time
        start_time = time.time()

        # Tokenize user question
        user_input_dict = larger_model_tokenizer([user_question], max_length=512, padding=True, truncation=True, return_tensors='pt')
        user_embedding = average_pool(larger_model(**user_input_dict).last_hidden_state, user_input_dict['attention_mask'])

        # Record the end time
        end_time = time.time()

        # Calculate elapsed time
        elapsed_time = end_time - start_time

        # Print elapsed time
        print(f"Time Elapsed for Embedding: {elapsed_time:.4f} seconds")

        # Get GPU usage, CPU usage, and memory usage
        gpu_info = get_gpu_info()
        cpu_info = get_cpu_info()
        memory_info = get_memory_info()

        # Print resource usage information
        print(gpu_info)
        print(cpu_info)
        print(memory_info)

        # Tokenize all questions
        all_input_dict = larger_model_tokenizer(questions, max_length=512, padding=True, truncation=True, return_tensors='pt')
        all_embeddings = average_pool(larger_model(**all_input_dict).last_hidden_state, all_input_dict['attention_mask'])

        # Calculate similarity scores
        scores = (user_embedding @ all_embeddings.T).squeeze() * 100

        # Sort questions based on similarity scores
        sorted_indices = scores.argsort(descending=True)
        sorted_questions = [questions[i] for i in sorted_indices]

        # Display top 3 related questions using the larger model
        print("Top 3 Related Questions (larger model):")
        for i, question in enumerate(sorted_questions[:3]):
            print(f"{i+1}. {question}")

        # User selects a question
        selected_index_larger = int(input("Enter the number of the question you want to choose (larger model): ")) - 1

        # Display answer for the selected question using the larger model
        model_answer = answers[selected_index_larger]
        print(f"Answer (larger model): {model_answer}")

        # Ask for user feedback
        feedback = input("Is the answer correct? (yes/no): ")

        # If the user provides feedback, update the model
        if feedback.lower() == "no":
            new_answer = input("Please provide the correct answer: ")
            # Update the answers list with the correct answer
            answers[selected_index_larger] = new_answer

        # Ask if the user wants to continue
        continue_option = input("Do you want to ask another question? (yes/no): ")
        if continue_option.lower() != "yes":
            break

if __name__ == "__main__":
    main()


  from .autonotebook import tqdm as notebook_tqdm


Skipping incomplete set at index 39.
Enter your question: routing sheet
Time Elapsed for Embedding: 0.5738 seconds


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Error getting GPU information: list index out of range
CPU Usage: 28.70%
Memory Usage: 83.60%
Top 3 Related Questions (larger model):
1. Question: How soon should I start my Cayuse routing sheet?
2. Question: What are IIT’s internal deadlines?
3. Question: Where can I find more information on how to prepare my proposal?
Enter the number of the question you want to choose (larger model): 1
Answer (larger model): Answer: Please start a routing sheet (SP proposal) in IIT’s grants system, Cayuse.
Is the answer correct? (yes/no): no
Please provide the correct answer: Please start the first page, General tab, of the Cayuse routing sheet as soon as you know you are going to submit a proposal. OSRP prefers that you start your routing sheet at least 3 weeks prior to the deadline.
Do you want to ask another question? (yes/no): yes
Enter your question: deadlines
Time Elapsed for Embedding: 0.7833 seconds


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Error getting GPU information: list index out of range
CPU Usage: 13.70%
Memory Usage: 85.00%
Top 3 Related Questions (larger model):
1. Question: What are IIT’s internal deadlines?
2. Question: How soon should I start my Cayuse routing sheet?
3. Question: I would like to submit a proposal, how do I get started?
Enter the number of the question you want to choose (larger model): 1
Answer (larger model): Please start the first page, General tab, of the Cayuse routing sheet as soon as you know you are going to submit a proposal. OSRP prefers that you start your routing sheet at least 3 weeks prior to the deadline.
Is the answer correct? (yes/no): no
Please provide the correct answer: The administrative portions of the proposal (budget; budget justification; current and pending support (started by OSRP); biographical sketch; facilities, equipment and other resources; letters of collaboration, and solicitation specific administrative documents)  are due to the assigned OSRP research admini

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Error getting GPU information: list index out of range
CPU Usage: 30.80%
Memory Usage: 82.40%
Top 3 Related Questions (larger model):
1. Question: What are IIT’s internal deadlines?
2. Question: I am submitting a proposal to a foundation that does not allow indirect costs, or limited indirect costs to a rate lower than the IIT established rates, what do I do?
3. Question: How soon should I start my Cayuse routing sheet?
Enter the number of the question you want to choose (larger model): 1
Answer (larger model): The administrative portions of the proposal (budget; budget justification; current and pending support (started by OSRP); biographical sketch; facilities, equipment and other resources; letters of collaboration, and solicitation specific administrative documents)  are due to the assigned OSRP research administrator no later than 5 business days prior to the deadline. All proposal components, including all technical portions (project abstract, project narrative, and references) a