In [None]:
import os
import re

import openai
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Group paper summaries by the cosine similarity of their TF-IDF-weighted keywords.


def read_files(file_paths):
    """Concatenate the contents of several text files into one string.

    Args:
        file_paths: Iterable of file paths to read, in order.

    Returns:
        One string holding the contents of every readable file, each followed
        by a newline. A file that cannot be read is reported on stdout and
        skipped, so the result is an empty string when nothing could be read.
    """
    contents = []
    for file_path in file_paths:
        try:
            # Decode explicitly as UTF-8 (as the summary scan in this notebook
            # does) so the result does not depend on the platform's default
            # locale encoding.
            with open(file_path, "r", encoding="utf-8") as file:
                contents.append(file.read())
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole batch.
            print(f"Error reading {file_path}: {e}")
    return "".join(text + "\n" for text in contents)


def extract_keywords_from_text(text):
    """Pull the quoted entries out of the first ``"KEYWORDS": [...]`` list.

    Args:
        text: Raw contents of a summary file.

    Returns:
        The double-quoted strings found inside the first ``"KEYWORDS"``
        bracket list, in order. An empty list is returned when there is no
        such section or when the search raises (the error is printed).
    """
    section_pattern = r'"KEYWORDS":\s*\[(.*?)\]'
    quoted_item_pattern = r'"(.*?)"'
    try:
        # DOTALL lets the bracketed list span several lines.
        section = re.search(section_pattern, text, re.DOTALL)
        if section is None:
            return []
        return re.findall(quoted_item_pattern, section.group(1))
    except Exception as e:
        print(f"Error extracting keywords: {e}")
        return []


def preprocess_texts(folder_path):
    """Collect the keyword text of every ``.summary`` file in a folder.

    Args:
        folder_path: Directory to scan (only its direct entries are examined).

    Returns:
        A ``(file_texts, file_paths)`` pair of parallel lists, ordered by file
        name. ``file_texts[i]`` is the space-joined keyword list extracted
        from ``file_paths[i]``; it is an empty string when no keywords were
        found. Files that cannot be read or decoded are reported on stdout
        and left out of both lists.
    """
    file_texts = []
    file_paths = []

    # os.listdir returns names in arbitrary order; sort them so that the
    # order-dependent grouping built on this list is reproducible.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".summary"):
            continue
        file_path = os.path.join(folder_path, filename)
        try:
            with open(file_path, "r", encoding="utf-8") as file:
                content = file.read()
        except (OSError, UnicodeDecodeError) as e:
            # A single unreadable or mis-encoded file should not stop the scan.
            print(f"Error reading {file_path}: {e}")
            continue
        keywords = extract_keywords_from_text(content)
        file_texts.append(" ".join(keywords))
        file_paths.append(file_path)

    return file_texts, file_paths


def cluster_similar_papers(folder_path, similarity_threshold=0.25, max_group_size=3):
    """Greedily group paper summaries whose keyword lists are similar.

    Papers are visited in the order returned by ``preprocess_texts``. A paper
    joins the first existing group that contains at least one member whose
    TF-IDF cosine similarity to it is strictly above ``similarity_threshold``;
    otherwise it starts a new group. Groups are then cut into consecutive
    chunks of at most ``max_group_size`` papers.

    Args:
        folder_path: Directory containing the ``.summary`` files.
        similarity_threshold: Minimum (exclusive) cosine similarity for a
            paper to join a group.
        max_group_size: Largest number of papers allowed in a returned group.

    Returns:
        A list of groups; each group is a list of ``(keyword_text, file_path)``
        tuples. Empty when the folder holds no summary files.

    Raises:
        ValueError: If ``max_group_size`` is smaller than 1.
    """
    if max_group_size < 1:
        raise ValueError("max_group_size must be at least 1")

    file_texts, file_paths = preprocess_texts(folder_path)
    if not file_texts:
        # Nothing to cluster; fitting the vectorizer on an empty corpus raises.
        return []

    papers = list(zip(file_texts, file_paths))

    # Vectorize the texts using TF-IDF
    vectorizer = TfidfVectorizer()
    try:
        X = vectorizer.fit_transform(file_texts)
    except ValueError:
        # The vectorizer found no usable token in any document (for example,
        # no keywords were extracted), so no two papers can be compared:
        # every paper becomes its own group.
        return [[paper] for paper in papers]

    # Each group is a list of row indices into X (and into `papers`). Reusing
    # the rows of X avoids re-vectorizing the group's texts on every check.
    groups = []
    for i in range(len(papers)):
        for members in groups:
            similarities = cosine_similarity(X[i], X[members])
            if (similarities > similarity_threshold).any():
                members.append(i)
                break
        else:
            # No existing group was similar enough.
            groups.append([i])

    # Split large groups into smaller groups of at most max_group_size papers
    new_groups = []
    for members in groups:
        for start in range(0, len(members), max_group_size):
            chunk = members[start:start + max_group_size]
            new_groups.append([papers[j] for j in chunk])

    return new_groups


# Path to the folder containing the summary files. Set the PAPER_SUMMARY_DIR
# environment variable to point somewhere else; the default is the original
# author's local checkout.
folder_path = os.environ.get(
    "PAPER_SUMMARY_DIR",
    "/Users/bayardwalsh/Desktop/CMSC 35350/cmsc35350_final_project/eth/papers",
)

# Get similar paper groups
similar_groups = cluster_similar_papers(folder_path)

# Print the similar groups, one block of file paths per group
for group_id, group in enumerate(similar_groups):
    print(f"Group {group_id + 1}:")
    for _, file_path in group:
        print(file_path)
    print()


In [None]:
# connect to server for hypothesis generation
import sys
import time


port = 80

# Optional first command-line argument; None when no argument was supplied
# (indexing sys.argv directly raised IndexError in that case).
# NOTE(review): nothing in the visible code reads server_ip -- the server
# address is hard-coded in openai_api_base below. Confirm before wiring it in.
server_ip = sys.argv[1] if len(sys.argv) > 1 else None

# Set OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "cmsc-35360"
openai_api_base = f"http://195.88.24.64:{port}/v1"


# Instructions placed before the paper summaries in the user prompt.
prompt = (
    "Read the following summaries of scientific papers carefully. "
    "Given these paper summaries, generate a new scientific hypothesis related to the topics mentioned in the paper. "
    "This hypothesis should be new (not an exact copy of the shown hypotheses or another paper), scientifically testable, "
    "and either provable or disprovable. Please generate A SINGLE question and nothing else, just the hypothesis with the question mark. "
    "Here are the summaries to analyze:"
)

# Output-format instructions placed after the paper summaries. Each piece ends
# with a space so adjacent literals do not fuse words together
# (previously "formatas" and "must beincluded").
prompt2 = (
    "Please format this hypothesis as HYPOTHESIS: then put the generated hypothesis. Don't write <HYPOTHESIS>: or any other format "
    "as I want to parse through these responses and the format must be consistent. Note that the front tag of HYPOTHESIS: must be "
    "included in the output of this response."
)


def create_prompt(chunk):
    """Assemble the full user prompt for one batch of paper summaries.

    The module-level ``prompt`` text comes first, then ``chunk`` verbatim,
    then the module-level ``prompt2`` text; nothing is inserted between the
    three pieces.

    Args:
        chunk: Text of the paper summaries to embed in the prompt.

    Returns:
        The concatenated prompt string.
    """
    pieces = (prompt, chunk, prompt2)
    return "".join(pieces)


# OpenAI-compatible client pointed at the API base configured above.
client = openai.OpenAI(base_url=openai_api_base, api_key=openai_api_key)


# Persona text sent as the first chat message of every request.
gpt_assistant_prompt = (
    "You are a super smart AI that knows about science. "
    "You follow directions and you are always truthful and concise in your reponses."
)


def query_llm(gpt_user_prompt, max_retries=3, delay_seconds=2):
    """Send one prompt to the chat-completions endpoint and return the reply.

    Args:
        gpt_user_prompt: Full user prompt to send.
        max_retries: Maximum number of attempts before giving up.
        delay_seconds: Pause after every attempt, successful or not, so the
            server is not hit with back-to-back requests.

    Returns:
        The text of the first completion choice, or ``None`` when every
        attempt failed. Callers must handle the ``None`` case.
    """
    messages = [
        # Persona/behaviour instructions belong in the "system" role; sending
        # them as "assistant" presents them as a previous model reply.
        {"role": "system", "content": gpt_assistant_prompt},
        {"role": "user", "content": gpt_user_prompt},
    ]

    for attempt in range(1, max_retries + 1):
        try:
            response = client.chat.completions.create(
                model="meta-llama/Meta-Llama-3-70B-Instruct",
                messages=messages,
                temperature=0.0,
                frequency_penalty=0.0,
            )
            content = response.choices[0].message.content
        except Exception as e:
            # `Exception` rather than a bare except, so that Ctrl-C
            # (KeyboardInterrupt) still interrupts the run.
            print(f"Hypo trying again port {port} (attempt {attempt}/{max_retries}): {e}")
            time.sleep(delay_seconds)
            continue
        time.sleep(delay_seconds)
        return content

    # The original code had a bare `exit` here, which is a no-op expression;
    # the function has always fallen through and returned None.
    print("Maximum retries reached. Giving up on this prompt.")
    return None


In [None]:
# Empty results table; one row per paper group is appended to it later.
_RESULT_COLUMNS = ["papers", "HYPOTHESIS"]
df = pd.DataFrame(columns=_RESULT_COLUMNS)


In [None]:
# For each group: read all of its paper summaries, ask the LLM for one
# hypothesis, and record the (papers, hypothesis) pair in the dataframe.

new_rows = []
try:
    for group in similar_groups:
        paper_paths = [file_path for _, file_path in group]
        print(paper_paths)
        all_text = read_files(paper_paths)
        gpt_user_prompt = create_prompt(all_text)
        response = query_llm(gpt_user_prompt)
        print(response)

        # query_llm may return None, and the model does not always emit the
        # requested "HYPOTHESIS: " tag; skip such groups instead of crashing.
        parts = response.split("HYPOTHESIS: ") if response else []
        if len(parts) < 2:
            print(f"No hypothesis found for {paper_paths}; skipping group.")
            continue
        new_rows.append({"papers": paper_paths, "HYPOTHESIS": parts[1]})
finally:
    # Append everything collected so far in a single concat (rather than
    # growing the frame inside the loop). Running this in `finally` keeps the
    # rows gathered before an interruption or unexpected error.
    if new_rows:
        df = pd.concat(
            [df, pd.DataFrame(new_rows, columns=["papers", "HYPOTHESIS"])],
            ignore_index=True,
        )


In [None]:
# Write the collected hypotheses to disk, leaving out the row-index column.
output_csv = "hypothesis_out.csv"
df.to_csv(output_csv, index=False)
