In [1]:


import ast  # for converting embeddings saved as strings back to arrays
from openai import OpenAI # for calling the OpenAI API
import pandas as pd  # for storing text and embeddings data
import tiktoken  # for counting tokens
import os # for getting API token from env variable OPENAI_API_KEY
from scipy import spatial  # for calculating vector similarities for searchik
import json
from openai import OpenAI
import os
import re

In [2]:
import pandas as pd

courses = pd.read_json("courses2.json")
course_texts = []
for i in range(len(courses)):
    (
        course_title,
        course_description,
        course_credits,
        subject_notes,
        breadth,
        prereqs,
        course_attributes,
        full_course_title,
        course_level,
    ) = (None, None, None, None, None, None, None, None, None)
    if courses["title"][i]:
        course_title = courses["title"][i]
    if courses["description"][i]:
        course_description = courses["description"][i]
    if courses["level"][i]:
        course_level = courses["level"][i]
    if courses["credits"][i]:
        course_credits = courses["credits"][i]
    if courses["subject_notes"][i]:
        subject_notes = courses["subject_notes"][i]
    if courses["breadth"][i]:
        breadth = " and ".join(courses["breadth"][i])
    if courses["prereqs"][i]:
        prereqs = courses["prereqs"][i]
    if courses["course_attributes"][i]:
        course_attributes = " and ".join(courses["course_attributes"][i])
    if courses["full_course_title"][i]:
        full_course_title = courses["full_course_title"][i]
    if courses['level'][i]:
        course_level = courses['level'][i]
        
    course_paragraph = []
    if course_title and full_course_title:
        course_paragraph.append(
            f"Course title {full_course_title} is abbreviated as {course_title}."
        )
    if course_description:
        course_paragraph.append(
            f"The course description for {full_course_title} is: {course_description}."
        )
    if course_level:
        course_paragraph.append(f"This course is at an {course_level} level.")
    if course_credits:
        course_paragraph.append(f"It is worth {course_credits} credits.")
    if subject_notes:
        course_paragraph.append(
            f"The subject notes for this course are {subject_notes}."
        )
    if breadth:
        course_paragraph.append(f"The course is a {breadth} course.")
    if prereqs:
        course_paragraph.append(f"The prerequisites are: {prereqs}.")
    # if course_attributes:
    #     course_paragraph.append(f"The course attributes are {course_attributes}.")

    course_texts.append(". ".join(course_paragraph))

course_texts = [course.lower() for course in course_texts]


def replace_multiple_periods(text):
    """Replace sequences of periods with a single period in a given text."""
    return re.sub(r"\.{2,}", ".", text)


course_texts = [replace_multiple_periods(course) for course in course_texts]
print(course_texts[0])

course title cooperative education program is abbreviated as bse 1. the course description for cooperative education program is: full-time off-campus work experience which combines classroom theory with practical knowledge of operations to provide a background upon which to base a professional career. it is worth 1.00 credit credits. the subject notes for this course are anita thompson, chair, 115e ag engr, (608)262-3310. the prerequisites are: consent of instructor.


In [3]:
major_requirements = json.load(open("major_requirements.json"))
major_texts = []

for major_name, major_requirement in major_requirements.items():
    major_texts.append(
        f"The major requirements for {major_name} are as follows: {major_requirement}"
    )

major_texts = [major.lower() for major in major_texts]
# Truncate to 8192 tokens
course_texts = [text[:8192] for text in course_texts]

In [4]:
os.environ["OPENAI_API_KEY"] = "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"
client = OpenAI()
texts = course_texts + major_texts
data = pd.DataFrame({"text": texts})
texts = [
    text[: 8192 * 3] for text in texts
]  # Average chars/token is like 4, so 8192*3 will hopefully get most of it.


def batch_texts(texts, max_batch_size=100):
    for i in range(0, len(texts), max_batch_size):
        yield texts[i : i + max_batch_size]


EMBEDDING_MODEL = "text-embedding-3-small"

all_embeddings = []
for batch in batch_texts(texts):
    try:
        res = client.embeddings.create(input=batch, model=EMBEDDING_MODEL)
        all_embeddings.extend(res.data)
    except Exception as e:
        print(f"Error with batch: {e}")


data["embedding"] = [embedding.embedding for embedding in all_embeddings]
data.to_json("courses_with_embeddings.json", orient="records")

In [5]:
# search function
def strings_ranked_by_relatedness(
    query: str,
    df: pd.DataFrame,
    relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
    top_n: int = 25
) -> tuple[list[str], list[float]]:
    """Returns a list of strings and relatednesses, sorted from most related to least."""
    query_embedding_response = client.embeddings.create(
        model=EMBEDDING_MODEL,
        input=query,
    )
    query_embedding = query_embedding_response.data[0].embedding
    strings_and_relatednesses = [
        (row["text"], relatedness_fn(query_embedding, row["embedding"]))
        for i, row in df.iterrows()
    ]
    strings_and_relatednesses.sort(key=lambda x: x[1], reverse=True)
    strings, relatednesses = zip(*strings_and_relatednesses)
    return strings[:top_n], relatednesses[:top_n]


In [7]:
with open('chat_intro.md', 'r') as f:
    chat_intro = f.read()
    
with open('recommend_intro.md', 'r') as f:
    recommend_intro = f.read()

In [8]:
GPT_MODEL = "gpt-3.5-turbo"


def num_tokens(text: str, model: str = GPT_MODEL) -> int:
    """Return the number of tokens in a string."""
    encoding = tiktoken.encoding_for_model(model)
    return len(encoding.encode(text))


def query_message(
    query: str, df: pd.DataFrame, model: str, token_budget: int, chat=False
) -> str:
    """Return a message for GPT, with relevant source texts pulled from a dataframe."""
    strings, _ = strings_ranked_by_relatedness(query, df)
    question = f"\n\nQuestion: {query}"
    if chat:  # If chatting, not recommnending
        message = """Use the following course and major requirement information to identify yourself, and provide context for your date. \n\n"""
    else:
        message = """Use the following course and major requirement information to recommend courses relevant to the student. \n\n"""
    for string in strings:
        if num_tokens(message + string + question, model=model) > token_budget:
            break
        else:
            message += string
    return message + question

In [9]:
conversation_histories = {}  # course title: conversation history

def chat(
    course_title,
    message,
    model=GPT_MODEL,
    conversation_histories=conversation_histories,
    data=data,
    chat_intro=chat_intro,
):
    if course_title in conversation_histories:
        conversation_history = conversation_histories[course_title]
    else:
        conversation_history = []

    chat_intro = f'{course_title}. You are personifying {course_title} on Tinder.' + "\n\n" + chat_intro
    
    messages = [
        {"role": "system", "content": chat_intro},
        *conversation_history,  # Unpack the conversation history
        {"role": "user", "content": message},  # Add the current user message
    ]

    
    message = query_message(message, df=data, model=model, token_budget=8192, chat = True)

    response = client.chat.completions.create(
        model=GPT_MODEL, messages=messages, temperature=0
    )

    response_message = response.choices[0].message.content

    # Append the user message and the GPT (assistant) response to the history
    conversation_history.append({"role": "user", "content": message})
    conversation_history.append({"role": "assistant", "content": response_message})

    conversation_histories[course_title] = conversation_history
    return response_message


def recommend(message):
    message = query_message(message, df=data, model=GPT_MODEL, token_budget=8192, chat = False)
    messages = [
        {"role": "system", "content": recommend_intro},
        {"role": "user", "content": message},
    ]
    response = client.chat.completions.create(
        model=GPT_MODEL, messages=messages, temperature=0
    )
    response_message = response.choices[0].message.content
    return response_message

In [10]:
recommend('I am a Computer Science and Data Science Major. I have taken CS 354, CS 252, CS 200, CS 300, CS320, CS 400, LIS 461, STAT 340, STAT 240. I dislike math and love humanities. I dislike LIS 201, I dislike CS 577')

'comp sci 532, comp sci 319, comp sci 354, comp sci 352, comp sci 763, comp sci 544, comp sci 320, comp sci 402, comp sci 570, stat 605, stat 405, stat 605'

In [11]:
chat('comp sci 577', 'sex')

"I'm sorry, but I'm here to embody a specific college course and engage in conversations related to that. If you're interested in learning more about Comp Sci 577: Advanced Topics in Computer Science, feel free to ask me anything about the course!"

In [12]:
print(conversation_histories['comp sci 577'])

[{'role': 'user', 'content': 'Use the following course and major requirement information to identify yourself, and provide context for your date. \n\ncourse title human sexuality is abbreviated as soc 453. the course description for human sexuality is: provides an interdisciplinary introduction to biological, psychological, and sociological aspects of human sexuality. this course is at an intermediate level. it is worth 4.00 credits credits. the subject notes for this course are send course or timetable questions to undergraduate coordinator ted babcock, (608) 262-3261, ted.babcock@wisc.edu\nsee our enrollment help page:\nhttps://sociology.wisc.edu/undergraduate-program/sociology-enrollment-help/\n* if you are a transfer student and have completed an introductory sociology class elsewhere, please contact us so we can put you into a student group that will allow you to enroll in any sociology class that requires it.\ndepartment chair: prof. eric grodsky, 8128b sewell social sciences bld