Bena Smith 03/31/2025
This notebook creates embeddings for the people in our_people.csv, groups the people by running k-means clustering on those embeddings, and plots the people colored by group. It also prototypes the closest/furthest-person lookup that was written for server.py.

In [None]:
import pandas as pd
import openai
import numpy as np
from sklearn.decomposition import PCA
from scipy.spatial import distance
from sklearn.cluster import KMeans
import plotly.express as px
import json
import textwrap
import ast

In [None]:
# Create the OpenAI client.
# The API key is deliberately not written into the notebook (it would end up in version control
# and in shared copies). With no api_key argument the client reads the key from the
# OPENAI_API_KEY environment variable, so export that variable before starting the kernel.
client = openai.Client()


In [4]:
# Load the people table from our_people.csv (path is relative to the notebook's working
# directory) and preview the first rows.

our_people = pd.read_csv("our_people.csv")
our_people.head()

Unnamed: 0,ID,Name,About_Me,Favorite_Music_Genres,Outgoing,Outdoorsy,Politically_Correct,Religion,Embeddings
0,1,Mei Zhang,"I am curious and creative, enjoy reading fanta...","Classical, Folk, Indie",4,3,2,Atheist,"[-0.012970875552855431, -0.03853305513039231, ..."
1,2,Priya Sharma,"I am adventurous and pragmatic, enjoy electron...","Pop, Rock",5,4,1,Christian,"[-0.031710869539529084, 0.004513157159090042, ..."
2,3,Adebayo Nkosi,"I am outgoing and analytical, enjoy jazz music...","Rap, Hip-hop",3,5,5,Muslim,"[-0.005966135114431382, -0.04169810477178544, ..."
3,4,Emily Carter,"I am introverted and empathetic, enjoy country...","Metal, Blues",2,2,3,Spiritualist,"[-0.025490914564579723, -0.03134411147329956, ..."
4,5,Arjun Patel,"I am cautious and creative, enjoy blues music,...","Electronic, Pop",4,1,4,Jewish,"[0.011702044680714604, -0.017951407516375185, ..."


In [5]:
# Inspect the column dtypes right after loading from the CSV.
our_people.dtypes

ID                        int64
Name                     object
About_Me                 object
Favorite_Music_Genres    object
Outgoing                  int64
Outdoorsy                 int64
Politically_Correct       int64
Religion                 object
Embeddings               object
dtype: object

In [6]:
# Relative importance of each characteristic when the per-characteristic embeddings are combined.
# These could also be made dynamic if a person finds one thing especially important
# (e.g. wants to meet only people with the same religion).

weights = dict(
    About_Me=1.0,
    Favorite_Music_Genres=0.5,
    Outgoing=0.3,
    Outdoorsy=0.3,
    Politically_Correct=0.3,
    Religion=0.5,
)

In [None]:
# We have 2 options here (probably more!):
# 1. To convert numerical data to text data. This will be sent into openai's text embedding service
# 2. After the about me's are embedded, we can add a dimension that is these values scaled from 0-1 with min max scaling. 
# I chose 1 because openai can now compare features in the about me with these features and can extract new relationships
# ex. "Very outgoing" may be embedded similarly to words like "talkative," "friendly," "extroverted".

def convert_to_text(value, feature_name):
    """Translate a 1-5 trait rating into a short descriptive phrase.

    Parameters
    ----------
    value : int or float
        Rating on the 1-5 scale (1 maps to the first phrase, 5 to the last).
    feature_name : str
        One of "Outgoing", "Outdoorsy" or "Politically_Correct".

    Returns
    -------
    str or None
        The phrase for the rating, or None when ``feature_name`` is not one of
        the three supported features.

    Raises
    ------
    IndexError
        If the rating falls outside 1-5. Without this check a rating of 0 would
        silently wrap around (Python negative indexing) and return the phrase
        for 5, e.g. "Very outgoing" for the most introverted answer.
    """
    phrases_by_feature = {
        "Outgoing": ["Very introverted", "Somewhat introverted", "Neutral", "Somewhat outgoing", "Very outgoing"],
        "Outdoorsy": ["Hates outdoors", "Prefers indoors", "Neutral", "Likes outdoors", "Loves outdoors"],
        "Politically_Correct": ["Not PC at all", "Rarely PC", "Moderately PC", "Very PC", "Extremely PC"],
    }

    phrases = phrases_by_feature.get(feature_name)
    if phrases is None:
        # Unknown feature: keep returning None so existing callers are unaffected.
        return None

    position = int(value - 1)
    if not 0 <= position < len(phrases):
        raise IndexError(
            f"{feature_name} rating must be between 1 and {len(phrases)}, got {value!r}"
        )
    return phrases[position]

In [None]:
# Use OpenAI to create vector embeddings of sentences

def create_embeddings(about_mes):
    """Embed the given texts with the "text-embedding-3-small" model.

    Sends ``about_mes`` as the ``input`` of a single embeddings request made
    through the module-level ``client`` and returns a list holding the
    ``'embedding'`` value of every entry in the response's ``'data'`` list.
    """
    payload = client.embeddings.create(
        input=about_mes,
        model="text-embedding-3-small",
    ).model_dump()

    vectors = []
    for item in payload['data']:
        vectors.append(item['embedding'])
    return vectors



# current_person = our_people[our_people['ID'] == 1]['About_Me']
# other_people = our_people[our_people['ID'] != 1]['About_Me']

# current_person_embeddings = create_embeddings(current_person)
# other_people_embeddings = create_embeddings(other_people)



In [None]:
# Builds one weighted embedding for a person from their "about me" text and personal traits.
# `row` holds a single person's fields (for example one row of our_people); it is only read.
def get_weighted_embeddings(row):
    """Return the weighted sum of the text embeddings of one person's characteristics.

    Parameters
    ----------
    row : mapping-like (e.g. pandas Series or dict)
        Must provide 'About_Me', 'Favorite_Music_Genres', 'Outgoing',
        'Outdoorsy', 'Politically_Correct' and 'Religion'. The three ratings
        are turned into phrases with ``convert_to_text``; ``row`` itself is
        not modified.

    Returns
    -------
    numpy.ndarray
        Sum over the six characteristics of ``weights[name] * embedding``,
        using the module-level ``weights`` mapping.

    Raises
    ------
    ValueError
        If the embedding service does not return one vector per characteristic.
    """
    # NOTE(review): the previous comment here said the client collects the ratings as 0-4 while
    # they are stored as 1-5. No +1 shift happens in this function, so the ratings must already
    # be on the 1-5 scale when they arrive - confirm against the caller.
    # The ratings are converted to phrases so they can be embedded as text like the other fields.
    texts_by_feature = {
        'About_Me': row['About_Me'],
        'Favorite_Music_Genres': row['Favorite_Music_Genres'],
        'Outgoing': str(convert_to_text(row['Outgoing'], "Outgoing")),
        'Outdoorsy': str(convert_to_text(row['Outdoorsy'], "Outdoorsy")),
        'Politically_Correct': str(convert_to_text(row['Politically_Correct'], "Politically_Correct")),
        'Religion': row['Religion'],
    }

    # One batched request instead of six separate ones; create_embeddings returns
    # one vector per input text.
    features = list(texts_by_feature)
    vectors = create_embeddings([texts_by_feature[feature] for feature in features])
    if len(vectors) != len(features):
        raise ValueError(
            f"expected {len(features)} embeddings, got {len(vectors)}"
        )

    # Combine and weight the embeddings.
    # A stronger weight means it is more important that two friends share that characteristic:
    # if it is very important that two friends have the same politically_correct level,
    # 'Politically_Correct' should be weighted highly.
    combined_embedding = 0
    for feature, vector in zip(features, vectors):
        combined_embedding = combined_embedding + weights[feature] * np.array(vector)

    return combined_embedding

In [10]:
# Preview the table again before the embeddings are recomputed.
our_people.head()

Unnamed: 0,ID,Name,About_Me,Favorite_Music_Genres,Outgoing,Outdoorsy,Politically_Correct,Religion,Embeddings
0,1,Mei Zhang,"I am curious and creative, enjoy reading fanta...","Classical, Folk, Indie",4,3,2,Atheist,"[-0.012970875552855431, -0.03853305513039231, ..."
1,2,Priya Sharma,"I am adventurous and pragmatic, enjoy electron...","Pop, Rock",5,4,1,Christian,"[-0.031710869539529084, 0.004513157159090042, ..."
2,3,Adebayo Nkosi,"I am outgoing and analytical, enjoy jazz music...","Rap, Hip-hop",3,5,5,Muslim,"[-0.005966135114431382, -0.04169810477178544, ..."
3,4,Emily Carter,"I am introverted and empathetic, enjoy country...","Metal, Blues",2,2,3,Spiritualist,"[-0.025490914564579723, -0.03134411147329956, ..."
4,5,Arjun Patel,"I am cautious and creative, enjoy blues music,...","Electronic, Pop",4,1,4,Jewish,"[0.011702044680714604, -0.017951407516375185, ..."


In [None]:
# Get a weighted embedding for each person.
# get_weighted_embeddings calls the embeddings API, so this cell makes network requests for
# every row of our_people.
people_embeddings = [get_weighted_embeddings(row) for _, row in our_people.iterrows()]
# Overwrites the Embeddings column that was loaded from the CSV with the freshly computed vectors.
our_people["Embeddings"] = people_embeddings

In [None]:
# Preview the table with the recomputed Embeddings column.
our_people.head()

Unnamed: 0,ID,Name,About_Me,Favorite_Music_Genres,Outgoing,Outdoorsy,Politically_Correct,Religion,Embeddings
0,1,Mei Zhang,"I am curious and creative, enjoy reading fanta...","Classical, Folk, Indie",4,3,2,Atheist,"[-0.012967251939699054, -0.03848275458440185, ..."
1,2,Priya Sharma,"I am adventurous and pragmatic, enjoy electron...","Pop, Rock",5,4,1,Christian,"[-0.03171318294480443, 0.004503151681274175, 0..."
2,3,Adebayo Nkosi,"I am outgoing and analytical, enjoy jazz music...","Rap, Hip-hop",3,5,5,Muslim,"[-0.005950302630662919, -0.041694598342292014,..."
3,4,Emily Carter,"I am introverted and empathetic, enjoy country...","Metal, Blues",2,2,3,Spiritualist,"[-0.025493642315268517, -0.03135855286382139, ..."
4,5,Arjun Patel,"I am cautious and creative, enjoy blues music,...","Electronic, Pop",4,1,4,Jewish,"[0.011702044680714604, -0.017951407516375185, ..."


In [None]:
# Put the embeddings into JSON form so they are preserved nicely in the csv file.
# Values that are already strings (the column as loaded from the CSV, or this cell being run a
# second time) are left as they are; calling .tolist() on them would raise AttributeError.
our_people['Embeddings'] = our_people['Embeddings'].apply(
    lambda x: x if isinstance(x, str) else json.dumps(np.asarray(x).tolist())
)
# Note: this overwrites the same our_people.csv file the notebook loaded at the top.
our_people.to_csv('our_people.csv', index=False)

In [22]:
# Inspect the column dtypes after the embeddings were serialized.
our_people.dtypes

ID                        int64
Name                     object
About_Me                 object
Favorite_Music_Genres    object
Outgoing                  int64
Outdoorsy                 int64
Politically_Correct       int64
Religion                 object
Embeddings               object
dtype: object

In [None]:
# Perform k-means on the people embeddings to group each person with the people most similar to them.
# random_state is fixed so the group labels (and the plot colors derived from them) are the same
# on every run; without it the centroid initialization is random.
kmeans = KMeans(n_clusters=10, random_state=42)
# The Embeddings column holds strings when it comes from the CSV / JSON step, so parse those
# back into lists; values that are already sequences are used as they are.
our_people["Embeddings_lst"] = our_people["Embeddings"].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
# One row per person, one column per embedding dimension.
X = np.array(our_people["Embeddings_lst"].tolist())
kmeans.fit(X)
our_people["Group"] = kmeans.labels_

In [32]:
# Reduce dimensionality to 2D for visualization.
# The matrix is built from the Embeddings column itself (parsing string values the same way the
# k-means cell does) instead of from `people_embeddings`, which only exists if the API cell was
# run in this kernel session. This keeps the projection aligned with the rows of our_people and
# based on the same vectors that were clustered.
embedding_matrix = np.array([
    ast.literal_eval(e) if isinstance(e, str) else e
    for e in our_people["Embeddings"]
])
pca = PCA(n_components=2)
reduced_embeddings = pca.fit_transform(embedding_matrix)
our_people["x_embedding"] = reduced_embeddings[:, 0]
our_people["y_embedding"] = reduced_embeddings[:, 1]


In [None]:
# Wrap the about-me texts with <br> so they are displayed nicely in the hover of the plot.
# Any <br> left by a previous run of this cell is turned back into a space first, so re-running
# the cell does not pile up break tags.
# Note: this overwrites About_Me in place, so every later cell sees the wrapped text.
our_people["About_Me"] = our_people["About_Me"].apply(
    lambda x: '<br>'.join(textwrap.wrap(str(x).replace('<br>', ' '), width=40))
)


In [None]:
# Convert Group to strings so the plot treats it as a categorical (discrete) color
# instead of a continuous numeric one.
our_people['Group'] = our_people['Group'].astype(str)


In [77]:

# Scatter plot of the 2D PCA projection, one marker per person, colored by k-means group.
# The hover_data columns are what the hover template below refers to as customdata[0..5].
fig = px.scatter(x="x_embedding", y="y_embedding", color="Group", hover_data=["Name", "About_Me", "Favorite_Music_Genres", "Outgoing", "Outdoorsy", "Politically_Correct"], data_frame=our_people,
            title = "People Groups",
            labels={ 
                "x_embedding": "PC1",  "y_embedding": "PC2"
            },
            width=1000, height=500,
            )

# Customize hover label appearance (applied per trace so each one can use its own marker color)
fig.for_each_trace(lambda trace: trace.update(
    hoverlabel=dict(
        bgcolor="white",  # White hover background
        font_size=14,  # Font size
        font_family="Arial",  # Font family
        font_color=trace.marker.color,  # Text uses the trace's marker color
        bordercolor=trace.marker.color,  # Border color based on the trace's color
    ),
    hovertemplate=(
        # customdata indices are assumed to follow the order of the hover_data list passed to px.scatter
        "<b> Name: </b> %{customdata[0]}<br>"  # Show Name
        "<b> About Me: </b> %{customdata[1]}<br>"  # Show About Me
        "<b> Favorite Music Genres: </b> %{customdata[2]}<br>"  # Show Favorite Music Genres
        "<b> Outgoing: </b> %{customdata[3]}<br>"  # Show Outgoing
        "<b> Outdoorsy: </b> %{customdata[4]}<br>"  # Show Outdoorsy
        "<b> Politically Correct: </b> %{customdata[5]}<br>"  # Show Politically Correct
        "<extra></extra>"  # Remove trace info from the hover
    )
    ))

# Title, axis-title and legend typography plus overall layout
fig.update_layout(
    title_font_size=20, 
    title_font_family="Arial", 
    title_x=0.5,  # Center the title
    xaxis_title_font_size=14, 
    yaxis_title_font_size=14,
    showlegend=True,
    legend_title="Group",
    legend_title_font_size=14,
    legend_font_size=12,
    margin=dict(l=50, r=50, t=50, b=50),  # Adjust margins for better spacing
    template="plotly_white"  # Clean white background
)

# Marker size, opacity and outline, applied to every trace
fig.update_traces(marker=dict(size=12, opacity=0.8, line=dict(width=2, color='DarkSlateGrey')))

fig.show()

In [78]:
# Export the figure as an HTML file into the client's public folder
# (the path is relative to the notebook's working directory).
fig.write_html("../client/public/group_plot.html", full_html=True, include_plotlyjs=True)

In [None]:
# embedding = our_people.loc[our_people["ID"] == 1, "Embeddings"].values[0]
# embedding = embedding.strip('[]').split(' ')

The following code was written for server.py. It takes the current person and the other people and finds the n closest and n furthest people from the current person.

In [None]:
current_person = our_people[our_people['ID'] == 1]
# Reuse the embedding already stored for this person instead of calling the embeddings API again.
# By this point About_Me has been rewritten with <br> tags for the plot hover, so re-embedding
# the row would embed that markup and no longer match the stored vector.
# (For a brand-new person with no stored embedding, get_weighted_embeddings is the way to get one.)
_stored_embedding = current_person["Embeddings"].iloc[0]  # .iloc[0] picks the single matching row
current_person_embeddings = np.array(
    ast.literal_eval(_stored_embedding) if isinstance(_stored_embedding, str) else _stored_embedding
)


In [None]:
# Display the current person's embedding vector.
current_person_embeddings

In [None]:
other_people = our_people[our_people['ID'] != 1]
# Reuse the stored embeddings (parsing string values back into numbers) rather than re-embedding
# everyone through the API: About_Me now contains <br> markup added for the plot hover, and the
# vectors for these people are already in the table.
# The list is in the same order as the rows of other_people, so position i here is other_people.iloc[i].
other_people_embeddings = [
    np.array(ast.literal_eval(e) if isinstance(e, str) else e)
    for e in other_people["Embeddings"]
]


In [None]:
def find_n_closest(query_vector, embeddings, n=3):
    """Return the ``n`` embeddings with the smallest cosine distance to ``query_vector``.

    Parameters
    ----------
    query_vector : sequence of float or numpy.ndarray
        The vector to compare against.
    embeddings : iterable of sequences or numpy.ndarray
        Candidate vectors. Plain lists are accepted as well as numpy arrays
        (previously ``.tolist()`` was called on each one, which failed for lists).
    n : int, default 3
        How many results to return.

    Returns
    -------
    list of dict
        Up to ``n`` entries ``{"distance": <cosine distance>, "index": <position>}``
        ordered from closest to furthest. ``index`` is the position of the
        vector within ``embeddings``, not a DataFrame label or a person ID.
    """
    query = np.asarray(query_vector, dtype=float)
    distances = [
        {"distance": distance.cosine(query, np.asarray(embedding, dtype=float)), "index": index}
        for index, embedding in enumerate(embeddings)
    ]
    distances.sort(key=lambda entry: entry["distance"])
    return distances[:n]
        

In [None]:
print("Current person: ", current_person.iloc[0], "\n")
# print(current_person_embeddings)
# print(other_people_embeddings)
print("Other People:")
similar_people = find_n_closest(current_person_embeddings, other_people_embeddings)
for person in similar_people:
    index = person["index"]
    # index is a position within other_people_embeddings, which is built from other_people
    # (the table without the current person). Looking it up in our_people would be off by the
    # excluded row and could even print the current person as their own match.
    person_row = other_people.iloc[index]
    print(person_row)
    

In [None]:
def find_n_furthest(query_vector, embeddings, n=3):
    """Return the ``n`` embeddings with the largest cosine distance to ``query_vector``.

    Parameters
    ----------
    query_vector : sequence of float or numpy.ndarray
        The vector to compare against.
    embeddings : iterable of sequences or numpy.ndarray
        Candidate vectors.
    n : int, default 3
        How many results to return. If ``n`` is at least the number of
        candidates, all of them are returned; ``n <= 0`` returns an empty list.

    Returns
    -------
    list of dict
        Up to ``n`` entries ``{"distance": <cosine distance>, "index": <position>}``
        in ascending distance order, so the very furthest vector is last.
        ``index`` is the position of the vector within ``embeddings``.
    """
    distances = [
        {"distance": distance.cosine(query_vector, embedding), "index": index}
        for index, embedding in enumerate(embeddings)
    ]
    distances.sort(key=lambda entry: entry["distance"])
    if n <= 0:
        return []
    # Take the tail of the ascending list. The earlier slice stopped at len - 1, which dropped
    # the single furthest element and produced a negative start index when n >= len.
    return distances[-n:]

In [None]:
print("Not similar People:")
# Keep the furthest matches under their own name so the closest matches stored in
# `similar_people` are not overwritten by the opposite result.
dissimilar_people = find_n_furthest(current_person_embeddings, other_people_embeddings)
for match in dissimilar_people:
    # match["index"] is a position within other_people_embeddings, i.e. a row position in other_people
    print(other_people.iloc[match["index"]])


Another method would be to extract information from the "about me's" and put it in a more tabular form. For example, if someone says they like indie music in their "about me", we could classify them as someone who likes indie music in the music column. This way, we can weight these columns more easily. If there is an indie concert soon, we might want to invite only people who enjoy this type of music.

However, if there is a concert in a genre I don't love, but 222 thinks that people I would become friends with will be there, I would probably still try it out to meet them!

We should consult academic texts about what personality types get along with others. Do opposites attract?