In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
def generate_similarity_matrix(embeddings, df):
    """
    Generate similarity matrix from data
    :param data: pandas DataFrame
    :return: similarity matrix
    """
    user_ids = df.index

    # Assume user_embeddings is an (N, D) matrix, where:
    # - N = number of users
    # - D = embedding dimension

    similarity_matrix = cosine_similarity(embeddings)

    # Convert to DataFrame for easier lookup
    similarity_df = pd.DataFrame(similarity_matrix, index=user_ids, columns=user_ids)

    return similarity_df

In [3]:
user_embedding = np.load('okcupid_profiles_preprocessed.npy')
df = pd.read_csv('okcupid_profiles_cleaned.csv')

In [4]:
similarity_df = generate_similarity_matrix(user_embedding, df)

In [5]:
# Show similarity scores between users
similarity_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,59936,59937,59938,59939,59940,59941,59942,59943,59944,59945
0,1.000000,0.402862,-0.057132,0.744822,-0.047582,0.092371,-0.301606,-0.252902,0.244712,-0.422904,...,-0.419586,0.351081,0.203758,-0.089630,-0.512311,-0.603729,0.881341,0.190992,0.884640,-0.086727
1,0.402862,1.000000,0.673523,0.251320,0.316383,0.394935,0.275800,0.265434,0.174826,0.435308,...,-0.190409,0.733489,0.785232,0.724801,0.024253,0.365281,0.475036,0.785896,0.563800,0.655437
2,-0.057132,0.673523,1.000000,-0.102936,0.396631,0.331526,0.481744,0.449703,0.024536,0.700343,...,0.053667,0.657962,0.703570,0.771470,0.311410,0.669914,0.038552,0.722382,0.134366,0.732014
3,0.744822,0.251320,-0.102936,1.000000,0.218367,0.224549,-0.119834,-0.024107,0.451047,-0.305540,...,-0.149203,0.285685,-0.026792,-0.248943,-0.252008,-0.562506,0.754436,0.008599,0.678194,-0.155555
4,-0.047582,0.316383,0.396631,0.218367,1.000000,0.669479,0.760354,0.813769,0.619304,0.650322,...,0.672918,0.500107,0.004472,0.088809,0.725776,0.290627,0.187130,0.077459,0.076768,0.283604
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59941,-0.603729,0.365281,0.669914,-0.562506,0.290627,0.197476,0.564641,0.486804,-0.179641,0.799171,...,0.230997,0.268278,0.533709,0.762184,0.525000,1.000000,-0.513655,0.531318,-0.381359,0.696514
59942,0.881341,0.475036,0.038552,0.754436,0.187130,0.273466,-0.109767,-0.020456,0.379317,-0.215472,...,-0.230865,0.445759,0.211063,-0.064792,-0.304576,-0.513655,1.000000,0.223447,0.843424,-0.023088
59943,0.190992,0.785896,0.722382,0.008599,0.077459,0.130709,0.150237,0.135439,-0.196956,0.402727,...,-0.343759,0.539455,0.905891,0.867957,-0.064439,0.531318,0.223447,1.000000,0.379961,0.673854
59944,0.884640,0.563800,0.134366,0.678194,0.076768,0.197924,-0.171569,-0.148365,0.215013,-0.225393,...,-0.369054,0.503057,0.376140,0.122412,-0.403015,-0.381359,0.843424,0.379961,1.000000,0.122001


In [6]:
def get_top_matches(user_id, similarity_df, user_df, top_n=5, 
                    age_range=None, height_range=None, location=None, 
                    education=None, job=None, pets=None, offspring=None, 
                    speaks=None, ethnicity=None, diet=None, 
                    body_type=None, drinks=None, drugs=None, 
                    religion=None, sign=None, smokes=None, 
                    keyword_filter=None):
    """
    Returns top N matches for a given user, with optional flexible filters.

    Parameters:
        user_id (int): ID of the user to find matches for.
        similarity_df (pd.DataFrame): Pairwise similarity matrix.
        user_df (pd.DataFrame): Original user data.
        top_n (int): Number of matches to return.
        Various filters: Accepts category names or partial text matching.

    Returns:
        pd.DataFrame: Top N matching users.
    """

    # Make a copy of the user_df to avoid modifying the original
    user_df = user_df.copy()

    # Get user's gender and orientation
    user_sex = user_df.loc[user_id, "sex"]
    user_orientation = user_df.loc[user_id, "orientation"]

    # Define valid matches based on gender + orientation
    if user_sex == "male":
        if user_orientation == "straight":
            valid_matches = user_df[(user_df["sex"] == "female") & 
                                    (user_df["orientation"].isin(["straight", "bisexual"]))]
        elif user_orientation == "gay":
            valid_matches = user_df[(user_df["sex"] == "male") & 
                                    (user_df["orientation"].isin(["gay", "bisexual"]))]
        elif user_orientation == "bisexual":
            valid_matches = user_df[((user_df["sex"] == "female") & 
                                     (user_df["orientation"].isin(["straight", "bisexual"]))) |
                                    ((user_df["sex"] == "male") & 
                                     (user_df["orientation"].isin(["gay", "bisexual"])))]
    elif user_sex == "female":
        if user_orientation == "straight":
            valid_matches = user_df[(user_df["sex"] == "male") & 
                                    (user_df["orientation"].isin(["straight", "bisexual"]))]
        elif user_orientation == "gay":
            valid_matches = user_df[(user_df["sex"] == "female") & 
                                    (user_df["orientation"].isin(["gay", "bisexual"]))]
        elif user_orientation == "bisexual":
            valid_matches = user_df[((user_df["sex"] == "male") & 
                                     (user_df["orientation"].isin(["straight", "bisexual"]))) |
                                    ((user_df["sex"] == "female") & 
                                     (user_df["orientation"].isin(["gay", "bisexual"])))]
    else:
        return pd.DataFrame()  # No valid matches

    # Apply Numeric Filters
    if age_range:
        valid_matches = valid_matches[(valid_matches["age"] >= age_range[0]) & 
                                      (valid_matches["age"] <= age_range[1])]

    if height_range:
        valid_matches = valid_matches[(valid_matches["height"] >= height_range[0]) & 
                                      (valid_matches["height"] <= height_range[1])]

    # Flexible Location Filter (city, state, country)
    if location:
        valid_matches = valid_matches[valid_matches["location"].str.contains(location, case=False, na=False)]

    # Flexible Text-Based Filters
    text_filters = {
        "education": education, "job": job, "speaks": speaks, "ethnicity": ethnicity
    }
    
    for column, value in text_filters.items():
        if value:
            valid_matches = valid_matches[valid_matches[column].str.contains(value, case=False, na=False)]

    # *🔹 Fuzzy Diet Matching*
    if diet:
        diet_synonyms = {
            "vegan": ["vegan", "vegetarian", "mostly vegetarian"],
            "vegetarian": ["vegetarian", "mostly vegetarian"],
            "pescatarian": ["pescatarian", "mostly pescatarian"],
            "halal": ["halal", "mostly halal"],
            "kosher": ["kosher", "mostly kosher"]
        }
        
        valid_matches = valid_matches[valid_matches["diet"].apply(
            lambda x: any(diet.lower() in diet_synonyms.get(d, [d.lower()]) for d in str(x).split(", ")) 
            if pd.notna(x) else False
        )]

    # *🔹 Flexible Pet Preferences*
    if pets:
        for pet_pref in pets:
            if pet_pref == "likes dogs":
                valid_matches = valid_matches[
                    valid_matches["pets"].str.contains("has dogs|likes dogs", case=False, na=False)
                ]
            elif pet_pref == "dislikes dogs":
                valid_matches = valid_matches[
                    valid_matches["pets"].str.contains("dislikes dogs", case=False, na=False)
                ]
            elif pet_pref == "likes cats":
                valid_matches = valid_matches[
                    valid_matches["pets"].str.contains("has cats|likes cats", case=False, na=False)
                ]
            elif pet_pref == "dislikes cats":
                valid_matches = valid_matches[
                    valid_matches["pets"].str.contains("dislikes cats", case=False, na=False)
                ]

    # *🔹 Flexible Offspring (Kids) Preferences*
    if offspring:
        if "likes kids" in offspring:
            valid_matches = valid_matches[valid_matches["offspring"].str.contains("has kids|wants kids", case=False, na=False)]
        if "dislikes kids" in offspring:
            valid_matches = valid_matches[~valid_matches["offspring"].str.contains("has kids|wants kids", case=False, na=False)]
        if "neutral about kids" in offspring:
            valid_matches = valid_matches[valid_matches["offspring"].isna()]

    # Exact Match Filters (Categories)
    category_filters = {
        "body_type": body_type, "drinks": drinks, "drugs": drugs,
        "religion": religion, "sign": sign, "smokes": smokes
    }

    for column, value in category_filters.items():
        if value:
            valid_matches = valid_matches[valid_matches[column] == value]

    # Get valid user IDs
    valid_user_ids = valid_matches.index

    # Get similarity scores for valid matches
    filtered_similarities = similarity_df.loc[user_id, valid_user_ids]

    # Apply keyword-based filtering (if provided)
    if keyword_filter:
        keyword_filtered_users = valid_matches[
            valid_matches["essay_all"].str.contains(keyword_filter, case=False, na=False)
        ].index
        filtered_similarities = filtered_similarities.loc[keyword_filtered_users]

    # Return top N matches
    top_matches = filtered_similarities.sort_values(ascending=False).head(top_n)
    
    return user_df.loc[top_matches.index].assign(similarity_score=top_matches.values)

In [7]:
# *Example Usage*
user_id = 12345
filtered_matches = get_top_matches(
    user_id, similarity_df, df, top_n=10, 
    age_range=(25, 35), 
    height_range=(160, 180), 
    location="California",
    education="university", 
    job="engineer", 
    pets=["likes dogs", "dislikes cats"], 
    # ethnicity="Asian", 
    # diet="vegan", 
    # offspring=["neutral to kids"], 
    # keyword_filter="love hiking"
)

display(df.loc[user_id].to_frame().T)
display(filtered_matches)

Unnamed: 0,age,status,sex,orientation,body_type,diet,drinks,drugs,education,ethnicity,height,job,location,offspring,pets,religion,sign,smokes,speaks,essay_all
12345,25.0,single,male,straight,fit,mostly anything,often,sometimes,graduated from masters program,white,175.0,sales / marketing / biz dev,"san francisco, california","doesn't have kids, but wants them",no pets and neutral to pets,christianity but not too serious about it,aries but it doesn't matter,sometimes,"english (fluently), french (fluently), italian...",hi ! i come from france and just moved in san ...


Unnamed: 0,age,status,sex,orientation,body_type,diet,drinks,drugs,education,ethnicity,...,job,location,offspring,pets,religion,sign,smokes,speaks,essay_all,similarity_score
59824,26.0,single,female,straight,thin,mostly anything,socially,never,graduated from college/university,white,...,science / tech / engineering,"san mateo, california","doesn't have kids, but might want them",likes dogs and dislikes cats,christianity and very serious about it,aquarius but it doesn't matter,no,"english (fluently), french (poorly)",i moved to the bay area from indiana less than...,0.687287
48258,25.0,seeing someone,female,straight,average,strictly anything,often,never,graduated from college/university,white,...,science / tech / engineering,"san francisco, california",no kids and neutral to kids,likes dogs and dislikes cats,agnosticism and very serious about it,gemini and it's fun to think about,when drinking,"english (fluently), c++ (poorly), french (okay)",hello world! this feels very much like a colle...,0.668369
35354,29.0,single,female,straight,average,anything,socially,never,graduated from college/university,white,...,science / tech / engineering,"san francisco, california","doesn't have kids, but wants them",likes dogs and dislikes cats,catholicism and somewhat serious about it,virgo,no,english,"i'm a professional engineer by trade, but i sp...",0.656392
1085,26.0,single,female,straight,average,anything,socially,never,graduated from college/university,"native american, hispanic / latin",...,science / tech / engineering,"san francisco, california",doesn't want kids,likes dogs and dislikes cats,atheism and very serious about it,libra but it doesn't matter,no,"english (fluently), spanish (fluently), italia...","i'm verrrry bad at self-summaries. just, fair ...",0.632605
31041,26.0,single,female,straight,average,anything,socially,never,graduated from college/university,asian,...,science / tech / engineering,"san francisco, california","doesn't have kids, but wants them",likes dogs and dislikes cats,catholicism but not too serious about it,aries but it doesn't matter,when drinking,"english (fluently), tagalog (fluently)",i'm a nerd with glasses and the occasional zit...,0.604481
8965,30.0,single,female,straight,average,mostly anything,often,never,graduated from college/university,white,...,science / tech / engineering,"san francisco, california","doesn't have kids, but wants them",dislikes dogs and dislikes cats,christianity but not too serious about it,gemini but it doesn't matter,no,english (fluently),"originally from ireland, i'm living in san fra...",0.553098
25519,25.0,single,female,straight,skinny,anything,socially,never,graduated from college/university,"middle eastern, hispanic / latin, white, other",...,science / tech / engineering,"san francisco, california",doesn't have kids,likes dogs and dislikes cats,irreligion,sagittarius and it's fun to think about,no,"english, chinese, other",discovering a new city and learning how to cod...,0.518423
13047,25.0,single,female,straight,fit,anything,socially,never,graduated from college/university,"middle eastern, white",...,science / tech / engineering,"san francisco, california",no kids and neutral to kids,dislikes dogs and dislikes cats,judaism and laughing about it,taurus and it's fun to think about,no,"english (fluently), hebrew (okay)",here's my best shot at the ambitious task of f...,0.48452
26625,27.0,single,female,straight,full figured,mostly anything,socially,never,graduated from college/university,asian,...,science / tech / engineering,"south san francisco, california",no kids and neutral to kids,likes dogs and dislikes cats,irreligion,cancer and it's fun to think about,yes,english,call it what you will but i never liked descri...,0.445736
14433,33.0,single,female,straight,average,mostly anything,socially,never,graduated from college/university,white,...,science / tech / engineering,"oakland, california",no kids and neutral to kids,has dogs and dislikes cats,irreligion,leo and it's fun to think about,no,english,i just moved home to the bay after living in a...,0.397437
