In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the two persisted artefacts: the cleaned profile table and the
# precomputed user-by-user similarity matrix.
df = pd.read_csv('okcupid_profiles_cleaned.csv')
similarity_matrix = np.load('similarity_matrix.npy')

# The profile table's row index doubles as the user identifier.
user_ids = df.index

# Labelled view of the matrix: rows and columns are both keyed by user id.
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=user_ids,
    columns=user_ids,
)

In [3]:
# Preview the labelled similarity matrix; for a frame this large pandas
# abbreviates the rendering (elided rows/columns show up as "...").
similarity_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,59936,59937,59938,59939,59940,59941,59942,59943,59944,59945
0,1.000000,0.402862,-0.057132,0.744822,-0.047582,0.092371,-0.301606,-0.252902,0.244712,-0.422904,...,-0.419586,0.351081,0.203758,-0.089630,-0.512311,-0.603729,0.881341,0.190992,0.884640,-0.086727
1,0.402862,1.000000,0.673523,0.251320,0.316383,0.394935,0.275800,0.265434,0.174826,0.435308,...,-0.190409,0.733489,0.785232,0.724801,0.024253,0.365281,0.475036,0.785896,0.563800,0.655437
2,-0.057132,0.673523,1.000000,-0.102936,0.396631,0.331526,0.481744,0.449703,0.024536,0.700343,...,0.053667,0.657962,0.703570,0.771470,0.311410,0.669914,0.038552,0.722382,0.134366,0.732014
3,0.744822,0.251320,-0.102936,1.000000,0.218367,0.224549,-0.119834,-0.024107,0.451047,-0.305540,...,-0.149203,0.285685,-0.026792,-0.248943,-0.252008,-0.562506,0.754436,0.008599,0.678194,-0.155555
4,-0.047582,0.316383,0.396631,0.218367,1.000000,0.669479,0.760354,0.813769,0.619304,0.650322,...,0.672918,0.500107,0.004472,0.088809,0.725776,0.290627,0.187130,0.077459,0.076768,0.283604
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59941,-0.603729,0.365281,0.669914,-0.562506,0.290627,0.197476,0.564641,0.486804,-0.179641,0.799171,...,0.230997,0.268278,0.533709,0.762184,0.525000,1.000000,-0.513655,0.531318,-0.381359,0.696514
59942,0.881341,0.475036,0.038552,0.754436,0.187130,0.273466,-0.109767,-0.020456,0.379317,-0.215472,...,-0.230865,0.445759,0.211063,-0.064792,-0.304576,-0.513655,1.000000,0.223447,0.843424,-0.023088
59943,0.190992,0.785896,0.722382,0.008599,0.077459,0.130709,0.150237,0.135439,-0.196956,0.402727,...,-0.343759,0.539455,0.905891,0.867957,-0.064439,0.531318,0.223447,1.000000,0.379961,0.673854
59944,0.884640,0.563800,0.134366,0.678194,0.076768,0.197924,-0.171569,-0.148365,0.215013,-0.225393,...,-0.369054,0.503057,0.376140,0.122412,-0.403015,-0.381359,0.843424,0.379961,1.000000,0.122001


In [32]:
def find_similar_users(user_id, similarity_matrix, top_k=10):
    """
    Find the top_k most similar, mutually compatible users for a given user_id.

    Relies on the notebook-level ``df`` (profile table with "sex" and
    "orientation" columns) and ``user_ids`` (``df.index``). Row ``i`` of
    ``similarity_matrix`` is expected to describe the profile at position
    ``i`` of ``df`` / ``user_ids``.

    A candidate is compatible when the user is attracted to the candidate's
    sex and the candidate is attracted to the user's sex:
    straight -> other sex, gay -> same sex, bisexual -> both.

    Parameters
    ----------
    user_id : label present in ``user_ids``.
    similarity_matrix : 2-D array of pairwise similarity scores.
    top_k : maximum number of matches to return.

    Returns
    -------
    The entries of ``user_ids`` for the best matches, ordered from most to
    least similar. Empty when the user's sex or orientation is not
    recognised, when nobody is compatible, or when ``top_k`` is not positive.

    Raises
    ------
    IndexError
        If ``user_id`` does not occur in ``user_ids``.
    """
    user_index = np.where(user_ids == user_id)[0][0]  # position of the user
    scores = np.asarray(similarity_matrix[user_index])

    user_sex = df.loc[user_id, "sex"]
    user_orientation = df.loc[user_id, "orientation"]

    other_sex = {"male": "female", "female": "male"}
    if user_sex not in other_sex or user_orientation not in ("straight", "gay", "bisexual"):
        # Unknown or missing attributes: no compatibility rule applies.
        return user_ids[:0]

    # Same-sex candidates must themselves be open to the same sex, and
    # other-sex candidates must be open to the other sex.
    same_sex_pool = df["sex"].eq(user_sex) & df["orientation"].isin(["gay", "bisexual"])
    other_sex_pool = (df["sex"].eq(other_sex[user_sex])
                      & df["orientation"].isin(["straight", "bisexual"]))

    if user_orientation == "straight":
        compatible = other_sex_pool
    elif user_orientation == "gay":
        compatible = same_sex_pool
    else:  # bisexual
        compatible = same_sex_pool | other_sex_pool

    # Work with positions so the ranking can be mapped back to user_ids.
    mask = compatible.to_numpy(dtype=bool).copy()
    mask[user_index] = False  # exclude the user explicitly, not by rank
    candidate_positions = np.flatnonzero(mask)
    if top_k <= 0 or candidate_positions.size == 0:
        return user_ids[:0]

    # Rank only the candidates (stable sort keeps ties in profile order),
    # then translate subset ranks back into positions in the full table.
    ranking = np.argsort(-scores[candidate_positions], kind="stable")[:top_k]
    return user_ids[candidate_positions[ranking]]

In [33]:
def match_overlap_score(user1, user2, similarity_matrix, top_k=10):
    """
    Measure how much the recommendation lists of two users coincide.

    Both users' top_k matches are fetched through find_similar_users; the
    result is the number of ids present in both lists divided by top_k.
    """
    recommended = [
        set(find_similar_users(uid, similarity_matrix, top_k))
        for uid in (user1, user2)
    ]
    shared = recommended[0] & recommended[1]
    return len(shared) / top_k  # fraction of the requested list size

In [34]:
import random

def evaluate_recommendation_system(similarity_matrix, user_ids, test_cases=100, top_k=10,
                                   min_similarity=0.8, seed=None):
    """
    Evaluates recommendation consistency by checking overlap between similar users.

    Draws ``test_cases`` random pairs of distinct users. Pairs whose
    similarity is below ``min_similarity`` are skipped; for the rest the
    overlap of their top_k recommendation lists is computed with
    match_overlap_score.

    Parameters
    ----------
    similarity_matrix : 2-D array indexed by user position.
    user_ids : sequence of user ids, ordered like the matrix rows.
    test_cases : number of random pairs to draw (skipped pairs count).
    top_k : recommendation list size passed to match_overlap_score.
    min_similarity : a pair is evaluated only if its similarity is at
        least this value.
    seed : optional seed for a private random generator, for reproducible
        runs. When None the module-level ``random`` state is used.

    Returns
    -------
    Mean overlap over the evaluated pairs, or NaN when no sampled pair
    reached ``min_similarity``.

    Raises
    ------
    ValueError
        If fewer than two users are supplied.
    """
    # Materialise the ids once rather than on every iteration.
    ids = list(user_ids)
    if len(ids) < 2:
        raise ValueError("At least two users are required to sample a pair")

    rng = random if seed is None else random.Random(seed)
    positions = range(len(ids))
    overlap_scores = []

    for _ in range(test_cases):
        # Sample matrix positions directly; this avoids scanning user_ids
        # to translate each sampled id back into a row/column number.
        pos1, pos2 = rng.sample(positions, 2)

        # Only pairs that are actually similar are informative.
        if similarity_matrix[pos1, pos2] < min_similarity:
            continue

        overlap_scores.append(
            match_overlap_score(ids[pos1], ids[pos2], similarity_matrix, top_k)
        )

    if not overlap_scores:
        # Nothing qualified: report NaN explicitly instead of letting
        # np.mean warn about an empty slice.
        return np.nan
    return np.mean(overlap_scores)

In [35]:
# Evaluate the recommendation system.
# The list size is named once so the printed label always reflects the
# top_k that was actually evaluated.
EVAL_TOP_K = 10
avg_overlap_score = evaluate_recommendation_system(similarity_matrix, user_ids, top_k=EVAL_TOP_K)
print(f"✅ Average Top-{EVAL_TOP_K} Match Overlap Score: {avg_overlap_score:.2f}")

✅ Average Top-10 Match Overlap Score: 0.03


In [36]:
# Sanity check on the score distribution: smallest, largest and average entry.
print(np.min(similarity_matrix), np.max(similarity_matrix), np.mean(similarity_matrix))

-0.9822719601682814 1.0000000000000016 0.3068698467119406


In [37]:
# Peek at the first ten user ids.
list(user_ids[:10])

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [38]:
test_user_index = 0  # Fixed example position (hard-coded, not randomly drawn)
# Negating the row makes argsort rank the highest similarity first. Nothing is
# filtered out here, so the user itself can appear in the list (in the recorded
# output it is the first entry).
top_matches = np.argsort(-similarity_matrix[test_user_index])[:10]
print(user_ids[test_user_index], [user_ids[i] for i in top_matches])

0 [0, 27351, 42758, 31254, 28946, 43147, 17122, 27772, 50770, 8562]
