# Import Libraries

In [1]:
# data manipulation
import numpy as np
import pandas as pd
# calculation of cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

# Load Embedding

In [2]:
# Read the preprocessed user embedding array from disk
# (presumably one row per user — confirm against the preprocessing step)
user_embedding = np.load(file='okcupid_profiles_preprocessed.npy')

# Calculate Cosine Similarity

In [3]:
def generate_similarity_matrix(embeddings):
    """
    Compute pairwise cosine similarity between the rows of ``embeddings``.

    :param embeddings: 2-D array-like of embeddings; assumed to be (N, D) with
        N = number of users and D = embedding dimension
    :return: whatever ``cosine_similarity(embeddings)`` returns — expected to be
        an (N, N) matrix whose entry (i, j) is the similarity of rows i and j
    """
    # Single-argument call: every row is compared against every row of the same input.
    return cosine_similarity(embeddings)

# Generate Similarity Matrix

In [4]:
# Pairwise cosine similarity between all user embeddings.
# (Only computed here; writing it to 'similarity_matrix.npy' happens in a later cell.)
similarity_matrix = generate_similarity_matrix(user_embedding)

# Users are identified by their row position in the embedding array: 0 .. N-1
user_id = range(user_embedding.shape[0])

# Wrap the matrix in a DataFrame labelled by user id on both axes
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=user_id,
    columns=user_id,
)

In [5]:
# Show similarity scores between users: rows and columns are both labelled by
# user id, so cell (i, j) is the score between user i and user j.
# The rendered output below is an abbreviated view ("..." marks omitted rows/columns).
similarity_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,59936,59937,59938,59939,59940,59941,59942,59943,59944,59945
0,1.000000,0.402862,-0.056881,0.744822,-0.047582,0.092371,-0.301606,-0.252902,0.244712,-0.422904,...,-0.419586,0.351081,0.203758,-0.089630,-0.512311,-0.603729,0.881341,0.190992,0.884640,-0.086727
1,0.402862,1.000000,0.670489,0.251320,0.316383,0.394935,0.275800,0.265435,0.174826,0.435308,...,-0.190408,0.733489,0.785232,0.724801,0.024253,0.365281,0.475036,0.785896,0.563800,0.655437
2,-0.056881,0.670489,1.000000,-0.104231,0.398978,0.330423,0.484169,0.452677,0.016824,0.703009,...,0.046599,0.648937,0.703182,0.766781,0.310866,0.669814,0.038316,0.722044,0.126855,0.727102
3,0.744822,0.251320,-0.104231,1.000000,0.218367,0.224549,-0.119834,-0.024107,0.451047,-0.305540,...,-0.149203,0.285685,-0.026792,-0.248943,-0.252008,-0.562506,0.754436,0.008599,0.678194,-0.155555
4,-0.047582,0.316383,0.398978,0.218367,1.000000,0.669479,0.760354,0.813769,0.619304,0.650322,...,0.672918,0.500107,0.004472,0.088809,0.725776,0.290627,0.187130,0.077459,0.076768,0.283604
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59941,-0.603729,0.365281,0.669814,-0.562506,0.290627,0.197476,0.564641,0.486804,-0.179641,0.799171,...,0.230997,0.268278,0.533709,0.762184,0.525000,1.000000,-0.513655,0.531318,-0.381359,0.696514
59942,0.881341,0.475036,0.038316,0.754436,0.187130,0.273466,-0.109767,-0.020456,0.379317,-0.215471,...,-0.230865,0.445759,0.211063,-0.064792,-0.304576,-0.513655,1.000000,0.223447,0.843424,-0.023088
59943,0.190992,0.785896,0.722044,0.008599,0.077459,0.130709,0.150237,0.135439,-0.196956,0.402727,...,-0.343759,0.539456,0.905891,0.867957,-0.064439,0.531318,0.223447,1.000000,0.379961,0.673854
59944,0.884640,0.563800,0.126855,0.678194,0.076768,0.197924,-0.171570,-0.148365,0.215013,-0.225393,...,-0.369054,0.503057,0.376140,0.122412,-0.403015,-0.381359,0.843424,0.379961,1.000000,0.122001


# Save Similarity Matrix as .npy

In [6]:
# Persist the raw similarity array (not the DataFrame) in NumPy's .npy format
np.save(file='similarity_matrix.npy', arr=similarity_matrix)

In [None]:
# Check that the similarity matrix was saved correctly by comparing the file on
# disk with the in-memory array element by element.
# mmap_mode="r" maps the file read-only instead of loading a second full copy of
# the (very large, N x N) matrix into RAM; the comparison result is unchanged.
(np.load("similarity_matrix.npy", mmap_mode="r") == similarity_matrix).all()

np.True_