In [295]:
import json
import os
import datetime
import math
import numpy as np
import pandas as pd
import psycopg2
from upstash_redis import Redis
from apscheduler.schedulers.blocking import BlockingScheduler
from sklearn.metrics.pairwise import cosine_similarity

# Load environment variables
from dotenv import load_dotenv
load_dotenv(".env.py")

# ----------------------------------------------------------------
# 1️⃣ Database & Redis
# ----------------------------------------------------------------

# Connect to PostgreSQL
conn = psycopg2.connect(os.getenv("DATABASE_URL"))
cursor = conn.cursor()

In [296]:
def rank_by_cosine_similarity(target_vector, comparison_vectors):
    """
    Ranks comparison vectors by their cosine similarity to a target vector.
    Outputs the 
    
    Parameters:
    target_vector: array-like of shape (1, n_features) --The reference vector for comparison
    comparison_vectors: array-like of shape (1:video_id + n_samples, n_features) --The vectors to compare against the target
        
    Returns:
    list of tuples: (index, similarity_score) sorted by similarity in descending order
    """
    # Cast input to arrays for easy manipulation
    target_vector = np.array(target_vector).reshape(1, -1) # cast as array with 1 row and any numbrer of columns (-1)
    # extract video_id
    video_ids = comparison_vectors
    comparison_vectors = np.array(comparison_vectors)
    
    # Calculate cosine similarities
    # target_vector: shape (1,768)
    # comparison_vector: shape (#videos,768)
    # output: shape (1, #videos)
    similarities = cosine_similarity(target_vector, comparison_vectors)[0] # one row per row of the target
    print("similarities", similarities)

    map_video_score = zip(video_ids, similarities)
    print("map_video_score", map_video_score)

    ranked_similarities = sorted(map_video_score, key = lambda x: x[1], reverse=True)
    print("ranked_similarities", ranked_similarities)

    return ranked_similarities

In [297]:
def get_similarity_scores(target_vector, comparison_vectors):
  """
  gets cosine similarity to a target vector
  
  Parameters:
  user_embedding: array-like of shape (1, n_features) --The reference vector for comparison
  videos_embedding: array-like of shape (1:video_id + n_samples, n_features) --The vectors to compare against the target
      
  Returns:
  list of tuples: (video_id, similarity_score)
  """
  # print("LOG INPUT 1", target_vector)
  # print("LOG INPUT 2", comparison_vectors)

  # Cast input to arrays for easy manipulation
  target_vector_json = json.loads(target_vector)
  target_vector_arr = np.array(target_vector_json).reshape(1, -1) # cast as array with 1 row and any numbrer of columns (-1)

  # extract video_id
  video_ids = [comp[0] for comp in comparison_vectors]

  # extract videos embeddings
  comparison_vectors_json = [json.loads(comp[1]) for comp in comparison_vectors]
  comparison_vectors_arr = np.array(comparison_vectors_json)

  # Calculate cosine similarities
  # target_vector: shape (1,768)
  # comparison_vector: shape (#videos,768)
  # output: shape (1, #videos)
  # The 'X' parameter of cosine_similarity must be an array-like or a sparse matrix.
  # parameters must be json objecrts cast into array
  similarities = cosine_similarity(target_vector_arr, comparison_vectors_arr)[0] # one row per row of the target
  # print("similarities", similarities)

  map_video_score = list(zip(video_ids, similarities))

  # print("ranked_similarities", ranked_similarities)

  return map_video_score

# for each user: rank cosine similarity with existing videos --videos.embeddings
# for(id in user_ids):
map_user_ranked_similarities = {}
# user_ids: [(id, embedding)]
for user_id, user_embedding in user_ids:
    # map video_id to scores
    map_video_scores = get_similarity_scores(user_embedding, videos_embedding)
    # rank scores
    ranked_scores = sorted(map_video_scores, key = lambda x: x[1], reverse=True)
    # returned ranked list
    map_user_ranked_similarities[user_id] = ranked_scores

    print(f"Similarity scores for user {user_id}: {[f'{tup[1]:.2f}' for tup in map_user_ranked_similarities[user_id]]}")

Similarity scores for user 4e5ea187-ae0e-42e4-997c-67ddb1b28270: ['0.60', '0.59', '0.46', '0.45', '0.45', '0.45', '0.45', '0.44']
Similarity scores for user 6fb5a42d-8116-4e60-81d2-b62210f245b3: ['0.85', '0.74', '0.45', '0.42', '0.41', '0.40', '0.40', '0.39']
Similarity scores for user 0c7fda0a-078b-4bcd-a595-21ec1cfe9ea0: ['0.63', '0.62', '0.61', '0.60', '0.59', '0.47', '0.41', '0.40']
Similarity scores for user e40cf1de-5ad0-4e2c-8e5e-d35e062064f8: ['0.85', '0.74', '0.45', '0.42', '0.41', '0.40', '0.40', '0.39']


In [298]:
# use json.lodas in a stringified json! '[....]'

# get users and profile embeddings
cursor.execute("SELECT id, embeddings FROM users")
user_ids =  cursor.fetchall()

# get all video embeddings
cursor.execute("SELECT id, embeddings FROM videos WHERE embeddings IS NOT NULL")
videos_embedding = cursor.fetchall() # (#videos, #embeddings + 1)

# get test user
test_user_emb = user_ids[0][1] # stringified json
test_user_emb_json = json.loads(test_user_emb)
test_user_emb_arr = np.array(test_user_emb_json).reshape(1,-1) # load json and turn into an array

# get test video
test_videos_ids = [ve[0] for ve in videos_embedding][0]

# get test video embedding
# test_videos_embs = np.array([json.loads(ve[1]) for ve in videos_embedding]) # Convert to array of embeddings
test_videos_embs = [ve[1] for ve in videos_embedding] # [stringified json]
test_videos_embs_json = [json.loads(ve[1]) for ve in videos_embedding] # [json]
test_videos_embs_arr = np.array(test_videos_embs_json)

# The 'X' parameter of cosine_similarity must be an array-like or a sparse matrix.
# poarameters must be json objecrts cast into array
cosine_similarity(test_user_emb_arr, test_videos_embs_arr)

array([[0.4513698 , 0.59249111, 0.46373789, 0.43788369, 0.60124828,
        0.45461222, 0.44625027, 0.44931188]])

In [299]:
import json

# get users and profile embeddings
cursor.execute("SELECT id, embeddings FROM users")
user_ids =  cursor.fetchall()

# for each user: rank cosine similarity with existing videos --videos.embeddings
# for(id in user_ids):
map_user_ranked_similarities = {}
for user_id, embeddings in user_ids:
    cursor.execute("SELECT id, embeddings FROM videos")
    video_embeddings = cursor.fetchall()
    [json.loads(video[1]) for video in video_ids if video[1] is not None]
    print("VEMB", len(video_embeddings), [len(video_embeddings[i]) for i in np.arange(1,8,1)])
    map_user_ranked_similarities[user_id] = rank_by_cosine_similarity(json.loads(embeddings), video_embeddings)
    print(f"Recommendations for user {user_id}: {map_user_ranked_similarities[user_id]}")

VEMB 10 [2, 2, 2, 2, 2, 2, 2]


ValueError: could not convert string to float: '67dbacfa-6685-45da-96ee-0709e796af63'

In [61]:
map_user_ranked_similarities

{'4e5ea187-ae0e-42e4-997c-67ddb1b28270': [(4, np.float64(0.601248277626046)),
  (1, np.float64(0.5924911087881568)),
  (2, np.float64(0.4637378879339542)),
  (5, np.float64(0.45461221852408396)),
  (0, np.float64(0.4513697983750683)),
  (7, np.float64(0.4493118782720349)),
  (6, np.float64(0.4462502749271937)),
  (3, np.float64(0.4378836914040903))],
 '6fb5a42d-8116-4e60-81d2-b62210f245b3': [(1, np.float64(0.8466133985740051)),
  (4, np.float64(0.7378676681956524)),
  (5, np.float64(0.45434188314640883)),
  (3, np.float64(0.42354532664698324)),
  (2, np.float64(0.4074677863587103)),
  (6, np.float64(0.40332055528695165)),
  (0, np.float64(0.4011985006337363)),
  (7, np.float64(0.3908751415885182))],
 '0c7fda0a-078b-4bcd-a595-21ec1cfe9ea0': [(0, np.float64(0.6254746761975318)),
  (4, np.float64(0.6150461009496336)),
  (7, np.float64(0.6061571771667127)),
  (1, np.float64(0.5984827914142251)),
  (6, np.float64(0.590334801804286)),
  (5, np.float64(0.4718988899587716)),
  (2, np.float64(0