In [1]:
import os
from dotenv import load_dotenv
from sqlalchemy import create_engine
import pandas as pd

Let's try cosine similarity on the four Avengers movies. All four should give high similarity scores with each other; movies 1 and 2 should be very similar to each other, and so should movies 3 and 4.

In [2]:
# Load the connection settings, then fetch the stored feature vectors for the
# four Avengers movies compared in this section.
load_dotenv()

DATABASE_URL = os.getenv("DATABASE_URL")
if not DATABASE_URL:
    # Fail fast with an actionable message instead of letting
    # create_engine(None) raise a less obvious error.
    raise RuntimeError(
        "DATABASE_URL is not set; define it in the environment or in the .env file"
    )
engine = create_engine(DATABASE_URL)

# The ids are hard-coded integer constants (not user input), so interpolating
# them into the statement is safe; int() guards against accidental edits.
requested_ids = (24428, 99861, 299536, 299534)
id_list = ", ".join(str(int(movie_id)) for movie_id in requested_ids)

# ORDER BY pins the row order: the similarity matrix computed from this frame
# is positional, and SQL gives no ordering guarantee for a bare IN (...) filter.
query = (
    'SELECT "movieId", "combinedVector" FROM "MovieFeatureVector" '
    f'WHERE "movieId" IN ({id_list}) '
    'ORDER BY "movieId"'
)
df = pd.read_sql(query, engine)

# A missing id would otherwise silently produce a smaller similarity matrix.
missing_ids = sorted(set(requested_ids) - set(df["movieId"].tolist()))
assert not missing_ids, f"no feature vector found for movieId(s): {missing_ids}"

print(df)

   movieId                                     combinedVector
0    24428  [b'P', b'K', b'\x03', b'\x04', b'-', b'\x00', ...
1    99861  [b'P', b'K', b'\x03', b'\x04', b'-', b'\x00', ...
2   299534  [b'P', b'K', b'\x03', b'\x04', b'-', b'\x00', ...
3   299536  [b'P', b'K', b'\x03', b'\x04', b'-', b'\x00', ...


In [3]:
import io
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity

# Each "combinedVector" value is a sequence of single-byte chunks (the printed
# frame shows [b'P', b'K', b'\x03', b'\x04', ...], i.e. a ZIP signature, which
# is what an .npz archive starts with). Joining the chunks rebuilds the archive
# so scipy can load it from an in-memory buffer.
sparse_vectors = [
    sparse.load_npz(io.BytesIO(b"".join(byte_chunks)))
    for byte_chunks in df["combinedVector"]
]

# Stack the per-movie matrices so that row i corresponds to row i of `df`.
X = sparse.vstack(sparse_vectors)

similarity_matrix = cosine_similarity(X, dense_output=False)

# Label both axes with the movie ids so each score can be read off directly
# instead of relying on the implicit 0..n-1 row positions.
row_ids = df["movieId"].tolist()
sim_df = pd.DataFrame(similarity_matrix.toarray(), index=row_ids, columns=row_ids)
print(sim_df)

          0         1         2         3
0  1.000000  0.713407  0.583287  0.559976
1  0.713407  1.000000  0.588403  0.571159
2  0.583287  0.588403  1.000000  0.603630
3  0.559976  0.571159  0.603630  1.000000


Let's try cosine similarity on four movies that share some similarities. We will choose Inception, The Dark Knight, Shutter Island, and The Departed.

In [9]:
# Fetch the stored feature vectors for the four "somewhat similar" movies.
# The ids are hard-coded integer constants (not user input), so interpolating
# them into the statement is safe; int() guards against accidental edits.
requested_ids = (27205, 155, 11324, 1422)
id_list = ", ".join(str(int(movie_id)) for movie_id in requested_ids)

# ORDER BY pins the row order: the similarity matrix computed from this frame
# is positional, and SQL gives no ordering guarantee for a bare IN (...) filter.
query = (
    'SELECT "movieId", "combinedVector" FROM "MovieFeatureVector" '
    f'WHERE "movieId" IN ({id_list}) '
    'ORDER BY "movieId"'
)
df = pd.read_sql(query, engine)

# A missing id would otherwise silently produce a smaller similarity matrix.
missing_ids = sorted(set(requested_ids) - set(df["movieId"].tolist()))
assert not missing_ids, f"no feature vector found for movieId(s): {missing_ids}"

print(df)

   movieId                                     combinedVector
0      155  [b'P', b'K', b'\x03', b'\x04', b'-', b'\x00', ...
1     1422  [b'P', b'K', b'\x03', b'\x04', b'-', b'\x00', ...
2    11324  [b'P', b'K', b'\x03', b'\x04', b'-', b'\x00', ...
3    27205  [b'P', b'K', b'\x03', b'\x04', b'-', b'\x00', ...


In [10]:
# Imports are repeated here so the cell also runs on its own.
import io
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity

# Each "combinedVector" value is a sequence of single-byte chunks (the printed
# frame shows [b'P', b'K', b'\x03', b'\x04', ...], i.e. a ZIP signature, which
# is what an .npz archive starts with). Joining the chunks rebuilds the archive
# so scipy can load it from an in-memory buffer.
sparse_vectors = [
    sparse.load_npz(io.BytesIO(b"".join(byte_chunks)))
    for byte_chunks in df["combinedVector"]
]

# Stack the per-movie matrices so that row i corresponds to row i of `df`.
X = sparse.vstack(sparse_vectors)

similarity_matrix = cosine_similarity(X, dense_output=False)

# Label both axes with the movie ids so each score can be read off directly
# instead of relying on the implicit 0..n-1 row positions.
row_ids = df["movieId"].tolist()
sim_df = pd.DataFrame(similarity_matrix.toarray(), index=row_ids, columns=row_ids)
print(sim_df)

          0         1         2         3
0  1.000000  0.452518  0.304732  0.246861
1  0.452518  1.000000  0.413861  0.036247
2  0.304732  0.413861  1.000000  0.037317
3  0.246861  0.036247  0.037317  1.000000


Let's try cosine similarity on four movies that are very dissimilar. We will choose Interstellar, Fight Club, Django Unchained, and The Wolf of Wall Street.

In [11]:
# Fetch the stored feature vectors for the four "very dissimilar" movies.
# The ids are hard-coded integer constants (not user input), so interpolating
# them into the statement is safe; int() guards against accidental edits.
requested_ids = (157336, 550, 68718, 106646)
id_list = ", ".join(str(int(movie_id)) for movie_id in requested_ids)

# ORDER BY pins the row order: the similarity matrix computed from this frame
# is positional, and SQL gives no ordering guarantee for a bare IN (...) filter.
query = (
    'SELECT "movieId", "combinedVector" FROM "MovieFeatureVector" '
    f'WHERE "movieId" IN ({id_list}) '
    'ORDER BY "movieId"'
)
df = pd.read_sql(query, engine)

# A missing id would otherwise silently produce a smaller similarity matrix.
missing_ids = sorted(set(requested_ids) - set(df["movieId"].tolist()))
assert not missing_ids, f"no feature vector found for movieId(s): {missing_ids}"

print(df)

   movieId                                     combinedVector
0      550  [b'P', b'K', b'\x03', b'\x04', b'-', b'\x00', ...
1    68718  [b'P', b'K', b'\x03', b'\x04', b'-', b'\x00', ...
2   106646  [b'P', b'K', b'\x03', b'\x04', b'-', b'\x00', ...
3   157336  [b'P', b'K', b'\x03', b'\x04', b'-', b'\x00', ...


In [12]:
# Imports are repeated here so the cell also runs on its own.
import io
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity

# Each "combinedVector" value is a sequence of single-byte chunks (the printed
# frame shows [b'P', b'K', b'\x03', b'\x04', ...], i.e. a ZIP signature, which
# is what an .npz archive starts with). Joining the chunks rebuilds the archive
# so scipy can load it from an in-memory buffer.
sparse_vectors = [
    sparse.load_npz(io.BytesIO(b"".join(byte_chunks)))
    for byte_chunks in df["combinedVector"]
]

# Stack the per-movie matrices so that row i corresponds to row i of `df`.
X = sparse.vstack(sparse_vectors)

similarity_matrix = cosine_similarity(X, dense_output=False)

# Label both axes with the movie ids so each score can be read off directly
# instead of relying on the implicit 0..n-1 row positions.
row_ids = df["movieId"].tolist()
sim_df = pd.DataFrame(similarity_matrix.toarray(), index=row_ids, columns=row_ids)
print(sim_df)

          0         1         2         3
0  1.000000  0.218660  0.209473  0.205440
1  0.218660  1.000000  0.197161  0.189085
2  0.209473  0.197161  1.000000  0.182555
3  0.205440  0.189085  0.182555  1.000000


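Good: the Avengers movies score highest against each other (about 0.56–0.71), and the deliberately dissimilar set is uniformly low (about 0.18–0.22). One thing to investigate: in the second set, movie 27205 scores only about 0.04 against movies 1422 and 11324, which is lower than any pair in the "very dissimilar" set.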