In [1]:
import pandas as pd
from tqdm import tqdm
import numpy as np
import umap
import plotly.express as px

In [2]:
from sentence_transformers import SentenceTransformer
# Instantiate a SentenceTransformer from the 'all-mpnet-base-v2' checkpoint name.
# NOTE(review): `model` is not referenced by any later cell visible in this
# notebook — confirm it is still needed before paying the load cost.
model = SentenceTransformer('all-mpnet-base-v2')

In [3]:
# Load the movie table (it carries the overview embeddings) and the CSV that
# supplies the titles.
movies = pd.read_excel('data/movies_df.xlsx')
movies_csv = pd.read_csv('data/movies.csv', encoding='latin-1')

# Left-join the CSV columns onto `movies` by movieId. Later cells pair
# `movies_with_titles["title"]` positionally with embeddings taken from
# `movies`, so the join must not add rows: validate='many_to_one' makes pandas
# raise MergeError if movieId is duplicated in movies_csv instead of silently
# duplicating rows and shifting the titles.
movies_with_titles = movies.merge(
    movies_csv, on='movieId', how='left', validate='many_to_one'
)

In [4]:
# Titles as a plain Python list, in the row order of the merged frame.
movie_titles = movies_with_titles["title"].tolist()

In [5]:
# Preview the first three rows of the movie table.
movies.head(n=3)

Unnamed: 0,movieId,budget,original_language,popularity,revenue,runtime,vote_average,vote_count,overview,tags,overview_embeddings
0,1,30000000,en,102.775,394400000,81,7.97,17264,"Led by Woody, Andy's toys live happily in his ...","['martial arts', 'jealousy', 'friendship', 'bu...","[0.056440137, 0.059867144, -0.010645777, 0.024..."
1,2,65000000,en,15.21,262821940,104,7.238,9887,When siblings Judy and Peter discover an encha...,"['giant insect', 'board game', 'jungle', 'disa...","[0.04063326, -0.012308196, -0.048614927, 0.051..."
2,3,25000000,en,12.835,71500000,101,6.494,350,A family wedding reignites the ancient feud be...,"['fishing', 'halloween', 'sequel', 'old man', ...","[-0.013074059, 0.025070313, -0.02172346, -0.00..."


In [6]:
# Pull out the column holding the stored overview embeddings.
movie_embeddings = movies.loc[:, "overview_embeddings"]

In [7]:
# Each cell of the column is the text form of a list, e.g. "[0.05, 0.06, ...]".
# Strip the surrounding brackets, split on ", " and convert every token to a
# float, so downstream code receives a numeric 2-D array (one row per movie)
# instead of nested lists of strings that it would have to coerce implicitly.
# np.array raises if the rows do not all have the same length.
movie_embeddings = np.array(
    [
        [float(token) for token in entry[1:-1].split(", ")]
        for entry in movie_embeddings.to_numpy()
    ],
    dtype=np.float32,
)

In [8]:
# Sanity check: number of components in the first parsed embedding.
len(movie_embeddings[0])

768

In [11]:
# UMAP is stochastic; a fixed seed makes the reduced embeddings reproducible
# across kernel restarts.
UMAP_SEED = 42

# Reduce to 15 components for the actual downstream task.
reducer15 = umap.UMAP(n_components=15, n_neighbors=15, random_state=UMAP_SEED)
umap_embeddings_15 = reducer15.fit_transform(movie_embeddings)

# Also reduce to 3 components so the embeddings can be visualised.
# The reducer has to exist before it is fitted, so it is created first.
reducer3 = umap.UMAP(n_components=3, n_neighbors=15, random_state=UMAP_SEED)
umap_embeddings_3 = reducer3.fit_transform(movie_embeddings)


In [13]:
# Print the shape of each reduced array (15-component first, then 3-component).
for reduced in (umap_embeddings_15, umap_embeddings_3):
    print(reduced.shape)

(9687, 15)
(9687, 3)


In [16]:
# Store each reduced embedding as a list of built-in Python floats.
# ndarray.tolist() converts the numpy scalars to plain floats, so if the
# column is later serialised as text it reads "[1.5, 2.0, ...]" regardless of
# how the installed numpy version prints its scalar types.
embeddings_reduced = umap_embeddings_15.tolist()

# NOTE: this replaces the embeddings previously held in this column of `movies`.
movies["overview_embeddings"] = embeddings_reduced

In [18]:
# Sanity check: the stored embedding for row label 0 now has the reduced length.
len(movies["overview_embeddings"][0])

15

In [19]:
# Write the movie table, with its reduced embeddings column, to an Excel file
# in the current working directory.
# NOTE(review): to_excel writes the row index as an extra leading column by
# default — confirm downstream readers expect that column.
movies.to_excel("movies_df_reduced_embeddings.xlsx")

In [22]:
# Inspect the 3-component projection; the array repr elides the middle rows.
umap_embeddings_3

array([[11.437242  ,  8.5169525 ,  5.2561283 ],
       [ 8.658722  ,  9.364742  ,  2.3766122 ],
       [ 9.4873085 ,  7.9040766 ,  0.9036212 ],
       ...,
       [ 8.562596  ,  9.564933  ,  0.67334026],
       [ 7.569921  ,  7.978973  ,  2.070473  ],
       [12.384399  ,  7.586201  ,  5.471021  ]], dtype=float32)

In [24]:
# Assemble the plotting frame: one row per movie with its 3-component UMAP
# coordinates and its title. plotly.express is already imported as `px` in the
# notebook's first cell, so it is not re-imported here.
plot_df = pd.DataFrame(umap_embeddings_3, columns=["x", "y", "z"])
plot_df["title"] = movie_titles
plot_df

Unnamed: 0,x,y,z,title
0,11.437242,8.516953,5.256128,Toy Story (1995)
1,8.658722,9.364742,2.376612,Jumanji (1995)
2,9.487309,7.904077,0.903621,Grumpier Old Men (1995)
3,9.290467,10.275407,0.153749,Waiting to Exhale (1995)
4,11.547864,7.530029,10.356348,Father of the Bride Part II (1995)
...,...,...,...,...
9682,12.208331,7.581473,10.382408,Black Butler: Book of the Atlantic (2017)
9683,11.958322,7.498209,9.987509,No Game No Life: Zero (2017)
9684,8.562596,9.564933,0.673340,Flint (2017)
9685,7.569921,7.978973,2.070473,Bungo Stray Dogs: Dead Apple (2018)


In [25]:
# 3-D scatter of the UMAP projection; the titles are attached as custom data
# so the hover label can show them.
hover_label = "Title: %{customdata[0]}"
fig = px.scatter_3d(
    plot_df,
    x="x",
    y="y",
    z="z",
    opacity=0.5,
    custom_data=[movie_titles],
)

# update_traces returns the figure, so leaving it as the last expression
# renders the plot.
fig.update_traces(hovertemplate=hover_label)


In [28]:

# Same 3-D scatter, built directly from the coordinate columns of the array.
# `umap_embeddings` is not defined in any visible cell; the 3-component
# projection is `umap_embeddings_3`.
fig = px.scatter_3d(
    x=umap_embeddings_3[:, 0],
    y=umap_embeddings_3[:, 1],
    z=umap_embeddings_3[:, 2],
    opacity=0.5,
    custom_data=[movie_titles],
)

