In [2]:
!pip install faiss-cpu pandas pyarrow huggingface_hub



In [3]:
from huggingface_hub import snapshot_download

# Download every file of the dataset repository into ./data_full.
# `local_dir_use_symlinks` is intentionally not passed: it is deprecated in
# recent huggingface_hub releases, where files are always written as real
# files under `local_dir`.
# The call is left as the last expression so the cell shows the local path.
snapshot_download(
    repo_id="abhijit26/movie-vector-galaxy",
    repo_type="dataset",
    local_dir="data_full",
)



Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

'/content/data_full'

In [4]:
import os
import pandas as pd
import numpy as np
import faiss

In [5]:
# Directory the dataset snapshot was downloaded into (relative to the CWD).
DATA_DIR = "data_full"

# List the downloaded files to confirm what is available.
os.listdir(DATA_DIR)

['metadata.parquet',
 '.gitattributes',
 'faiss_index.faiss',
 'embeddings.npy',
 '.cache']

## Metadata - Movie Information

In [None]:
# Load the movie metadata table from the downloaded parquet file.
meta = pd.read_parquet(f"{DATA_DIR}/metadata.parquet")

In [None]:
# Print the (rows, columns) shape, then display the column names as a list.
print(f"meta shape {meta.shape}")
meta.columns.tolist()

In [None]:
# Peek at the first three rows.
meta.head(3)

In [None]:
# Column dtypes and non-null counts; memory_usage="deep" also measures the
# actual size of object (e.g. string) columns rather than estimating it.
meta.info(memory_usage="deep")

In [None]:
# Missing values per column, most-missing first.
# NOTE(review): 18 presumably equals the number of columns (the dev subset
# reports 18 columns), so this shows every column — confirm.
meta.isnull().sum().sort_values(ascending=False).head(18)

In [None]:
# Summary statistics for every column (numeric and non-numeric alike).
meta.describe(include="all")

## Embeddings - Store semantic meaning

In [None]:
# Load the full embedding matrix into memory.
emb = np.load(f"{DATA_DIR}/embeddings.npy")

In [None]:
# Array dimensions — presumably (number of movies, embedding dimension); confirm against len(meta).
emb.shape

In [None]:
# Element dtype of the stored embeddings.
emb.dtype

In [None]:
# In-memory size of the embedding array in GiB (bytes / 1024**3).
emb.nbytes / (1024**3)

In [None]:
# First ten components of the first embedding vector.
emb[0][:10]

In [None]:
# L2 norm of the first embedding vector.
# NOTE(review): a value close to 1.0 would suggest the vectors are
# unit-normalized, in which case inner product equals cosine similarity — confirm.
np.linalg.norm(emb[0])

## FAISS index - fast search
This file is built from the embeddings in `embeddings.npy`.
The loaded index is of type `faiss.swigfaiss_avx2.IndexFlatIP`, i.e. a flat (exact, brute-force) inner-product index.

In [None]:
# Load the prebuilt FAISS index from disk.
index = faiss.read_index(f"{DATA_DIR}/faiss_index.faiss")

In [None]:
# Concrete FAISS index class that was deserialized.
type(index)

In [None]:
# Number of vectors stored in the index — should match emb.shape[0]; confirm.
index.ntotal

In [None]:
# Vector dimensionality the index expects — should match emb.shape[1]; confirm.
index.d

In [None]:
# Row position of the movie used as the query for a quick sanity check.
movie_idx = 100

# search() takes a 2-D batch of queries, hence reshape(1, -1) for a single
# vector; 5 results are requested. I holds the row positions of the hits;
# D presumably holds the matching inner-product scores (FAISS convention — confirm).
D, I = index.search(emb[movie_idx].reshape(1, -1), 5)

# Map the returned positions back to movie titles.
meta.iloc[I[0]][["title"]]

## Development dataset creation

In [6]:
# Source (full snapshot) and destination (small development subset) folders.
DATA_FULL = "data_full"
DATA_DEV = "data_dev"

# Create the destination folder; exist_ok makes the cell safe to re-run.
os.makedirs(DATA_DEV, exist_ok=True)

meta = pd.read_parquet(f"{DATA_FULL}/metadata.parquet")
# The vectors are needed as float32 downstream. copy=False skips the
# duplicate allocation when the file is already float32 (the array is several
# GB, so an unconditional copy would double peak memory); other dtypes are
# still converted.
emb = np.load(f"{DATA_FULL}/embeddings.npy").astype("float32", copy=False)

In [7]:
# Treat a missing vote count as zero votes so the column can be sorted on.
# Note: this overwrites the column in `meta`; the cell is idempotent on re-run.
meta["imdb_votes"] = meta["imdb_votes"].fillna(0)
# Distribution of vote counts after filling.
meta["imdb_votes"].describe()

Unnamed: 0,imdb_votes
count,1035695.0
mean,1177.696
std,20552.41
min,0.0
25%,0.0
50%,0.0
75%,38.0
max,2982772.0


In [8]:
# Number of movies kept in the development subset.
DEV_SIZE = 20000

# Order all movies from most to fewest votes and keep the first DEV_SIZE rows.
top_meta = meta.sort_values(by="imdb_votes", ascending=False).iloc[:DEV_SIZE]

In [9]:
# to keep embedding align
indices = top_meta.index.to_numpy()
indices.sort()

In [10]:
# Select the chosen rows by position and renumber them 0..len-1 so that row i
# of meta_dev corresponds to the i-th entry of `indices`.
meta_dev = meta.iloc[indices].reset_index(drop=True)

# Persist the dev metadata next to the other dev artifacts.
meta_dev.to_parquet(f"{DATA_DEV}/metadata.parquet")

meta_dev.shape

(20000, 18)

In [11]:
# Pick the embedding rows with the same positions (and order) used for meta_dev.
emb_dev = emb[indices]

# Persist the dev embeddings.
np.save(f"{DATA_DEV}/embeddings.npy", emb_dev)

emb_dev.shape

(20000, 768)

In [12]:
# Embedding dimensionality, taken from the second axis of the dev matrix.
dim = emb_dev.shape[1]

# Build a new inner-product index over the dev vectors only; vectors are
# added in row order, so index id i corresponds to row i of emb_dev.
index_dev = faiss.IndexFlatIP(dim)
index_dev.add(emb_dev)

# Persist the dev index.
faiss.write_index(index_dev, f"{DATA_DEV}/faiss_index.faiss")

In [13]:
# The three dev artifacts must describe the same number of movies.
print(len(meta_dev))
print(emb_dev.shape[0])
print(index_dev.ntotal)

# Enforce the check instead of relying on someone reading the printed numbers.
assert len(meta_dev) == emb_dev.shape[0] == index_dev.ntotal, (
    "dev metadata, embeddings and FAISS index are out of sync"
)

20000
20000
20000


In [14]:
# Row position (within the dev subset) of the movie used as the query.
movie_idx = 100

# Query the dev index with that movie's own vector and request 5 results.
# reshape(1, -1) turns the single vector into a batch of one query.
D, I = index_dev.search(
    emb_dev[movie_idx].reshape(1, -1),
    5
)

# Look up the returned row positions in the dev metadata.
meta_dev.iloc[I[0]][["title", "imdb_votes"]]

Unnamed: 0,title,imdb_votes
100,Breaking the Waves,73056.0
6944,Niagara,20605.0
5308,The Edge of Love,19242.0
7242,Possession,47573.0
8067,Maurice,24774.0


In [15]:
import os
import shutil

# Base name of the archive (".zip" is appended) and the folder to compress.
output_filename = "data_dev"
zip_directory = "data_dev"

# Report a missing folder; otherwise archive its contents and confirm.
if not os.path.exists(zip_directory):
    print(f"Directory '{zip_directory}' not found. Please ensure it exists before zipping.")
else:
    shutil.make_archive(output_filename, 'zip', zip_directory)
    print(f"Successfully created {output_filename}.zip")

Successfully created data_dev.zip


In [None]:
import shutil
import os

# Same archive step as a single expression: archive the folder when it
# exists, otherwise report that it is missing. The conditional expression is
# the last statement, so the cell displays its value (the archive path).
output_filename = "data_dev"
zip_directory = "data_dev"
(
    shutil.make_archive(output_filename, 'zip', zip_directory)
    if os.path.exists(zip_directory)
    else print(f"Directory '{zip_directory}' not found. Please ensure it exists before zipping.")
)

In [16]:
# Folder holding the development subset written by the cells above.
DATA_DEV = "data_dev"

# Reload the three dev artifacts from disk.
# Note: this rebinds `meta`, `emb` and `index`, which earlier cells used for
# the full dataset; from here on they refer to the dev subset.
meta = pd.read_parquet(f"{DATA_DEV}/metadata.parquet")
emb = np.load(f"{DATA_DEV}/embeddings.npy")
index = faiss.read_index(f"{DATA_DEV}/faiss_index.faiss")

print("Loaded:", len(meta))

Loaded: 20000


In [20]:
# Lookup table: lower-cased movie title -> row position in `meta` / `emb`.
#
# Rows are visited from fewest to most votes, so when several movies share a
# title the most-voted one is written last and wins (a stable sort keeps the
# original "later row wins" rule among equal vote counts).
# Non-string titles (e.g. None/NaN) are skipped instead of raising on .lower().
_titles = meta["title"].to_numpy()
_order = np.argsort(meta["imdb_votes"].fillna(0).to_numpy(), kind="stable")

title_to_idx = {
    _titles[pos].lower(): int(pos)
    for pos in _order
    if isinstance(_titles[pos], str)
}

def recommend_similar(movie_title, k=5):
    """Return up to ``k`` movies most similar to ``movie_title``.

    The title is matched case-insensitively against ``title_to_idx``. The
    movie's own embedding is used as the query against the FAISS ``index``.

    Parameters
    ----------
    movie_title : str
        Title of the movie to find neighbours for.
    k : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``title``, ``year`` and ``imdb_votes`` in descending
        similarity order with a fresh 0..n-1 index, or ``None`` (after
        printing a message) when the title is unknown.
    """
    idx = title_to_idx.get(movie_title.lower())
    if idx is None:
        print("Movie not found!")
        return None

    query_vec = emb[idx].reshape(1, -1)

    # Ask for one extra hit because the query movie is normally among the
    # results.
    _, neighbours = index.search(query_vec, k + 1)

    # Drop the query movie by id rather than assuming it is the first hit
    # (ties or unnormalized vectors can rank another movie above it), and
    # drop FAISS's -1 padding ids, which appear when fewer than k + 1 vectors
    # exist and would otherwise select the last row through iloc[-1].
    hits = [int(i) for i in neighbours[0] if i >= 0 and i != idx][:k]

    results = meta.iloc[hits][["title", "year", "imdb_votes"]]

    return results.reset_index(drop=True)

recommend_similar("tenet")

Unnamed: 0,title,year,imdb_votes
0,The Professional,1981,17963.0
1,Surrogates,2009,181793.0
2,Tomorrow Never Dies,1997,208143.0
3,The Fifth Element,1997,519425.0
4,The Matrix,1999,2116264.0
