In [53]:
import pandas as pd
import os

In [54]:
# Load the exported Movie table; the path is resolved relative to the notebook's working directory.
movies_csv_path = os.path.join("../../db_tables", "Movie_rows.csv")
movie_df = pd.read_csv(movies_csv_path)

In [55]:
# Number of most-voted movies kept for vectorization.
TOP_N_MOVIES = 5000

# Rank by vote count (highest first), renumber the index from 0 so labels equal
# positions, and keep the first TOP_N_MOVIES rows.
# .copy() makes the result an independent frame: later cells assign columns into
# movie_df, and that should not be done on a slice of the sorted table.
movie_df = (
    movie_df
    .sort_values(by="voteCount", ascending=False)
    .reset_index(drop=True)
    .head(TOP_N_MOVIES)
    .copy()
)
movie_df

Unnamed: 0,id,title,originalTitle,overview,tagline,status,releaseDate,runtime,budget,revenue,adult,homepage,imdbId,popularity,voteAverage,voteCount,posterPath,backdropPath
0,27205,Inception,Inception,"Cobb, a skilled thief who commits corporate es...",Your mind is the scene of the crime.,Released,2010-07-15 00:00:00,148,160000000,825532764,False,https://www.warnerbros.com/movies/inception,tt1375666,83.952,8.364,34495,/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg,/8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg
1,157336,Interstellar,Interstellar,The adventures of a group of explorers who mak...,Mankind was born on Earth. It was never meant ...,Released,2014-11-05 00:00:00,169,165000000,701729206,False,http://www.interstellarmovie.net/,tt0816692,140.241,8.417,32571,/gEU2QniE6E77NI6lCU6MxlNBvIx.jpg,/pbrkL804c8yAv3zBZR4QPEafpAR.jpg
2,155,The Dark Knight,The Dark Knight,Batman raises the stakes in his war on crime. ...,Welcome to a world without rules.,Released,2008-07-16 00:00:00,152,185000000,1004558444,False,https://www.warnerbros.com/movies/dark-knight/,tt0468569,130.643,8.512,30619,/qJ2tW6WMUDux911r6m7haRef0WH.jpg,/nMKdUUepR0i5zn0y1T4CsSB5chy.jpg
3,19995,Avatar,Avatar,"In the 22nd century, a paraplegic Marine is di...",Enter the world of Pandora.,Released,2009-12-15 00:00:00,162,237000000,2923706026,False,https://www.avatar.com/movies/avatar,tt0499549,79.932,7.573,29815,/kyeqWdyUXW608qlYkRqosgbbJyK.jpg,/vL5LR6WdxWPjLPFRLe133jXWsh5.jpg
4,24428,The Avengers,The Avengers,When an unexpected enemy emerges and threatens...,Some assembly required.,Released,2012-04-25 00:00:00,143,220000000,1518815515,False,https://www.marvel.com/movies/the-avengers,tt0848228,98.082,7.710,29166,/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg,/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,175574,Free Birds,Free Birds,Two turkeys from opposite sides of the tracks ...,Hang On To Your Nuggets,Released,2013-10-30 00:00:00,91,55000000,110000000,False,,tt1621039,17.418,5.862,714,/gnSU2wUBq2gTkBEkxY8C1d1fXAQ.jpg,/bYCYL5mVX4unStJy34wWuofVq2O.jpg
4996,4787,Cassandra's Dream,Cassandra's Dream,The tale of two brothers with serious financia...,Family is family. Blood is blood.,Released,2007-06-18 00:00:00,108,0,22687639,False,http://www.cassandrasdreammovie.com/,tt0795493,13.531,6.244,714,/hydGWxW9VvRXMwpwFacl7XVtoeR.jpg,/5Xb2e63jvhJqL6IUO2SqtuRIJRG.jpg
4997,659959,Summer of 85,Été 85,What do you dream of when you're 16-years-old ...,,Released,2020-07-14 00:00:00,101,6900000,3600000,False,,tt10457128,12.932,7.408,714,/rkJKDC5gYrPXQx9IqNaUhSb3beC.jpg,/oq1AkrBDj8hRVJqC2NDFT5R1NEa.jpg
4998,1040,The Leopard,Il gattopardo,As Garibaldi's troops begin the unification of...,Luchino Visconti's enduring romantic adventure,Released,1963-03-28 00:00:00,186,0,0,False,,tt0057091,13.924,7.672,714,/riSUxwoK3xjkOgy6YJSvPhi7cO6.jpg,/myikoqu8Z2gtE7TaBgk9r6fB9MF.jpg


In [56]:
from sklearn.feature_extraction.text import TfidfVectorizer 
from scipy import sparse
import io
import base64
import json

In [57]:
# overview TF-IDF

# Replace missing overviews with empty strings so no NaN reaches the vectorizer.
movie_df["overview"] = movie_df["overview"].fillna("")
tfidf = TfidfVectorizer(stop_words='english')

# One row per movie, in movie_df's row order.
overview_tfidf_matrix = tfidf.fit_transform(movie_df["overview"])

# save_npz does not create missing parent directories, so make sure the output
# directory exists before the first write (a no-op when it is already there).
os.makedirs("../../movie_vectors", exist_ok=True)
sparse.save_npz("../../movie_vectors/overview_tfidf.npz", overview_tfidf_matrix)

In [58]:
# Serialize each movie's overview vector on its own so it can be stored next to
# the movie ID.

# Read the ID column once; indexing movie_df.iloc[i] inside the loop would build
# a full row Series per movie just to extract one value.
movie_ids = movie_df["id"].to_numpy()
# Matrix rows are paired with IDs by position, so the counts must agree.
assert len(movie_ids) == overview_tfidf_matrix.shape[0]

movie_vectors = []
for i, movie_id in enumerate(movie_ids):
    # Write the single-row sparse matrix in .npz format into memory.
    with io.BytesIO() as buffer:
        sparse.save_npz(buffer, overview_tfidf_matrix[i])
        binary_data = buffer.getvalue()

    movie_vectors.append({
        "movieId": int(movie_id),    # plain int so json can serialize it
        "overview": binary_data      # bytes ready for bytea
    })

# JSON cannot hold raw bytes, so each payload is base64-encoded into text.
json_data = [
    {"movieId": mv["movieId"], "overview": base64.b64encode(mv["overview"]).decode()}
    for mv in movie_vectors
]

# Save to JSON
with open("../../movie_vectors/overview.json", "w") as f:
    json.dump(json_data, f)

In [59]:
# tagline TF-IDF

# Movies without a tagline get an empty string instead of NaN.
tagline_text = movie_df["tagline"].fillna("")
movie_df["tagline"] = tagline_text

tfidf = TfidfVectorizer(stop_words='english')
tagline_tfidf_matrix = tfidf.fit_transform(tagline_text)

# Persist the whole matrix (one row per movie) as a single .npz file.
sparse.save_npz("../../movie_vectors/tagline_tfidf.npz", tagline_tfidf_matrix)

In [60]:
# Serialize each movie's tagline vector on its own so it can be stored next to
# the movie ID.

# Read the ID column once; indexing movie_df.iloc[i] inside the loop would build
# a full row Series per movie just to extract one value.
movie_ids = movie_df["id"].to_numpy()
# Matrix rows are paired with IDs by position, so the counts must agree.
assert len(movie_ids) == tagline_tfidf_matrix.shape[0]

movie_vectors = []
for i, movie_id in enumerate(movie_ids):
    # Write the single-row sparse matrix in .npz format into memory.
    with io.BytesIO() as buffer:
        sparse.save_npz(buffer, tagline_tfidf_matrix[i])
        binary_data = buffer.getvalue()

    movie_vectors.append({
        "movieId": int(movie_id),    # plain int so json can serialize it
        "tagline": binary_data       # bytes ready for bytea
    })

# JSON cannot hold raw bytes, so each payload is base64-encoded into text.
json_data = [
    {"movieId": mv["movieId"], "tagline": base64.b64encode(mv["tagline"]).decode()}
    for mv in movie_vectors
]

# Save to JSON
with open("../../movie_vectors/tagline.json", "w") as f:
    json.dump(json_data, f)

In [61]:
# Inspect the tagline matrix: its repr reports the sparse format, dtype,
# number of stored elements and shape.
tagline_tfidf_matrix

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 15070 stored elements and shape (5000, 3898)>

In [None]:
# keywords TF-IDF

keyword_df = pd.read_csv(os.path.join("../../db_tables", "Keyword_rows.csv"))
movie_keyword_df = pd.read_csv(os.path.join("../../db_tables", "MovieKeyword_rows.csv"))

# Keep only the movie-keyword links that belong to movies present in movie_df.
top_movie_ids = set(movie_df["id"])
movie_keyword_df_filtered = movie_keyword_df[movie_keyword_df["movieId"].isin(top_movie_ids)]

# Attach the keyword name to every link (link.keywordId == keyword.id).
mk = movie_keyword_df_filtered.merge(
    keyword_df,
    left_on="keywordId",
    right_on="id",
    how="inner"
)

# Collapse to one space-separated keyword string per movie.
# Rows with a missing name are dropped first: str.join raises TypeError on NaN,
# and read_csv parses names such as "NA" or "null" as missing by default.
movie_keywords_text = (
    mk.dropna(subset=["name"])
      .groupby("movieId")["name"]
      .agg(" ".join)
      .reset_index()
      .rename(columns={"name": "keywords"})
)

# Left-join onto the movie list so every movie in movie_df gets exactly one row,
# in movie_df's order; movies without keywords come out with NaN here.
movie_keywords_text = (
    movie_df[["id"]]
    .rename(columns={"id": "movieId"})
    .merge(movie_keywords_text, on="movieId", how="left")
)

# Movies without keywords get an empty string; everything is lower-cased.
movie_keywords_text["keywords"] = movie_keywords_text["keywords"].fillna("").str.lower()

movie_keywords_text

Unnamed: 0,movieId,keywords
0,27205,california airplane france kidnapping architec...
1,157336,space space station time warp family relations...
2,155,crime fighter district attorney chaos super po...
3,19995,space soldier futuristic nature alien space co...
4,24428,aftercreditsstinger alien invasion superhero t...
...,...,...
4995,175574,freedom holiday duringcreditsstinger thanksgiving
4996,4787,parent child relationship money love triangle ...
4997,659959,france boys' love (bl) summer job based on nov...
4998,1040,decadence pastor country estate monarchy paler...


In [63]:
# Vectorize the per-movie keyword strings.
tfidf = TfidfVectorizer(
    stop_words="english",
    # Accepts one-character tokens as well; scikit-learn's default pattern
    # requires at least two word characters.
    token_pattern=r"(?u)\b\w+\b"  # good for short keywords
)

keywords_tfidf_matrix = tfidf.fit_transform(movie_keywords_text["keywords"])

sparse.save_npz("../../movie_vectors/keywords_tfidf.npz", keywords_tfidf_matrix)

# Read the ID column once; indexing movie_keywords_text.iloc[i] inside the loop
# would build a full row Series per movie just to extract one value.
movie_ids = movie_keywords_text["movieId"].to_numpy()
# Matrix rows are paired with IDs by position, so the counts must agree.
assert len(movie_ids) == keywords_tfidf_matrix.shape[0]

movie_vectors = []
for i, movie_id in enumerate(movie_ids):
    # Write the single-row sparse matrix in .npz format into memory.
    with io.BytesIO() as buffer:
        sparse.save_npz(buffer, keywords_tfidf_matrix[i])
        binary_data = buffer.getvalue()

    movie_vectors.append({
        "movieId": int(movie_id),    # plain int so json can serialize it
        "keywords": binary_data      # raw .npz bytes
    })

# JSON cannot hold raw bytes, so each payload is base64-encoded into text.
json_data = [
    {"movieId": mv["movieId"], "keywords": base64.b64encode(mv["keywords"]).decode()}
    for mv in movie_vectors
]

with open("../../movie_vectors/keywords.json", "w") as f:
    json.dump(json_data, f)

In [64]:
# Inspect the keywords matrix: its repr reports the sparse format, dtype,
# number of stored elements and shape.
keywords_tfidf_matrix

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 83227 stored elements and shape (5000, 8403)>

In [None]:
# NOTE(review): pandas and os are already imported in the notebook's first cell;
# repeating the imports here is harmless but redundant.
import pandas as pd
import os

# Load the Person and MovieCast tables.
# NOTE(review): neither frame is used anywhere else in this cell — presumably the
# cast pipeline is still to be written; confirm.
person_df = pd.read_csv(os.path.join("../../db_tables", "Person_rows.csv"))
movie_cast_df = pd.read_csv(os.path.join("../../db_tables", "MovieCast_rows.csv"))

# NOTE(review): everything below repeats the keywords text preparation and reads
# movie_df, movie_keyword_df and keyword_df, which this cell does not define —
# they must already exist from earlier cells.

# Keep only the movie-keyword links that belong to movies present in movie_df.
top_movie_ids = set(movie_df["id"])
movie_keyword_df_filtered = movie_keyword_df[movie_keyword_df["movieId"].isin(top_movie_ids)]
movie_keyword_df_filtered  # no visible effect: only a cell's last expression is displayed

# Attach the keyword name to every link (link.keywordId == keyword.id).
mk = movie_keyword_df_filtered.merge(
    keyword_df,
    left_on="keywordId",
    right_on="id",
    how="inner"
)

# Collapse to one space-separated keyword string per movie.
movie_keywords_text = (
    mk.groupby("movieId")["name"]
      .apply(lambda x: " ".join(x))
      .reset_index()
      .rename(columns={"name": "keywords"})
)

# Left-join onto the movie list so every movie in movie_df keeps a row.
movie_keywords_text = movie_df[["id"]].merge(
    movie_keywords_text,
    left_on="id",
    right_on="movieId",
    how="left"
)

# Movies without keywords get an empty string; everything is lower-cased.
movie_keywords_text["keywords"] = movie_keywords_text["keywords"].fillna("").str.lower()
# Drop the right-hand join key and expose the movie's own id under the name "movieId".
movie_keywords_text = movie_keywords_text.drop(columns=["movieId"])
movie_keywords_text = movie_keywords_text.rename(columns={"id": "movieId"}) 

movie_keywords_text