In [1]:
# !pip install pyarrow -q
# !pip install fastparquet -q

The goal of notebook is as follows:
1. we augment the imdb information with the metadata in movielens-20m dataset from https://grouplens.org/datasets/movielens/
2. we develop embedding of unified imdb meta from all sources.


In [1]:
import pandas as pd
import numpy as np

In [2]:
movies = pd.read_parquet('s3://mlsl-imdb-data/imdb_ml_10k_posters.parquet')

In [3]:
movies.head(2)

Unnamed: 0,titleId,originalTitle,genres,year,Actors,Directors,Producers,keyword,location,plot,rating,poster_url
0,tt0004972,The Birth of a Nation,"[Drama, History, War]",1915.0,"[Lillian Gish, Mae Marsh, Henry B. Walthall, M...",[D.W. Griffith],[],"[ku-klux-klan, civil-war, reconstruction-era, ...","[Thousand Oaks, Fullerton, Los Angeles, Califo...",The Stoneman family finds its friendship with ...,6.3,https://m.media-amazon.com/images/M/MV5BYTM4ZD...
1,tt0006333,"20,000 Leagues Under the Sea","[Action, Adventure, Sci-Fi]",1916.0,"[Allen Holubar, Dan Hanlon, Edna Pendleton, Cu...",[Stuart Paton],[],"[national-film-registry, public-domain, based-...","[Bahamas, California, Universal City, USA, New...",A French professor and his daughter accompany ...,6.2,https://m.media-amazon.com/images/M/MV5BMTQ0OT...


In [4]:
keyword_scores = pd.read_csv('../data/ml-20m/genome-scores.csv')
keyword_tags = pd.read_csv('../data/ml-20m/genome-tags.csv')

In [5]:
keyword_scores.head(2)

Unnamed: 0,movieId,tagId,relevance
0,1,1,0.025
1,1,2,0.025


In [6]:
keyword_tags.head(2)

Unnamed: 0,tagId,tag
0,1,007
1,2,007 (series)


In [7]:
keyword_tags_dict = {}
for row in keyword_tags.itertuples():
    keyword_tags_dict[row.tagId] = row.tag

In [8]:
keyword_scores = keyword_scores[keyword_scores.relevance>0.7]

In [9]:
keyword_scores['keyword'] =  keyword_scores['tagId'].apply(lambda x: keyword_tags_dict[x])

In [10]:
keyword_scores

Unnamed: 0,movieId,tagId,relevance,keyword
28,1,29,0.89200,adventure
62,1,63,0.93325,animated
63,1,64,0.98575,animation
185,1,186,0.95650,cartoon
192,1,193,0.81925,cgi
...,...,...,...,...
11709637,131170,998,0.80550,survival
11709638,131170,999,0.81525,suspense
11709651,131170,1012,0.85725,technology
11709690,131170,1051,0.74175,twists & turns


In [11]:
links = pd.read_csv('../data/ml-latest-small/links.csv')
links.head(2)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0


In [12]:
movieId_dict = {}
for row in links.itertuples():
    movieId_dict[row.movieId] = row.imdbId

In [13]:
keyword_scores['imdbId'] = keyword_scores['movieId'].apply(lambda x: movieId_dict[x] if x in movieId_dict else -1)

In [14]:
keyword_scores = keyword_scores[keyword_scores.imdbId!=-1]
keyword_scores.head(2)

Unnamed: 0,movieId,tagId,relevance,keyword,imdbId
28,1,29,0.892,adventure,114709
62,1,63,0.93325,animated,114709


In [15]:
ml_ttid = list(set(links.imdbId.apply(lambda x: "{:07d}".format(x))))
ml_ttid = ['tt'+ttid for ttid in ml_ttid]

In [16]:
keyword_scores['imdbId'] = keyword_scores['imdbId'].apply(lambda x:  "tt{:07d}".format(x))
keyword_scores.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  keyword_scores['imdbId'] = keyword_scores['imdbId'].apply(lambda x:  "tt{:07d}".format(x))


Unnamed: 0,movieId,tagId,relevance,keyword,imdbId
28,1,29,0.892,adventure,tt0114709
62,1,63,0.93325,animated,tt0114709


In [17]:
keyword_scores = keyword_scores.groupby('imdbId')['keyword'].apply(list).reset_index()

In [18]:
keyword_scores.head(2)

Unnamed: 0,imdbId,keyword
0,tt0000417,"[aliens, artistic, astronauts, black and white..."
1,tt0004972,"[american civil war, civil war, controversial,..."


In [19]:
keyword_scores_dict = {}
for row in keyword_scores.itertuples():
    keyword_scores_dict[row.imdbId] = row.keyword

In [20]:
movies['ml_keyword'] = movies['titleId'].apply(lambda x: list(set(keyword_scores_dict[x])) if x in keyword_scores_dict else None)

In [21]:
movies.head(2)

Unnamed: 0,titleId,originalTitle,genres,year,Actors,Directors,Producers,keyword,location,plot,rating,poster_url,ml_keyword
0,tt0004972,The Birth of a Nation,"[Drama, History, War]",1915.0,"[Lillian Gish, Mae Marsh, Henry B. Walthall, M...",[D.W. Griffith],[],"[ku-klux-klan, civil-war, reconstruction-era, ...","[Thousand Oaks, Fullerton, Los Angeles, Califo...",The Stoneman family finds its friendship with ...,6.3,https://m.media-amazon.com/images/M/MV5BYTM4ZD...,"[silent, controversial, american civil war, in..."
1,tt0006333,"20,000 Leagues Under the Sea","[Action, Adventure, Sci-Fi]",1916.0,"[Allen Holubar, Dan Hanlon, Edna Pendleton, Cu...",[Stuart Paton],[],"[national-film-registry, public-domain, based-...","[Bahamas, California, Universal City, USA, New...",A French professor and his daughter accompany ...,6.2,https://m.media-amazon.com/images/M/MV5BMTQ0OT...,


In [22]:
movies['plot'].fillna('', inplace=True)

In [23]:
movies['genres'].fillna('', inplace=True)
movies['keyword'].fillna('', inplace=True)

In [24]:
movies['ml_keyword']= movies['ml_keyword'].apply(lambda x: x[0:10] if x is not None else [])
movies['keyword']= movies['keyword'].apply(lambda x: x[0:10] if x is not None else [])

In [25]:
movies['all_keywords'] = movies.apply(lambda x: x['genres'].tolist()+x['ml_keyword'][0:10]+x['keyword'][0:10].tolist(), axis=1)

In [26]:
movies['all_keywords']  =  movies['all_keywords'].apply(lambda x: ','.join(x))

In [27]:
movies.head(2)

Unnamed: 0,titleId,originalTitle,genres,year,Actors,Directors,Producers,keyword,location,plot,rating,poster_url,ml_keyword,all_keywords
0,tt0004972,The Birth of a Nation,"[Drama, History, War]",1915.0,"[Lillian Gish, Mae Marsh, Henry B. Walthall, M...",[D.W. Griffith],[],"[ku-klux-klan, civil-war, reconstruction-era, ...","[Thousand Oaks, Fullerton, Los Angeles, Califo...",The Stoneman family finds its friendship with ...,6.3,https://m.media-amazon.com/images/M/MV5BYTM4ZD...,"[silent, controversial, american civil war, in...","Drama,History,War,silent,controversial,america..."
1,tt0006333,"20,000 Leagues Under the Sea","[Action, Adventure, Sci-Fi]",1916.0,"[Allen Holubar, Dan Hanlon, Edna Pendleton, Cu...",[Stuart Paton],[],"[national-film-registry, public-domain, based-...","[Bahamas, California, Universal City, USA, New...",A French professor and his daughter accompany ...,6.2,https://m.media-amazon.com/images/M/MV5BMTQ0OT...,[],"Action,Adventure,Sci-Fi,national-film-registry..."


In [28]:
movies['plot_kw'] = movies['plot'] + movies['all_keywords'] 

In [29]:
## Using T5 to generate embeddings
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("sentence-transformers/gtr-t5-large")

In [None]:
embeddings = model.encode(movies['plot_kw'].values)
np.save("../data/plot_kw_embeddings.npy", embeddings)

In [31]:
embeddings.shape

(9086, 768)

In [70]:
# embeddings = model.encode(movies['plot'].values)
# np.save("../data/plot_embeddings.npy", embeddings)
# embeddings = model.encode(movies['all_keywords'].values)
# np.save("../data/keyword_embeddings.npy", embeddings)

In [29]:
plot_embeddings = np.load("../data/plot_embeddings.npy")
keyword_embeddings = np.load("../data/keyword_embeddings.npy")
plot_keyword_embeddings = np.load("../data/plot_kw_embeddings.npy")

In [30]:
movies['plot_emb'] = plot_embeddings.tolist()
movies['keyword_emb'] = keyword_embeddings.tolist()
movies['plot_keyword_emb'] = plot_keyword_embeddings.tolist()

In [31]:
movies = movies[['titleId','all_keywords', 'plot', 'plot_emb', 'keyword_emb', 'originalTitle', 'poster_url',\
       'Directors', 'Producers', 'Actors', 'rating', 'location', 'genres', 'year', 'plot_keyword_emb', 'plot_kw'
       ]]

In [32]:
movies.rename(columns={'Directors': 'directors', 'Producers': 'producers', 'Actors': 'stars', 'originalTitle':'title'}, inplace=True)

In [33]:
movies['plotLong'] = movies['plot']

In [34]:
movies[['titleId','all_keywords', 'plot', 'plot_emb', 'keyword_emb', 'plot_keyword_emb', 'title', 'poster_url',\
       'directors', 'producers', 'stars', 'rating', 'location', 'genres', 'year', 'plotLong', 'plot_kw'
       ]].to_parquet('../data/plot_keyword_embeddings.parquet', index=False)

In [None]:
# !aws s3 cp ../data/plot_keyword_embeddings.parquet s3://mlsl-imdb-data/