In [36]:
import sys

sys.path.append("../../")
import os
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

import pandas as pd
import requests
from dotenv import load_dotenv
from tqdm.notebook import tqdm
import loosejson
from config import settings


# Environment variables are loaded from the .env file under the project base directory.
DOTENV_PATH = settings.BASE_DIR / ".env"
load_dotenv(DOTENV_PATH)

# NOTE(review): absolute, machine-specific path; none of the visible cells
# reference it -- confirm whether it is still needed.
ROOT_DIR = Path("/home/alron/movie-recommender")

# Input tables.
MOVIES_FILEPATH = settings.BASE_DIR / "data/movies_metadata_fixed_posters.csv"
KEYWORDS_FILEPATH = settings.BASE_DIR / "data/keywords.csv"
TMDB_LINKS_FILEPATH = settings.BASE_DIR / "data/links.csv"

# Destination of the movies table once the keyword columns have been joined in.
OUTPUT_FILEPATH = settings.BASE_DIR / "data/movies_metadata_fixed_posters_w_keywords.csv"

In [37]:
# `id` is read as an object column, the same dtype the keywords table uses
# for its `id`, because the two frames are later merged on that column.
movies_df = pd.read_csv(
    MOVIES_FILEPATH,
    dtype={"id": object},
)
movies_df.head(3)

  movies_df = pd.read_csv(MOVIES_FILEPATH, dtype={"id": object})


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,poster_paths
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,/uXDfjJbdP4ijW5hWSBrPrlKpxab.jpg
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,/bdHG5Mo83VPobeZZdlSz0Y7HQHB.jpg
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,/1FSXpj5e8l4KH6nVFO5SPUeraOt.jpg


In [52]:
def dedupe_keywords(keywords_df):
    """Collapse rows sharing an ``id`` into a single row with merged keywords.

    Rows are visited in order. The first row seen for an id contributes its
    keyword list unchanged; every later row with the same id contributes only
    the keywords whose ``"name"`` is not already present for that id.

    Args:
        keywords_df: DataFrame with an ``"id"`` column and a ``"keywords"``
            column whose cells are iterables of dicts carrying a ``"name"`` key.

    Returns:
        A new DataFrame with columns ``["id", "keywords"]`` holding one row
        per distinct id, in order of first appearance. The input frame and the
        lists stored in it are not modified.

    Raises:
        AssertionError: if the resulting ``"id"`` column still contains
            duplicates.
    """
    merged_by_id = {}
    # Names already collected per id; built lazily, only for ids that repeat.
    seen_names_by_id = {}

    for _, row in keywords_df.iterrows():
        movie_id = row["id"]
        keyword_dicts = row["keywords"]

        if movie_id not in merged_by_id:
            # Copy so that later merges never mutate the caller's cell value.
            merged_by_id[movie_id] = list(keyword_dicts)
            continue

        merged = merged_by_id[movie_id]
        seen_names = seen_names_by_id.get(movie_id)
        if seen_names is None:
            seen_names = {existing["name"] for existing in merged}
            seen_names_by_id[movie_id] = seen_names

        for keyword_dict in keyword_dicts:
            name = keyword_dict["name"]
            if name not in seen_names:
                # Lists grow with append() (add() is a set method), and the
                # name is recorded so a repeat in the same row is skipped too.
                seen_names.add(name)
                merged.append(keyword_dict)

    data = list(merged_by_id.items())
    keywords_deduped_df = pd.DataFrame(data, columns=['id', 'keywords'])
    assert not keywords_deduped_df["id"].duplicated().any()
    return keywords_deduped_df
    

# Read the raw keywords table, keeping `id` as an object column.
keywords_df = pd.read_csv(KEYWORDS_FILEPATH, dtype={"id": object})

# Each `keywords` cell is parsed from its string form with loosejson.
keywords_df["keywords"] = keywords_df["keywords"].apply(loosejson.parse_loosely_defined_json)

# Collapse repeated ids so the later merge on `id` has one keywords row per id.
keywords_df = dedupe_keywords(keywords_df)

# Comma-separated keyword names, for display.
keywords_df["keywords_human_readable"] = keywords_df["keywords"].apply(
    lambda keyword_dicts: ", ".join(kw["name"] for kw in keyword_dicts)
)
keywords_df.head(3)

Unnamed: 0,id,keywords,keywords_human_readable
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...","jealousy, toy, boy, friendship, friends, rival..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1...","board game, disappearance, based on children's..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392...","fishing, best friend, duringcreditsstinger, ol..."


In [55]:
# Left join: every movie row is kept; movies without a keywords row get NaN
# in the keyword columns.
movies_with_keywords = pd.merge(movies_df, keywords_df, how="left", on="id")

# The join must not add or drop movie rows; check before writing anything.
print(movies_with_keywords.shape[0], movies_df.shape[0])
assert movies_with_keywords.shape[0] == movies_df.shape[0]

movies_with_keywords.to_csv(OUTPUT_FILEPATH, index=False)

movies_with_keywords.head(3)

45466 45466


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,spoken_languages,status,tagline,title,video,vote_average,vote_count,poster_paths,keywords,keywords_human_readable
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,/uXDfjJbdP4ijW5hWSBrPrlKpxab.jpg,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...","jealousy, toy, boy, friendship, friends, rival..."
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,/bdHG5Mo83VPobeZZdlSz0Y7HQHB.jpg,"[{'id': 10090, 'name': 'board game'}, {'id': 1...","board game, disappearance, based on children's..."
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,/1FSXpj5e8l4KH6nVFO5SPUeraOt.jpg,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392...","fishing, best friend, duringcreditsstinger, ol..."
