In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
from dotenv import load_dotenv
load_dotenv()
import sys
sys.path.append("./")

In [2]:
from src.utils.opensearch import send_request

In [3]:
# OpenSearch credentials are looked up in the process environment; a missing
# variable yields None rather than raising.
opensearch_user, opensearch_password = (
    os.environ.get(variable_name)
    for variable_name in ("opensearch_user", "opensearch_password")
)

In [4]:
# Location of the movie CSV, written relative to the current working directory.
DATASET_PATH = "./data/movie_dataset.csv"

In [5]:
# Load the full CSV into a DataFrame with pandas' default parsing options.
# NOTE(review): the preview below shows an "Unnamed: 0" column, which suggests
# the file was saved together with its index - confirm whether that column is wanted.
movie_df = pd.read_csv(DATASET_PATH)

In [6]:
# Preview the first rows to sanity-check the columns that were loaded.
movie_df.head()

Unnamed: 0,movie_title,director_name,genres,plot_keywords,movie_imdb_link,title_year,content_rating
0,Avatar,James Cameron,Action|Adventure|Fantasy|Sci-Fi,avatar|future|marine|native|paraplegic,http://www.imdb.com/title/tt0499549/?ref_=fn_t...,2009.0,PG-13
1,Pirates of the Caribbean: At World's End,Gore Verbinski,Action|Adventure|Fantasy,goddess|marriage ceremony|marriage proposal|pi...,http://www.imdb.com/title/tt0449088/?ref_=fn_t...,2007.0,PG-13
2,Spectre,Sam Mendes,Action|Adventure|Thriller,bomb|espionage|sequel|spy|terrorist,http://www.imdb.com/title/tt2379713/?ref_=fn_t...,2015.0,PG-13
3,The Dark Knight Rises,Christopher Nolan,Action|Thriller,deception|imprisonment|lawlessness|police offi...,http://www.imdb.com/title/tt1345836/?ref_=fn_t...,2012.0,PG-13
4,John Carter,Andrew Stanton,Action|Adventure|Sci-Fi,alien|american civil war|male nipple|mars|prin...,http://www.imdb.com/title/tt0401729/?ref_=fn_t...,2012.0,PG-13


In [23]:
from typing import Any, Optional


def return_none_when_is_nan(value: Any) -> Optional[Any]:
    """Map a NaN to ``None`` and hand every other value back unchanged.

    Values that ``np.isnan`` rejects with ``TypeError`` (for example strings
    or ``None``) are treated as "not NaN" and returned exactly as received.

    Args:
        value: The value to inspect.

    Returns:
        ``None`` when ``value`` is NaN, otherwise ``value`` itself.
    """
    try:
        is_missing = np.isnan(value)
    except TypeError:
        # np.isnan cannot evaluate this type, so it cannot be a NaN.
        return value

    return None if is_missing else value

In [25]:
# Upper bound on how many leading dataset rows are sent to OpenSearch.
# `DataFrame.head` caps this at the frame length, so a shorter CSV no longer
# raises a KeyError the way a fixed `range(200)` with `.loc` would.
MAX_MOVIES_TO_INDEX = 200


def _clean_text(value: Any) -> Optional[Any]:
    """Return `value` with surrounding whitespace removed, or None when it is NaN.

    Non-string values are passed through untouched instead of raising on `.strip()`.
    """
    value = return_none_when_is_nan(value)
    return value.strip() if isinstance(value, str) else value


def _split_pipe_list(value: Any) -> Optional[list]:
    """Split a pipe-delimited cell (e.g. "Action|Thriller") into a list; None when NaN."""
    value = return_none_when_is_nan(value)
    return None if value is None else value.split("|")


def _describe_movie(title, year, genres, director, keywords, rating) -> Optional[str]:
    """Build the free-text description stored in `movie_description`.

    Returns None when there is no title. Any other missing field is left out of
    the sentence instead of being rendered as the literal text "None". When
    every field is present the result has the same wording as before:
    "<title> is a <year> <genres> movie directed by <director>. Keywords: <keywords>. Rated <rating>."
    """
    if title is None:
        return None

    qualifiers = [year] if year else []
    if genres:
        qualifiers.append(", ".join(genres))

    sentence = " ".join([f"{title} is a", *qualifiers, "movie"])
    if director:
        sentence += f" directed by {director}"
    description = sentence + "."

    if keywords:
        keywords_text = ", ".join(keywords)
        description += f" Keywords: {keywords_text}."
    if rating:
        description += f" Rated {rating}."
    return description


def build_movie_document(record: dict, doc_id: int) -> dict:
    """Convert one dataset row into the JSON document sent to OpenSearch.

    Args:
        record: Mapping of column name to cell value for a single row.
        doc_id: Identifier stored (as a string) in the document's "id" field.

    Returns:
        A dict with the same keys the index was populated with before; missing
        (NaN) cells become None.
    """
    # Strip first so the description and the `movie_title` field agree, and so a
    # missing title/director yields None instead of an AttributeError.
    movie_title = _clean_text(record["movie_title"])
    director_name = _clean_text(record["director_name"])
    genres = _split_pipe_list(record["genres"])
    plot_keywords = _split_pipe_list(record["plot_keywords"])
    movie_imdb_link = return_none_when_is_nan(record["movie_imdb_link"])
    content_rating = return_none_when_is_nan(record["content_rating"])

    title_year = return_none_when_is_nan(record["title_year"])
    if title_year is not None:
        # The year is parsed as a float (e.g. 2009.0); store it as "2009".
        title_year = str(int(title_year))

    return {
        "id": str(doc_id),
        "movie_title": movie_title,
        "director_name": director_name,
        "genres": genres,
        "plot_keywords": plot_keywords,
        "title_year": title_year,
        "movie_imdb_link": movie_imdb_link,
        "content_rating": content_rating,
        "movie_description": _describe_movie(
            movie_title, title_year, genres, director_name, plot_keywords, content_rating
        ),
    }


# Take the first rows by position; a list of records lets tqdm show a total.
movie_records = movie_df.head(MAX_MOVIES_TO_INDEX).to_dict("records")

# Document ids start at 1 and follow the row order.
for idx, record in enumerate(tqdm(movie_records), start=1):
    json_data = build_movie_document(record, idx)
    # NOTE(review): the response is not inspected; what send_request returns is
    # not visible from this notebook, so failed writes may go unnoticed - confirm
    # and add a status check.
    response = send_request(
        method="put",
        user=opensearch_user,
        password=opensearch_password,
        endpoint=f"movie-search-index/_doc/{idx}",
        json_data=json_data,
        host="https://localhost",
        port=9200
    )
    

100%|██████████| 200/200 [00:44<00:00,  4.48it/s]
