In [463]:
## Import packages

import pandas as pd
import numpy as np
import json
import random

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from tensorflow.keras import layers, Sequential
from sklearn.model_selection import train_test_split

In [505]:
## Import metadata

book_metadata_url = "/Users/egmac/code/arostagnat/raw_data/raw_book/metadata"
movie_metadata_url = "/Users/egmac/code/arostagnat/raw_data/raw_movies/metadata"

book_details = pd.read_json(book_metadata_url+".json",lines=True)
movie_details = pd.read_json(film_metadata_url+".json",lines=True)

In [None]:
## Import data. Make sure to replace sample data with full data when available
book_URL = '/Users/egmac/code/arostagnat/raw_data/sample_books_reviews_clean.csv'
movie_URL = '/Users/egmac/code/arostagnbat/raw_data/sample_movies_reviews_clean.csv'
book_data = pd.read_csv(book_URL,names=["index","item_id","txt"],header=0,index_col="index",usecols=["index","item_id","txt"])
movie_data = pd.read_csv(movies_URL,names=["index","item_id","txt"],header=0,index_col="index",usecols=["index","item_id","txt"])

In [None]:
## Set tag for book / movie
book_data["type"] = 1
movie_data["type"] = 0

In [None]:
## Concatenate book and movies data
data = pd.concat([movie_data,book_data])
data = data.reset_index()
data.tail(3)

In [None]:
# book_data.head()
# movie_data.head()

In [None]:
## Set up training and test datasets
data_train, data_test = train_test_split(data,test_size=0.2)
print(f"data_train: {data_train.shape}, data_test: {data_test.shape}")

In [None]:
## Reset index to match document tagging
data_train_copy = data_train.copy()
data_train_copy = data_train_copy.reset_index()
data_train_copy.index.name = "new_index"

In [None]:
## Add required tagging
data_train_tag = [TaggedDocument(doc, [i]) for i, doc in enumerate(data_train["txt"])]
print(f"data_train_tag: {len(data_train_tag)}")

In [None]:
## Setup Doc2Vec model
model = Doc2Vec(vector_size=50, min_count=5)

In [None]:
## Build vocab
model.build_vocab(data_train_tag)

In [None]:
## Train model
model.train(data_train_tag,epochs=50,word_count=0,total_examples=model.corpus_count)

In [None]:
## Confirm output of txt column
print(data.loc[1]["txt"])
type(data.loc[1]["txt"])

In [None]:
## Obtain inferred vectors for each film / book
vectors_list = []

for i in data_train.index:
    text = data_train.loc[i]["txt"]
    text_cleaned = text.strip('[]').replace("'","").replace(' ', '').split(',')
    inferred_vector = model.infer_vector(text_cleaned)
    vectors_list.append(inferred_vector)

In [None]:
## Add vectors to data_train
data_train["vector"] = vectors_list

In [None]:
data_train.head(3)

In [458]:
## Obtain top 10 similar items for each film / book
top_book_dict = {}

for i in data_train.index[:20]:
    vector = data_train.loc[i]["vector"]
    similar_items = model.dv.most_similar([vector],topn=1000)
    results = pd.DataFrame(similar_items,columns=["new_index","cosine"]).set_index("new_index")
    results = results.merge(data_train_copy[["item_id","type"]],how="left",on="new_index")
    results_copy = results[results.type == 1].sort_values(by=["cosine"],ascending=False).reset_index()
    top_book_dict[i] = results_copy.loc[0]

In [None]:
## Add top books and cosines to data_train

top_books = [top_book_dict[i]["item_id"] for i in data_train.index]
top_cosine = [top_book_dict[i]["cosine"] for i in data_train.index]

data_train["top_book"] = top_books
data_train["cosine"] = top_cosine

In [None]:
data_test.loc[movie_index]["item_id"]

In [643]:
data_train_films = data_train[data_train.type == 0]
data_train_films.item_id = data_train_films.item_id.astype(int)
movie_details.item_id = movie_details.item_id.astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_train_films.item_id = data_train_films.item_id.astype(int)


In [644]:
data_train_films = data_train_films.merge(movie_details[["title","item_id"]],how="left",on="item_id")

In [666]:
keyword = "potter"
data_train_films[data_train_films["title"].str.contains(keyword,case=False,na=False)]

484       4896
670      54001
770       5816
1244     40815
1757     54001
         ...  
42441     5816
42787     8368
42824     8368
42835    69844
42900    88125
Name: item_id, Length: 163, dtype: int64

In [664]:
## Define formula to provide recommendations for random vectors

def get_book_recc(movie_id):
    
    movie_index = data_train[data_train["item_id"] == movie_id].index[0]
    
    text = data_train.loc[movie_index]["txt"]
    text_cleaned = text.strip('[]').replace("'","").replace(' ', '').split(',')
    
    inferred_vector = model.infer_vector(text_cleaned)
    similar_items = model.dv.most_similar([inferred_vector], topn=1000)
    
    results = pd.DataFrame(similar_items,columns=["new_index","cosine"]).set_index("new_index")
    results = results.merge(data_train_copy[["item_id","type"]],how="left",on="new_index")
    results_copy = results[results.type == 1].sort_values(by=["cosine"],ascending=False).reset_index()
    book_id_1 = results_copy.loc[0]["item_id"]
    book_id_2 = results_copy.loc[1]["item_id"]
    book_id_3 = results_copy.loc[2]["item_id"]
    
    movie_name = movie_details[movie_details["item_id"]==movie_id]["title"].to_string()
    book_1_name = book_details[book_details["item_id"]==book_id_1]["title"].to_string()
    book_2_name = book_details[book_details["item_id"]==book_id_2]["title"].to_string()
    book_3_name = book_details[book_details["item_id"]==book_id_3]["title"].to_string()
    
    print(f"""Film: {movie_name} 
    \
    \
    \
    ==== Book recommendations ====
    1. {book_1_name}
    2. {book_2_name}
    3. {book_3_name}""")

In [693]:
keyword = "potter"
sample = data_train_films[data_train_films["title"].str.contains(keyword,case=False,na=False)].item_id

In [696]:
potter = [54001,40815,69844,5816]

for item in potter:
    get_book_recc(movie_id=item)

Film: 12005    Harry Potter and the Order of the Phoenix (2007) 
                ==== Book recommendations ====
    1. 2244    Maus II: A Survivor's Tale: And Here My Troubl...
    2. 32    The Alchemist
    3. 2543    The 4-Hour Workweek
Film: 10623    Harry Potter and the Goblet of Fire (2005) 
                ==== Book recommendations ====
    1. 2952    My Sister's Grave (Tracy Crosswhite, #1)
    2. 547    Through the Woods
    3. 2524    Still Life with Bread Crumbs
Film: 13988    Harry Potter and the Half-Blood Prince (2009) 
                ==== Book recommendations ====
    1. 5913    Confessions of a Prairie Bitch: How I Survived...
    2. 220    Wicked: The Life and Times of the Wicked Witch...
    3. 5502    Save the Cat!: The Last Book on Screenwriting ...
Film: 5718    Harry Potter and the Chamber of Secrets (2002) 
                ==== Book recommendations ====
    1. 2128    Meant to Be
    2. 1143    Fracture Me (Shatter Me, #2.5)
    3. 295    Dracula
