<a href="https://colab.research.google.com/github/sdhilip200/Content-Based-Recommendation---Good-Reads-data/blob/master/Recommendation_Word2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import nltk
import os
import pickle

import matplotlib.pyplot as plt
%matplotlib inline

from matplotlib import pyplot

from sentence_transformers import SentenceTransformer, util
import torch

In [6]:
# Read only the title, genre and description columns from each dataset.
# on_bad_lines="skip" silently drops malformed CSV rows instead of raising.
movies = pd.read_csv("imdb_movies.csv", usecols=['names', 'genre', 'overview'], low_memory=True, on_bad_lines="skip")
books = pd.read_csv("books.csv", usecols=['title', 'categories', 'description'], low_memory=True, on_bad_lines="skip")
# Rename by label rather than by position: read_csv returns the selected columns in
# file order (the order of `usecols` is ignored), so a positional `books.columns = [...]`
# would mislabel the data if books.csv stored these columns in a different order.
books = books.rename(columns={'title': 'names', 'categories': 'genre', 'description': 'overview'})

In [10]:
# Tag every row with the medium it came from so films and books remain
# distinguishable once the two frames are combined.
movies = movies.assign(medium="Film")
books = books.assign(medium="Book")

In [11]:
# Stack the film rows on top of the book rows and renumber the index from 0
df = pd.concat(objs=[movies, books], axis=0, ignore_index=True)
df.head()

Unnamed: 0,names,genre,overview,medium
0,Creed III,"Drama, Action","After dominating the boxing world, Adonis Cree...",Film
1,Avatar: The Way of Water,"Science Fiction, Adventure, Action",Set more than a decade after the events of the...,Film
2,The Super Mario Bros. Movie,"Animation, Adventure, Family, Fantasy, Comedy","While working underground to fix a water main,...",Film
3,Mummies,"Animation, Comedy, Family, Adventure, Fantasy","Through a series of unfortunate events, three ...",Film
4,Supercell,Action,Good-hearted teenager William always lived in ...,Film


In [12]:
# Prefix each overview with its genre ("<genre>: <overview>"), drop the now
# redundant genre column, and discard rows with any missing value
# (a missing genre or overview makes the combined text NaN as well).
df = (
    df.assign(overview=df["genre"] + ": " + df["overview"])
      .drop(columns="genre")
      .dropna()
)
df.head(5)

Unnamed: 0,names,overview,medium
0,Creed III,"Drama, Action: After dominating the boxing wor...",Film
1,Avatar: The Way of Water,"Science Fiction, Adventure, Action: Set more t...",Film
2,The Super Mario Bros. Movie,"Animation, Adventure, Family, Fantasy, Comedy:...",Film
3,Mummies,"Animation, Comedy, Family, Adventure, Fantasy:...",Film
4,Supercell,Action: Good-hearted teenager William always l...,Film


In [13]:
# Pull the text, title and medium columns out as plain Python lists (same row order)
corpus = df['overview'].tolist()
titles = df['names'].tolist()
medium = df['medium'].tolist()

# Bundle each row as an (overview, title, medium) tuple so a search hit can be
# looked up by its position in the corpus
catalogue = [*zip(corpus, titles, medium)]

In [14]:
# Spot-check a single catalogue entry; each entry is an (overview, title, medium) tuple
catalogue[305]

("Thriller,\xa0Drama: It's the final weeks of the most consequential presidential election in history. America is poised to elect either its first female president or its first viable independent candidate. Reporting history as it's made, an idealistic young journalist teams up with her idol, legendary journalist Nick Booker, to uncover a conspiracy that places the fate of the election, and the country, in their hands.",
 'The Independent',
 'Film')

# SBERT embeddings

In [23]:
# Function to get embeddings or load existing ones
def get_embeds(path, model ='all-MiniLM-L12-v2'):
    """Return embeddings for the module-level ``corpus``, using ``path`` as a cache.

    If ``path`` exists, the embeddings stored in that pickle file are returned
    and nothing is encoded. Otherwise the global ``corpus`` is encoded with the
    given SentenceTransformer model, and the corpus plus its embeddings are
    pickled to ``path`` before the embeddings are returned.

    Parameters:
        path: location of the pickle cache file.
        model: name of the SentenceTransformer model used when encoding.

    Returns:
        The embeddings produced by ``embedder.encode`` (requested as a tensor),
        or whatever was stored under the 'embeddings' key of the cache file.
    """
    if os.path.exists(path):
        print("Loading pre-computed embeddings from disc")
        # NOTE(review): pickle.load can execute arbitrary code, so only point
        # `path` at a cache file this notebook wrote itself.
        with open(path, "rb") as fIn:
            cache_data = pickle.load(fIn)
        # Do not rebind `corpus` from the cache. Assigning to `corpus` anywhere
        # in this function makes it a local name, and the encoding branch below
        # would then fail with UnboundLocalError when it reads the global corpus.
        return cache_data['embeddings']

    embedder = SentenceTransformer(model)
    print("Encoding the corpus. This might take a while")
    corpus_embeddings = embedder.encode(corpus, show_progress_bar=True, convert_to_tensor=True)

    print("Storing file on disc")
    # The corpus is stored next to the embeddings so the cache records the
    # texts it was computed from.
    with open(path, "wb") as fOut:
        pickle.dump({'corpus': corpus, 'embeddings': corpus_embeddings}, fOut)
    return corpus_embeddings

In [30]:
# Recommending the Top K similar books or movies

def recommendations(query, corpus_embeddings, top_k = 3,model ='all-MiniLM-L12-v2'):
    '''
    Print the top_k catalogue entries whose descriptions best match the query.

    Input:
        query: plot description as a string (a list of strings is also encoded,
            but only the scores of its first item are used)
        corpus_embeddings: embeddings of the corpus, in the same order as the
            module-level catalogue
        top_k: number of matches to print
        model: name of the SentenceTransformer model used to encode the query
    Output: nothing is returned; title, medium, description and similarity
        score of each match are printed, best match first

    '''
    embedder = SentenceTransformer(model)
    #Compute embeddings for the query
    query_embeddings = embedder.encode(query, convert_to_tensor=True)

    #Compute cosine-similarities for the query with each description
    cosine_scores = util.cos_sim(query_embeddings, corpus_embeddings)
    scores = cosine_scores[0].tolist() #get a list of scores

    #Rank every corpus position by score, highest first. The range must cover
    #all of `scores`; stopping at len(scores)-1 would never rank the last entry.
    ranked = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)

    for i in ranked[:top_k]:
        #catalogue entries are (description, title, medium) tuples
        print("Title: {}\nMedium : {}\nDescription: {}\nScore: {:.4f}\n".format(catalogue[i][1], catalogue[i][2], catalogue[i][0], scores[i]))

## Run the function

In [None]:
# Get plot description from the user
# (free-text string; the following cell passes it as the query to recommendations())
query = input("Enter the plot of the film or book you are trying to remember: ")

In [31]:
# Pickle cache file that get_embeds reads embeddings from (or writes them to)
path = "stored_embed"
corpus_embeddings = get_embeds(path=path)
# Print the closest matches for the plot description entered above
recommendations(query=query, corpus_embeddings=corpus_embeddings)

Loading pre-computed embeddings from disc
Title: Breakout
Medium : Film
Description: Action, Thriller: A pair of criminals try to track down the kids who witnessed them commit a murder in the woods.
Score: 0.5460

Title: The Monster
Medium : Film
Description: Horror, Drama: A mother and her 10-year old daughter are trapped in a forest. There is something in this forest. Something unlike anything they have heard before. Something that lurks in the darkness and it’s coming after them.
Score: 0.5140

Title: The Mimic
Medium : Film
Description: Horror, Thriller: The mother of a missing child takes in a lost girl she finds in the woods, but soon begins to wonder if she is even human.
Score: 0.4876

