<a href="https://colab.research.google.com/github/sdhilip200/Content-Based-Recommendation---Good-Reads-data/blob/master/Recommendation_Word2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [98]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import nltk
import os
import pickle

import matplotlib.pyplot as plt
%matplotlib inline

from matplotlib import pyplot

from sentence_transformers import SentenceTransformer, util
import torch

In [69]:
# Read only the title, genre and description columns of each dataset.
# Malformed rows are skipped instead of aborting the load.
movies = pd.read_csv("imdb_movies.csv", usecols = ['names','genre', 'overview'], low_memory = True, on_bad_lines="skip")
books = pd.read_csv("books.csv", usecols = ['title','categories', 'description'], low_memory = True, on_bad_lines = "skip")

# Map the book columns onto the movie schema BY NAME. read_csv ignores the order of
# `usecols` and returns columns in file order, so assigning a list to `books.columns`
# would silently mislabel the data if the CSV stored these columns in another order.
books = books.rename(
    columns={'title': 'names', 'categories': 'genre', 'description': 'overview'}
)[['names', 'genre', 'overview']]

In [83]:
# Stack the movie rows and the book rows into one dataframe, renumbering the index
df = pd.concat((movies, books), ignore_index=True)
df.head()

Unnamed: 0,names,genre,overview
0,Creed III,"Drama, Action","After dominating the boxing world, Adonis Cree..."
1,Avatar: The Way of Water,"Science Fiction, Adventure, Action",Set more than a decade after the events of the...
2,The Super Mario Bros. Movie,"Animation, Adventure, Family, Fantasy, Comedy","While working underground to fix a water main,..."
3,Mummies,"Animation, Comedy, Family, Adventure, Fantasy","Through a series of unfortunate events, three ..."
4,Supercell,Action,Good-hearted teenager William always lived in ...


In [84]:
# Prefix every overview with its genre label ("<genre>: <overview>"), then discard
# the now-redundant genre column and any row that is missing a title or a description
df['overview'] = df['genre'] + ": " + df['overview']
df.drop(columns='genre', inplace=True)
df.dropna(inplace=True)
df.head()

Unnamed: 0,names,overview
0,Creed III,"Drama, Action: After dominating the boxing wor..."
1,Avatar: The Way of Water,"Science Fiction, Adventure, Action: Set more t..."
2,The Super Mario Bros. Movie,"Animation, Adventure, Family, Fantasy, Comedy:..."
3,Mummies,"Animation, Comedy, Family, Adventure, Fantasy:..."
4,Supercell,Action: Good-hearted teenager William always l...


In [86]:
# Pull the combined descriptions and their titles out of the dataframe as plain lists
corpus = df['overview'].tolist()
titles = df['names'].tolist()

# Pair each description with its title so a corpus position can be mapped back to a title
catalogue = [(text, title) for text, title in zip(corpus, titles)]

In [88]:
# Spot-check a single catalogue entry; each entry is a (description, title) tuple
catalogue[305]

("Thriller,\xa0Drama: It's the final weeks of the most consequential presidential election in history. America is poised to elect either its first female president or its first viable independent candidate. Reporting history as it's made, an idealistic young journalist teams up with her idol, legendary journalist Nick Booker, to uncover a conspiracy that places the fate of the election, and the country, in their hands.",
 'The Independent')

# SBERT embeddings

In [90]:
# Sentence-embedding model used to encode both the corpus and the user's query
model_name = 'all-MiniLM-L12-v2'
embedder = SentenceTransformer(model_name)

In [97]:
# Encode the corpus once and cache the result on disk so later runs can skip the
# expensive encoding step.
embedding_cache_path = "stored_embed"

corpus_embeddings = None
if os.path.exists(embedding_cache_path):
    print("Loading pre-computed embeddings from disc")
    # NOTE(security): pickle.load can execute arbitrary code. Only load a cache file
    # that this notebook wrote itself; never one obtained from an untrusted source.
    with open(embedding_cache_path, "rb") as fIn:
        cache_data = pickle.load(fIn)

    # The cache is only usable if it was built from exactly the corpus loaded above.
    # `catalogue` is derived from the current dataframe, so embeddings computed for a
    # different corpus would pair similarity scores with the wrong titles.
    if isinstance(cache_data, dict) and cache_data.get('corpus') == corpus:
        corpus_embeddings = cache_data['embeddings']
    else:
        print("Cached corpus does not match the current data. Re-encoding")

if corpus_embeddings is None:
    print("Encoding the corpus. This might take a while")
    corpus_embeddings = embedder.encode(corpus, show_progress_bar=True, convert_to_tensor=True)

    print("Storing file on disc")
    with open(embedding_cache_path, "wb") as fOut:
        pickle.dump({'corpus': corpus, 'embeddings': corpus_embeddings}, fOut)

Encoding the corpus. This might take a while


Batches:   0%|          | 0/519 [00:00<?, ?it/s]

Storing file on disc


NameError: name 'pickle' is not defined

In [100]:
# Recommending the Top K similar books or movies

def recommendations(query, top_k = 3):
    '''
    Print the top_k catalogue entries whose descriptions are closest to the query.

    Input: query - plot description, as a string or a single-item list of strings
           top_k - maximum number of matches to print (default 3)
    Output: None; the title, description and cosine-similarity score of each match
            are printed, best match first

    Relies on the notebook-level names embedder, util, corpus_embeddings and catalogue.
    '''
    # Compute the embedding for the query
    query_embeddings = embedder.encode(query, convert_to_tensor=True)

    # Cosine similarity of the query against every description; row 0 is the query
    cosine_scores = util.cos_sim(query_embeddings, corpus_embeddings)
    scores = cosine_scores[0].tolist()  # plain list of floats, one per corpus entry

    # Rank every corpus position by score, best first. The range has to span the
    # whole corpus: stopping at len(scores) - 1 would make the final entry
    # impossible to recommend. sorted() is stable, so ties keep corpus order.
    ranked = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)

    # A negative top_k would otherwise slice from the end; treat it as "no results"
    for i in ranked[:max(top_k, 0)]:
        description, title = catalogue[i]
        print("Title: {} \nDescription: {}\n Score: {:.4f}".format(title, description, scores[i]))

## Run the function

In [101]:
# Get plot description from the user.
# input() blocks until text is submitted, so this cell needs an interactive kernel.
query = input("Enter the plot of the film or book you are trying to remember: ")

Enter the plot of the film or book you are trying to remember:A man and a woman survive a plane crash in the mountains.


In [102]:
# Print the closest catalogue matches for the entered description (top_k defaults to 3)
recommendations(query) 

Title: Survive 
Description: Thriller, Drama, Adventure: When their plane crashes on a remote snow-covered mountain, Jane and Paul have to fight for their lives as the only remaining survivors. Together they embark on a harrowing journey out of the wilderness.
 Score: 0.5107
Title: The Mountain Between Us 
Description: Drama, Adventure, Romance: Stranded on a mountain after a tragic plane crash, two strangers must work together to endure the extreme elements of the remote, snow-covered terrain. When they realize help is not coming, they embark on a perilous journey across hundreds of miles of wilderness, pushing each other to survive and discovering their inner strength.
 Score: 0.5076
Title: Horizon Line 
Description: Thriller: A couple flying on a small plane to attend a tropical island wedding must fight for their lives after their pilot suffers a heart attack.
 Score: 0.4906
