<a href="https://colab.research.google.com/github/sdhilip200/Content-Based-Recommendation---Good-Reads-data/blob/master/Recommendation_Word2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
# !pip install pandas gradio numpy nltk torch sentence_transformers matplotlib

In [2]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import nltk
import os
import pickle
import gradio as gr

import matplotlib.pyplot as plt
%matplotlib inline

from matplotlib import pyplot

from sentence_transformers import SentenceTransformer, util
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Define constants
model = 'all-MiniLM-L12-v2'

## Read Training Data

In [47]:
def process_movie_data():
    # Reading data from the genre, name, and description columns
    wiki = pd.read_csv("wiki_movie_plots_deduped.csv", usecols = ['Title', 'Plot'], low_memory = True, on_bad_lines="skip")
    wiki.columns = ['title', 'overview']
    imdb = pd.read_csv("imdb_25k.csv", usecols = ['movie title', 'Overview'], low_memory = True, on_bad_lines="skip")
    imdb.columns = ['title', 'overview']

    # Concatenate the film and book dataframes
    # movies = pd.concat([wiki, imdb], ignore_index=True).drop_duplicates('title')
    
    movies = pd.concat([wiki, imdb], ignore_index=False)
    movies.groupby(['title'])['overview'].sum().reset_index()
    print(movies.head(3))
    movies.dropna(how='any', inplace=True) 

    print(movies.info())

    # Get list of strings as input
    corpus = movies['overview'].to_list()
    titles = movies['title'].to_list()

    catalogue = list(zip(corpus, titles))
    return corpus, catalogue

In [48]:
def embed_corpus(model, corpus):
    embedder = SentenceTransformer(model)
    print("Encoding the corpus. This might take a while")
    corpus_embeddings = embedder.encode(corpus, show_progress_bar=True, convert_to_tensor=True)
    return corpus_embeddings

# Load Embeddings

In [49]:
# Function to get embeddings or load existing ones
def get_embeds(path, model ='all-MiniLM-L12-v2'):
    if not os.path.exists(path):
        corpus, catalogue  = process_movie_data()
        corpus_embeddings = embed_corpus(model, corpus)       
        print("Storing file on disc")
        with open(path, "wb") as fOut:
            pickle.dump({'catalogue': catalogue, 'embeddings': corpus_embeddings}, fOut)
    else:
        print("Loading pre-computed embeddings from disc")
        with open(path, "rb") as fIn:
            cache_data = pickle.load(fIn)
            catalogue = cache_data['catalogue'] 
            corpus_embeddings = cache_data['embeddings']
        print("Embeddings loaded")
    return catalogue, corpus_embeddings

# SBERT embeddings

In [50]:
# Define path for stored embeddings
path = "stored_embed"
catalogue, corpus_embeddings = get_embeds(path)

                           title  \
0         Kansas Saloon Smashers   
1  Love by the Light of the Moon   
2        The Martyred Presidents   

                                            overview  
0  A bartender is working at a saloon, serving dr...  
1  The moon, painted with a smiling face hangs ov...  
2  The film, just over a minute long, is composed...  
<class 'pandas.core.frame.DataFrame'>
Index: 59044 entries, 0 to 24401
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title     59044 non-null  object
 1   overview  59044 non-null  object
dtypes: object(2)
memory usage: 1.4+ MB
None
Encoding the corpus. This might take a while


Batches: 100%|█████████████████████████████████████████████████████████████████████| 1846/1846 [49:15<00:00,  1.60s/it]


Storing file on disc


In [51]:
# Recommending the Top K similar books or movies

def recommendations(query, corpus_embeddings=corpus_embeddings, top_k = 3,model ='all-MiniLM-L12-v2'):
    '''
    Input: query as a single-item list
    Finds k nearest descriptions
    Output: titles for k nearest descriptions
    
    '''
    embedder = SentenceTransformer(model)
    #Compute embeddings for the query
    query_embeddings = embedder.encode(query, convert_to_tensor=True)
    
    #Compute cosine-similarities for the query with each description
    cosine_scores = util.cos_sim(query_embeddings, corpus_embeddings)
    scores = cosine_scores[0].tolist() #get a list of scores

    #Find the pairs with the highest cosine similarity scores
    pairs = [{'index': [i], 'score': scores[i]} for i in range(len(scores)-1)]

    #Sort scores in decreasing order
    pairs = sorted(pairs, key=lambda x: x['score'], reverse=True)

    results = ""
    for pair in pairs[0:top_k]:
        i = pair['index'][0]
        result = "<p>Title: {}<br>Description: {}<p>Score: {:.4f}<p>".format(catalogue[i][1], # try yield?
                                                                                         catalogue[i][0],                                                                                         
                                                                                         pair['score'])
        results += result

    return results

# Run the function

In [52]:
demo = gr.Interface(fn=recommendations, 
                    inputs=[gr.Textbox(placeholder="Enter plot summary here...")], 
                    outputs=[ gr.HTML()])

demo.launch(share=True)  

Running on local URL:  http://127.0.0.1:7860

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.


