In [None]:
# Uncomment the two lines below to install the required packages on a fresh
# environment (e.g. a new Colab runtime); they are left disabled by default.
#!pip install -U sentence-transformers
#!pip install datasets

In [None]:
# All imports live in one block (stdlib first, then third-party) so the cell's
# dependencies are visible up front and a missing package fails before any
# model or dataset download starts.
import gzip
import json
import os
import time

import torch
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, CrossEncoder, util

if not torch.cuda.is_available():
  print("Warning: No GPU found. Please add GPU to your notebook")


# We use the Bi-Encoder to encode all passages, so that we can use it with semantic search
model_name = 'nq-distilbert-base-v1'
bi_encoder = SentenceTransformer(model_name)
top_k = 5  # Number of passages we want to retrieve with the bi-encoder


# Only the first 1000 rows of the train split are loaded (see the split expression).
ds = load_dataset("Coder-Dragon/wikipedia-movies", split='train[:1000]')



In [None]:
# Display the dataset summary (feature names and number of rows).
ds

Dataset({
    features: ['Release Year', 'Title', 'Origin/Ethnicity', 'Director', 'Cast', 'Genre', 'Wiki Page', 'Plot', 'Image'],
    num_rows: 1000
})

In [None]:
# Pull out the two columns used for retrieval: the title identifies a hit,
# the plot supplies the text that is searched.
titles, plots = ds['Title'], ds['Plot']

In [None]:
# Encode every movie as "<title> <plot>" with the bi-encoder. The embeddings are
# computed here rather than downloaded, so they always match whichever model is
# loaded in `bi_encoder`; no model-name check is needed.
corpus = [title + " " + plot for title, plot in zip(titles, plots)]
corpus_embeddings = bi_encoder.encode(corpus, convert_to_tensor=True, show_progress_bar=True)
# Move the embeddings to the GPU only when one exists: an unconditional
# .to('cuda') raises on CPU-only machines.
if torch.cuda.is_available():
    corpus_embeddings = corpus_embeddings.to('cuda')

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

In [None]:
def search(query):
    """Print the top-k movie titles most similar to `query`.

    The query is embedded with the bi-encoder and compared against
    `corpus_embeddings` via `util.semantic_search`. The elapsed retrieval
    time and each hit's score and title are printed; nothing is returned.
    """
    started = time.time()
    query_embedding = bi_encoder.encode(query, convert_to_tensor=True)
    # Only one query is submitted, so keep just the first result list.
    top_hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=top_k)[0]
    elapsed = time.time() - started

    # Report the query, the retrieval time, then one "score<TAB>title" row per hit.
    print("Input question:", query)
    print("Results (after {:.3f} seconds):".format(elapsed))
    for hit in top_hits:
        score, title = hit['score'], titles[hit['corpus_id']]
        print("\t{:.3f}\t{}".format(score, title))

In [None]:
# Descriptive, full-sentence query.
search(query="Documentaries showcasing indigenous peoples' survival and daily life in Arctic regions")

Input question: Documentaries showcasing indigenous peoples' survival and daily life in Arctic regions
Results (after 0.025 seconds):
	0.452	Nanook of the North
	0.313	David Copperfield
	0.275	Straight Shooting
	0.268	The Frozen North
	0.244	The Salvation Hunters


In [None]:
# Short, two-word genre query.
search(query="Western romance")

Input question: Western romance
Results (after 0.021 seconds):
	0.305	Romance
	0.273	The Great Gatsby
	0.269	Youth's Endearing Charm
	0.268	A Little Journey
	0.263	Frankenstein


In [None]:
# Plot-summary style query.
search(query="Silent film about a Parisian star moving to Egypt, leaving her husband for a baron, and later reconciling after finding her family in poverty in Cairo.")

Input question: Silent film about a Parisian star moving to Egypt, leaving her husband for a baron, and later reconciling after finding her family in poverty in Cairo.
Results (after 0.011 seconds):
	0.497	Married in Hollywood
	0.441	The Gay Deceiver
	0.419	One Hysterical Night
	0.417	Fifty Million Frenchmen
	0.415	The King on Main Street


In [None]:
# Comma-separated keyword query.
search(query="Comedy film, office disguises, boss's daughter, elopement.")

Input question: Comedy film, office disguises, boss's daughter, elopement.
Results (after 0.030 seconds):
	0.393	Dressed to Kill
	0.388	Youth's Endearing Charm
	0.366	A Little Journey
	0.354	The Boy Friend
	0.330	Manhandled


In [None]:
# Comma-separated list of plot points.
search(query="Lost film, Cleopatra charms Caesar, plots world rule, treasures from mummy, revels with Antony, tragic end with serpent in Alexandria.")

Input question: Lost film, Cleopatra charms Caesar, plots world rule, treasures from mummy, revels with Antony, tragic end with serpent in Alexandria.
Results (after 0.012 seconds):
	0.560	Cleopatra
	0.372	The Lost World
	0.356	The Man Who Lost Himself
	0.336	The Golden Louis
	0.318	Reaching for the Moon


In [None]:
# Query made up of a single proper name.
search(query="Denis Gage Deane-Tanner")

Input question: Denis Gage Deane-Tanner
Results (after 0.020 seconds):
	0.360	The Man in Possession
	0.338	Souls for Sale
	0.336	The Man from Blankley's
	0.305	Old Clothes
	0.303	Blind Youth


# Analysis

Recall@1:
 The correct film was ranked first for 2 of the 6 queries, so Recall@1 = 2/6 ≈ 0.333.

Mean Reciprocal Rank (MRR):

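Two queries return the correct film at rank 1, giving a reciprocal rank of 1/1 = 1: the Arctic documentary query ("Nanook of the North") and the Cleopatra query ("Cleopatra"). The other four queries do not return the correct film in the top 5, so each contributes a reciprocal rank of 0. In query order the reciprocal ranks are 1, 0, 0, 0, 1 and 0, so MRR = (1 + 0 + 0 + 0 + 1 + 0) / 6 = 2/6 ≈ 0.333.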

What type of queries tend to do well? Which not so well?


The queries I noticed that do well are the ones containing more distinctive words. For example, the first query mentions "indigenous peoples" and "Arctic"; these terms are far more distinctive than "comedy" or "western romance". Because semantic search ranks passages by the cosine similarity between the query embedding and each passage embedding, a query built from distinctive words, especially words that tend to appear together in the same texts (like "indigenous" and "Arctic"), is matched much more accurately.


For the queries on which the model didn't perform well, what could be two alternative approaches?

1. More descriptive queries, for example: "western romance" is a vague query. I am not sure if a human could figure that out, let alone a machine or model.

2. Another way could be adjusting how much weight the title and the plot each receive in the text that is embedded for the search (currently they are simply concatenated). I think this could allow the model to run a better semantic search, as the plot carries more information than the title.