In [143]:
import torch
import polars as pl
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

In [163]:
from pypdf import PdfReader

reader = PdfReader("data/in/Harry Potter and the Half-Blood Prince.pdf")

In [164]:
len(reader.pages)

162

In [171]:
script = "\n".join(
    [page.extract_text() for page in reader.pages]
)
print(len(script))
print(set(script))
print()
print(script[:1000])

192985
{'6', 't', 'X', '7', 'a', ':', '.', '©', 'o', '-', 'f', 'B', 'l', ')', 'g', 'L', '*', '5', 'N', '4', '(', '?', 'P', 'Q', 'm', 'c', 'x', 'y', 'j', 'W', 'u', 'F', 'i', 'k', 'A', 'v', '‘', 'T', '\n', 'V', 'r', ';', '9', 'H', 'z', 'e', 'E', 'J', '!', '“', '2', 'M', '+', '0', 'Y', '1', 'h', 'q', "'", 'd', 'D', 's', '&', ',', 'n', 'G', '’', 'K', 'w', 'I', 'b', 'S', 'R', ' ', 'C', 'O', 'U', '/', 'p', '8', '3', 'Z', '”'}

Rev. 09/13/07 (Blue)
Rev. 10/02/07 (Pink)
Rev. 11/06/07 (Yellow)
HARRY POTTER AND THE HALF-BLOOD PRINCE
screenplay by
Steve Kloves
 based on the novel by J.K. Rowling
This script is the confidential and proprietary 
property of Warner Bros. Pictures and no portion of 
it may be performed, distributed, reproduced, used, 
quoted or published without prior written permission.
August 28, 2007
WARNER BROS. PICTURES INC. © 2007
4000 Warner Boulevard WARNER BROS. ENT.
Burbank, California  91522 All Rights Reserved
DARKNESS.
THUNDER -- or something like it -- sounds in the 
di

In [170]:
print(reader.pages[0].extract_text() + reader.pages[1].extract_text())

Rev. 09/13/07 (Blue)
Rev. 10/02/07 (Pink)
Rev. 11/06/07 (Yellow)
HARRY POTTER AND THE HALF-BLOOD PRINCE
screenplay by
Steve Kloves
 based on the novel by J.K. Rowling
This script is the confidential and proprietary 
property of Warner Bros. Pictures and no portion of 
it may be performed, distributed, reproduced, used, 
quoted or published without prior written permission.
August 28, 2007
WARNER BROS. PICTURES INC. © 2007
4000 Warner Boulevard WARNER BROS. ENT.
Burbank, California  91522 All Rights ReservedDARKNESS.
THUNDER -- or something like it -- sounds in the 
distance.
BOOM.  Then again.  BOOM.
We GLIDE THROUGH the inky blackness.  Ambient flashes 
illuminate the silhouette of the WB LOGO.  We PASS 
THROUGH.
INTO more darkness.  Lost.  More FLASHES.  And we --
CUT TO:
A SINGLE EYE
Blank behind glasses.  FLASH!  The PUPIL CONTRACTS and  
we --
CUT TO:
INT. MINISTRY OF MAGIC - ATRIUM - WIDER ANGLE - DAY1 1
To find... HARRY POTTER, standing numbly beside ALBUS 
DUMBLEDORE amidst a M

In [146]:
df = pl.read_parquet("data/out/internet-movie-script.parquet").with_row_index()
embeddings = torch.load("data/out/scripts-embedded.pt", weights_only=True)
print(df.shape)
print(embeddings.shape)
print(embeddings.dtype)

(59, 13)
torch.Size([59, 1024])
torch.float32


In [147]:
embeddings_duplicated_over_0 = embeddings.unsqueeze(1).expand(-1, embeddings.shape[0], -1)
embeddings_duplicated_over_1 = embeddings.unsqueeze(0).expand(embeddings.shape[0], -1, -1)
print(embeddings_duplicated_over_0.shape)
print(embeddings_duplicated_over_1.shape)

print(
    (embeddings_duplicated_over_0[1, 13, :] == embeddings[1, :]).all()
)

print(
    (embeddings_duplicated_over_1[13, 1, :] == embeddings[1, :]).all()
)

torch.Size([59, 59, 1024])
torch.Size([59, 59, 1024])
tensor(True)
tensor(True)


In [148]:
cos_sim_layer = torch.nn.CosineSimilarity(dim=2)

cos_sim = cos_sim_layer(embeddings_duplicated_over_0, embeddings_duplicated_over_1)

print(cos_sim.shape)

dotprod = torch.tensordot(embeddings, embeddings, dims=([1], [1]))

print(dotprod.shape)

torch.Size([59, 59])
torch.Size([59, 59])


In [149]:
np_cos_sim = cos_sim.flatten().numpy()
np_dotprod = dotprod.flatten().numpy()

csdf = pl.DataFrame(np_cos_sim)
dtdf = pl.DataFrame(np_dotprod)

csdf.columns = ["cos"]
dtdf.columns = ["dot"]

simdf = csdf.with_row_index().join(
    dtdf.with_row_index(),
    on=pl.col("index"),
    how="inner"
)
print(simdf.shape)
simdf.select(pl.col("cos", "dot")).corr()


(3481, 3)


cos,dot
f64,f64
1.0,0.68423
0.68423,1.0


In [154]:
def rank_similarity(
    movie_title: str,
    rank_expr: str | pl.Expr,
    print_similarity_correlation: bool = True
) -> pl.DataFrame:
    
    idx = get_index_in_df_from_title(movie_title)
    

    simdf = pl.DataFrame(
        {"dotprod": dotprod[idx].numpy(), "cos_sim": cos_sim[idx].numpy()}
    ).with_row_index()\
        .join(
            df.select(pl.col("index", "movie_title")),
            on = pl.col("index"),
            how = "inner"
        )\
        .filter(pl.col("index") != idx) # filter out exact match
    
    if print_similarity_correlation is True:
        print_similarity_correlation_helper(simdf)

    return simdf.sort(rank_expr, descending=True)

def get_index_in_df_from_title(movie_title: str) -> int:
    idx = df.filter(pl.col("movie_title").str.to_lowercase() == movie_title.lower()).select(pl.col("index"))
    if len(idx) < 1:
        raise ValueError(f"Could not find movie {movie_title}")
    elif len(idx) > 1:
        raise ValueError(f"Found multiple matches for movie {movie_title}\n{idx}")
    else:
        idx = idx[0, "index"]
        return idx
    
def print_similarity_correlation_helper(
    data: pl.DataFrame,
    col1: str = "dotprod",
    col2: str = "cos_sim"
) -> None:
    corr_dotprod_cos = data.select(pl.col(col1, col2)).corr()\
            [0, col2]
    msg = "Correlation between dot product and cosine similarity "
    msg += f"across all movies: {corr_dotprod_cos:.3f}"
    print(msg)
    

In [162]:
cfg = pl.Config()
cfg.set_tbl_rows(2000)
rank_similarity("Beavis and Butt-head Do America", "dotprod")

Correlation between dot product and cosine similarity across all movies: 0.750


index,dotprod,cos_sim,movie_title
u32,f32,f32,str
34,192.016571,0.839823,"""South Park"""
43,191.851791,0.847036,"""30 Minutes or Less"""
25,190.767944,0.883323,"""Monkeybone"""
36,186.909409,0.833314,"""Toy Story"""
32,185.124084,0.807942,"""Shrek"""
13,184.71225,0.805794,"""Finding Nemo"""
41,184.484756,0.830212,"""15 Minutes"""
48,184.111008,0.808486,"""Air Force One"""
30,183.412445,0.841352,"""The Rescuers Down Under"""
44,181.815918,0.838162,"""48 Hrs"""
