In [10]:
import numpy as np
import pandas as pd
import os, sys

from collections import Counter
import matplotlib.pyplot as plt
%matplotlib inline

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
## Path to the downloaded Gutenberg corpus: a 'gutenberg' directory two levels
## above the current working directory.
## NOTE(review): no cell shown in this notebook reads path_gutenberg — confirm
## whether get_book needs it passed explicitly or locates the corpus itself.
path_gutenberg = os.path.join(os.pardir,os.pardir,'gutenberg')

In [12]:
## Import internal helper functions.
## The sibling 'src' directory (one level up) is appended to sys.path so that
## the project-local modules data_io and jsd can be imported from the notebook.
src_dir = os.path.join(os.pardir,'src')
sys.path.append(src_dir)
from data_io import get_book
from jsd import jsdalpha

## Compute dissimilarity between two books

We first load both books at the `counts` level.
- Recall that `counts` is a dictionary of the form `{word: count}`.

In [13]:
# Select your favourite two books; these identifiers are passed to get_book
# in the cells below.
pg_id1 = 'PG105' # Persuasion
pg_id2 = 'PG1661' # The Adventures of Sherlock Holmes

In [14]:
# Load the first book (Persuasion) at the 'counts' level: { word : count }.
level = 'counts'
counts_persuasion = get_book(pg_id1, level=level)
# Bare last expression so the notebook displays the loaded mapping.
counts_persuasion

{'the': 3327,
 'and': 2785,
 'to': 2781,
 'of': 2568,
 'a': 1592,
 'in': 1382,
 'was': 1336,
 'her': 1204,
 'had': 1187,
 'she': 1146,
 'i': 1121,
 'it': 1036,
 'not': 976,
 'he': 960,
 'be': 950,
 'that': 880,
 'as': 810,
 'for': 707,
 'but': 664,
 'his': 659,
 'with': 654,
 'you': 626,
 'have': 589,
 'at': 533,
 'all': 530,
 'been': 495,
 'anne': 494,
 'him': 467,
 'could': 451,
 'very': 434,
 'they': 433,
 'were': 426,
 'by': 417,
 'which': 416,
 'is': 398,
 'on': 396,
 'so': 359,
 'no': 355,
 'would': 355,
 'captain': 303,
 'from': 295,
 'their': 293,
 'mrs': 291,
 'elliot': 285,
 'there': 285,
 'more': 273,
 'or': 273,
 'them': 269,
 'mr': 256,
 'this': 250,
 'an': 245,
 'than': 243,
 'one': 237,
 'must': 228,
 'when': 228,
 'my': 223,
 'being': 220,
 'only': 219,
 'lady': 215,
 'do': 215,
 'wentworth': 212,
 'such': 211,
 'much': 205,
 'if': 202,
 'any': 200,
 'what': 197,
 'who': 190,
 'should': 188,
 'me': 188,
 'little': 176,
 'said': 173,
 'good': 170,
 'might': 166,
 'charle

In [15]:
# Load the second book (Sherlock Holmes) at the same level ('counts') that was
# set in the previous cell, then display it.
counts_sherlock = get_book(pg_id2, level=level)
counts_sherlock

{'the': 5613,
 'and': 2991,
 'i': 2990,
 'to': 2681,
 'of': 2654,
 'a': 2636,
 'in': 1761,
 'that': 1742,
 'it': 1721,
 'he': 1482,
 'you': 1466,
 'was': 1412,
 'his': 1158,
 'is': 1123,
 'my': 999,
 'have': 922,
 'as': 851,
 'had': 833,
 'with': 830,
 'which': 769,
 'at': 767,
 'for': 723,
 'not': 684,
 'but': 643,
 'me': 635,
 'be': 627,
 'we': 523,
 'there': 504,
 'from': 495,
 'said': 486,
 'this': 484,
 'upon': 465,
 'holmes': 461,
 'so': 446,
 'him': 434,
 'her': 430,
 'she': 425,
 'very': 396,
 'your': 395,
 'been': 393,
 'all': 387,
 'what': 380,
 'on': 379,
 'no': 378,
 'one': 372,
 'then': 361,
 'were': 348,
 'by': 343,
 'are': 334,
 'would': 333,
 'an': 332,
 'when': 322,
 'out': 318,
 'do': 309,
 'man': 303,
 'up': 301,
 'could': 287,
 'has': 284,
 'into': 275,
 'who': 270,
 'little': 269,
 'will': 262,
 'if': 257,
 'some': 242,
 'now': 234,
 'see': 229,
 'down': 229,
 'can': 218,
 'should': 211,
 'our': 208,
 'they': 197,
 'or': 196,
 'may': 195,
 'am': 185,
 'us': 183,
 '

### Jaccard distance

In [16]:
# Define Jaccard distance 
def jaccard_distance(b1, b2):
    '''
    Jaccard distance between the vocabularies of two books.

    Parameters
    ----------
    b1, b2 : dict
        Books at the 'counts' level, i.e. mappings { word : count }. Only the
        keys (the distinct words) are used; the counts are ignored.

    Returns
    -------
     : float
        1 - |intersection| / |union| of the two key sets, a value in [0, 1].
        0 means identical vocabularies, 1 means no word in common. Two empty
        books are treated as identical and give 0.0.
    '''
    b1_words = set(b1.keys())
    b2_words = set(b2.keys())
    union = b1_words | b2_words

    # Both vocabularies empty: the ratio below would divide by zero.
    if not union:
        return 0.0

    intersection = b1_words & b2_words
    return 1 - len(intersection)/len(union)

In [17]:
# Compute Jaccard distance between Persuasion and Sherlock Holmes.
# Only the sets of distinct words (the dictionary keys) enter this measure;
# the word frequencies are ignored.
dissimilarity_jaccard = jaccard_distance(counts_persuasion, counts_sherlock)
dissimilarity_jaccard

0.6799721282102329

### Jensen-Shannon divergence

In [18]:
# Load jsd function.
# NOTE: jsdalpha is already imported in the helper-import cell near the top of
# the notebook; this repeat is redundant but harmless.
from jsd import jsdalpha

In [19]:
# Divergence between the two books' word counts, computed by the project
# helper jsdalpha with alpha forwarded as its `alpha` argument.
# NOTE(review): presumably alpha = 1 selects the standard (Shannon-entropy)
# Jensen-Shannon divergence — confirm against src/jsd.py.
alpha = 1
dissimilarity_jsd = jsdalpha(counts_persuasion, counts_sherlock, alpha=alpha)
dissimilarity_jsd

0.12231363072666213

### Angular distance between embedded books

We first load both books at the `tokens` level.
- Recall that `tokens` is a list of tokens in reading order, where each token is a string.

In [20]:
# Reload the first book (Persuasion), this time at the 'tokens' level.
# Note that `level` is reassigned here, replacing the 'counts' value used in
# the earlier cells.
level = 'tokens'
tokens_persuasion = get_book(pg_id1, level=level)
# Bare last expression so the notebook displays the token list.
tokens_persuasion

['by',
 'al',
 'haines',
 'persuasion',
 'by',
 'jane',
 'austen',
 'chapter',
 'sir',
 'walter',
 'elliot',
 'of',
 'kellynch',
 'hall',
 'in',
 'somersetshire',
 'was',
 'a',
 'man',
 'who',
 'for',
 'his',
 'own',
 'amusement',
 'never',
 'took',
 'up',
 'any',
 'book',
 'but',
 'the',
 'baronetage',
 'there',
 'he',
 'found',
 'occupation',
 'for',
 'an',
 'idle',
 'hour',
 'and',
 'consolation',
 'in',
 'a',
 'distressed',
 'one',
 'there',
 'his',
 'faculties',
 'were',
 'roused',
 'into',
 'admiration',
 'and',
 'respect',
 'by',
 'contemplating',
 'the',
 'limited',
 'remnant',
 'of',
 'the',
 'earliest',
 'patents',
 'there',
 'any',
 'unwelcome',
 'sensations',
 'arising',
 'from',
 'domestic',
 'affairs',
 'changed',
 'naturally',
 'into',
 'pity',
 'and',
 'contempt',
 'as',
 'he',
 'turned',
 'over',
 'the',
 'almost',
 'endless',
 'creations',
 'of',
 'the',
 'last',
 'century',
 'and',
 'there',
 'if',
 'every',
 'other',
 'leaf',
 'were',
 'powerless',
 'he',
 'could',


In [21]:
# Load the second book (Sherlock Holmes) at the 'tokens' level set in the
# previous cell, then display it.
tokens_sherlock = get_book(pg_id2, level=level)
tokens_sherlock

['the',
 'adventures',
 'of',
 'sherlock',
 'holmes',
 'by',
 'sir',
 'arthur',
 'conan',
 'doyle',
 'i',
 'a',
 'scandal',
 'in',
 'bohemia',
 'ii',
 'the',
 'league',
 'iii',
 'a',
 'case',
 'of',
 'identity',
 'iv',
 'the',
 'boscombe',
 'valley',
 'mystery',
 'the',
 'five',
 'orange',
 'pips',
 'vi',
 'the',
 'man',
 'with',
 'the',
 'twisted',
 'lip',
 'vii',
 'the',
 'adventure',
 'of',
 'the',
 'blue',
 'carbuncle',
 'viii',
 'the',
 'adventure',
 'of',
 'the',
 'speckled',
 'band',
 'ix',
 'the',
 'adventure',
 'of',
 'the',
 'engineer',
 'thumb',
 'x',
 'the',
 'adventure',
 'of',
 'the',
 'noble',
 'bachelor',
 'xi',
 'the',
 'adventure',
 'of',
 'the',
 'beryl',
 'coronet',
 'xii',
 'the',
 'adventure',
 'of',
 'the',
 'copper',
 'beeches',
 'adventure',
 'i',
 'a',
 'scandal',
 'in',
 'bohemia',
 'i',
 'to',
 'sherlock',
 'holmes',
 'she',
 'is',
 'always',
 'the',
 'woman',
 'i',
 'have',
 'seldom',
 'heard',
 'him',
 'mention',
 'her',
 'under',
 'any',
 'other',
 'name'

In [33]:
# Import SentenceTransformer. The book-embedding function itself is not
# imported; it is defined in a later cell of this notebook.
from sentence_transformers import SentenceTransformer

In [23]:
# Load the sentence-embedding model identified by the name below.
# NOTE(review): presumably the weights are downloaded on first use and cached
# locally afterwards — confirm if this notebook must run offline.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')


In [26]:
# Function for embedding books (see src/pretrained_embedding.py)
def embed_book_tokens(book, model, max_length):
    '''
    Generates an embedding of the given book.

    The book is cut into consecutive chunks of at most `max_length` words,
    each chunk is joined into a single space-separated string, all chunks are
    embedded in one call to `model.encode`, and the chunk embeddings are
    averaged. The average is unweighted: a shorter final chunk counts as much
    as a full one.

    Parameters
    ----------
    book : list
        A book with level = 'tokens'. This is a list of words, in the order they appear 
        in the original text.
    
    model : embedding model
        The model used to embed the sentences in the book. It must provide an
        `encode` method that accepts a list of strings and returns one
        embedding per string.
    
    max_length : integer
        The maximum number of tokens being input to the embedding model at once.
        Must be positive.
    
    Returns
    -------
     : numpy array or None
        The embedded book, represented by a numpy array, or None when the
        book contains no tokens.

    Raises
    ------
    ValueError
        If `max_length` is not positive (the chunking would never advance).
    '''
    if max_length <= 0:
        raise ValueError("max_length must be a positive integer")

    # Nothing to embed: skip the model call entirely.
    if len(book) == 0:
        return None

    # Paste the words of each chunk together. Slicing past the end of the
    # list is safe in Python, so the last (shorter) chunk needs no special case.
    stored_strings = [
        " ".join(book[start:start + max_length])
        for start in range(0, len(book), max_length)
    ]

    embeddings = model.encode(stored_strings)
    if len(embeddings) == 0:
        return None

    # Unweighted mean over the chunk embeddings.
    return sum(embeddings)/len(embeddings)

In [27]:
# Embed Persuasion
# max_length is the number of words per chunk handed to the model.
# NOTE(review): presumably 256 is the model's input limit and 50 a safety
# margin for words that expand into several model tokens — confirm.
max_length = 256 - 50
persuasion_embedded = embed_book_tokens(tokens_persuasion, model, max_length)

In [28]:
# Embed Sherlock Holmes, using the same model and the same max_length as for
# Persuasion.
sherlock_embedded = embed_book_tokens(tokens_sherlock, model, max_length)

In [31]:
# Define angular distance
from numpy.linalg import norm

def angular_distance(vec1, vec2):
    '''
    Normalised angular distance between two vectors.

    Parameters
    ----------
    vec1, vec2 : array-like
        One-dimensional vectors of equal length.

    Returns
    -------
     : float
        The angle between the vectors divided by pi, a value in [0, 1]:
        0 for vectors pointing the same way, 0.5 for orthogonal vectors and
        1 for opposite vectors. If either vector has zero norm the result
        is nan.
    '''
    cosine_sim = np.inner(vec1, vec2) / (norm(vec1) * norm(vec2))
    # Rounding can push the ratio marginally outside [-1, 1] (typically for
    # parallel vectors), where arccos returns nan; clamp it back into range.
    cosine_sim = np.clip(cosine_sim, -1.0, 1.0)
    return np.arccos(cosine_sim)/np.pi

In [32]:
# Angular distance between the two averaged book embeddings.
dissimilarity_angular = angular_distance(persuasion_embedded, sherlock_embedded)
dissimilarity_angular

0.2236446975160141