# Book Recommendation Engine

Importing necessary libraries

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from os import listdir
from os.path import isfile, join
from load_sql import Loader

pd.set_option('display.max_columns', 10)

In [2]:
# Load sql data to DataFrames and save them in folder "data"
Loader.load_sql_to_df()

In [3]:
path = '.\\data'

# Find all files in dir 'path' and unpickle them to dfs
allfiles = [f for f in listdir(path) if isfile(join(path, f))]
print(allfiles)

dfs = {file.strip('.pickle'):pd.read_pickle(f'.\\data\\{file}') for file in allfiles}

books = dfs['books_df']
books.drop(['image_URL_S', 'image_URL_M', 'image_URL_L'], axis=1, inplace=True)
# users = dfs['users_df']
# ratings = dfs['ratings_df']
user_ratings = dfs['user_ratings_df']

['books_df.pickle', 'ratings_df.pickle', 'users_df.pickle', 'user_ratings_df.pickle']


In [4]:
books.loc[:, ['ISBN']]

Unnamed: 0,ISBN
0,0195153448
1,0002005018
2,0060973129
3,0374157065
4,0393045218
...,...
271373,0440400988
271374,0525447644
271375,006008667X
271376,0192126040


In [5]:
# Dropping incorrect ISBN format in books df
print(books.shape)
books['correct_ISBN'] = books['ISBN'].str.isnumeric()
print(books['correct_ISBN'].value_counts())
books.drop(books[books['correct_ISBN'] == False].index, inplace=True)
books.drop(['correct_ISBN'], axis=1, inplace=True)
print(books.shape)

books['correct_len'] = (books['ISBN'].str.len() == 10) | (books['ISBN'].str.len() == 13)
books.drop(books[books['correct_len'] == False].index, inplace=True)
books.drop(['correct_len'], axis=1, inplace=True)
print(books.shape)

books


(271378, 5)
True     249039
False     22339
Name: correct_ISBN, dtype: int64
(249039, 5)
(249039, 5)


Unnamed: 0,ISBN,book_title,book_author,year_of_publication,publisher
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company
...,...,...,...,...,...
271372,0449906736,Flashpoints: Promise and Peril in a New World,Robin Wright,1993,Ballantine Books
271373,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm)
271374,0525447644,From One to One Hundred,Teri Sloat,1991,Dutton Books
271376,0192126040,Republic (World's Classics),Plato,1996,Oxford University Press


In [6]:
# Checking a correct ISBN format in user_ratings df
print(user_ratings.shape)
user_ratings['correct_ISBN'] = user_ratings['ISBN'].str.isnumeric()
print(user_ratings['correct_ISBN'].value_counts())
user_ratings.drop(user_ratings[user_ratings['correct_ISBN'] == False].index, inplace=True)
user_ratings.drop(['correct_ISBN'], axis=1, inplace=True)
print(user_ratings.shape)

user_ratings['correct_len'] = (user_ratings['ISBN'].str.len() == 10) | (user_ratings['ISBN'].str.len() == 13)
user_ratings.drop(user_ratings[user_ratings['correct_len'] == False].index, inplace=True)
user_ratings.drop(['correct_len'], axis=1, inplace=True)
print(user_ratings.shape)

user_ratings

(1090421, 5)
True     998935
False     91486
Name: correct_ISBN, dtype: int64
(998935, 5)
(991939, 5)


Unnamed: 0,user_id,user_location,age,ISBN,book_rating
0,276747,"iowa city, iowa, usa",25,0060517794,9
1,276747,"iowa city, iowa, usa",25,0451192001,0
2,276747,"iowa city, iowa, usa",25,0609801279,0
3,276747,"iowa city, iowa, usa",25,0671537458,9
4,276747,"iowa city, iowa, usa",25,0679776818,8
...,...,...,...,...,...
1090415,261528,"plano, texas, usa",24,0345370775,9
1090417,261528,"plano, texas, usa",24,0380013924,0
1090418,261528,"plano, texas, usa",24,0380015390,4
1090419,261528,"plano, texas, usa",24,0451161351,8


In [7]:
user_ratings['book_rating'] = pd.to_numeric(user_ratings['book_rating'])
user_ratings.dtypes

user_id           object
user_location     object
age               object
ISBN              object
book_rating      float64
dtype: object

In [8]:
# Ensuring reasonable rating
print(user_ratings.shape)
user_ratings = user_ratings[user_ratings['book_rating'].isin([1,2,3,4,5,6,7,8,9])]
print(user_ratings.shape)

(991939, 5)
(306362, 5)


In [9]:
# Group the 'grouped_rating' df by 'ISBN' and join the 'book_rating' values in the 'book_rating' column separated by a comma
user_ratings = user_ratings.astype(str)
grouped_rating = user_ratings.groupby('ISBN')['book_rating'].apply(lambda x: ', '.join(x)).reset_index()
grouped_rating

Unnamed: 0,ISBN,book_rating
0,0000000000,"9.0, 7.0, 7.0"
1,0000000000000,"8.0, 7.0"
2,0000000010,8.0
3,0000000016964,7.0
4,0000000020,9.0
...,...,...
139886,9997555635,8.0
139887,9999980538,3.0
139888,9999983332,8.0
139889,9999999999,5.0


In [10]:
user_ratings['book_rating'] = user_ratings['book_rating'].astype(float)
try:
    avg_rating = user_ratings.groupby(['ISBN'])['book_rating'].mean()
except:
    avg_rating = 0

avg_rating

ISBN
0000000000       7.666667
0000000000000    7.500000
0000000010       8.000000
0000000016964    7.000000
0000000020       9.000000
                   ...   
9997555635       8.000000
9999980538       3.000000
9999983332       8.000000
9999999999       5.000000
9999999999999    5.000000
Name: book_rating, Length: 139891, dtype: float64

In [11]:
grouped_rating = grouped_rating.merge(avg_rating, on='ISBN', how='left')
grouped_rating

Unnamed: 0,ISBN,book_rating_x,book_rating_y
0,0000000000,"9.0, 7.0, 7.0",7.666667
1,0000000000000,"8.0, 7.0",7.500000
2,0000000010,8.0,8.000000
3,0000000016964,7.0,7.000000
4,0000000020,9.0,9.000000
...,...,...,...
139886,9997555635,8.0,8.000000
139887,9999980538,3.0,3.000000
139888,9999983332,8.0,8.000000
139889,9999999999,5.0,5.000000


In [12]:
books = books.merge(grouped_rating, on='ISBN', how='left')
print(books.shape)
books = books[books['book_rating_x'].notnull()]
print(books.shape)

books

(249039, 7)
(115150, 7)


Unnamed: 0,ISBN,book_title,book_author,year_of_publication,publisher,book_rating_x,book_rating_y
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,"8.0, 9.0, 5.0, 8.0, 8.0, 9.0, 9.0, 7.0",7.875000
2,0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,"8.0, 7.0",7.500000
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,"6.0, 8.0, 9.0, 8.0, 6.0",7.400000
5,0399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,"9.0, 8.0, 8.0, 7.0, 9.0, 8.0, 9.0, 9.0, 7.0, 9...",7.785714
6,0425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,2000,Berkley Publishing Group,8.0,8.000000
...,...,...,...,...,...,...,...
248859,1575661853,The Ultimate Unauthorized Star Wars Trilogy Tr...,James Hatfield,1997,Kensington Pub Corp,6.0,6.000000
248860,1879483378,Milwaukee for Free (Or the Next Thing to It,Susan Rice,1997,Trails Media Group,8.0,8.000000
248864,0307129683,The Trojan Horse (Crayola Kids Adventures),Justine Korman,1997,Golden Books,7.0,7.000000
248874,0679861548,Wolverine: Duty and Honor (X-Men Marvel Comics),Francine Hughes,1994,Random House Children's Books,7.0,7.000000


In [13]:
# Creating a new column 'tag' that contains: title, author, year of publication, and publisher
books = books.astype(str)
books['tag'] = books['book_title'] + ', ' + books['book_author'] +  ', ' + books['year_of_publication'] +  ', ' + books['publisher'] +  ', ' + books['book_rating_y']
books = books.head(15000)
books

Unnamed: 0,ISBN,book_title,book_author,year_of_publication,publisher,book_rating_x,book_rating_y,tag
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,"8.0, 9.0, 5.0, 8.0, 8.0, 9.0, 9.0, 7.0",7.875,"Clara Callan, Richard Bruce Wright, 2001, Harp..."
2,0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,"8.0, 7.0",7.5,"Decision in Normandy, Carlo D'Este, 1991, Harp..."
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,"6.0, 8.0, 9.0, 8.0, 6.0",7.4,Flu: The Story of the Great Influenza Pandemic...
5,0399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,"9.0, 8.0, 8.0, 7.0, 9.0, 8.0, 9.0, 9.0, 7.0, 9...",7.785714285714286,"The Kitchen God's Wife, Amy Tan, 1991, Putnam ..."
6,0425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,2000,Berkley Publishing Group,8.0,8.0,What If?: The World's Foremost Military Histor...
...,...,...,...,...,...,...,...,...
19508,1567403263,Cloud Nine,Luanne Rice,2000,Paperback Nova Audio,9.0,9.0,"Cloud Nine, Luanne Rice, 2000, Paperback Nova ..."
19509,0449002268,News of the Spirit (Ballantine Reader's Circle),LEE SMITH,1998,Ballantine Books,7.0,7.0,News of the Spirit (Ballantine Reader's Circle...
19511,0553345834,What They Don't Teach You at Harvard Business ...,Mark H. McCormack,1988,Bantam,"9.0, 8.0",8.5,What They Don't Teach You at Harvard Business ...
19513,0805058001,Aveda Rituals : A Daily Guide to Natural Healt...,Horst Rechelbacher,1999,Owl Publishing Company,"9.0, 7.0",8.0,Aveda Rituals : A Daily Guide to Natural Healt...


### Bag of Words vs. TF-IDF

Both BoW and TF-IDF are techniques that help us convert text sentences into numeric vectors.
*Word Embedding* is one such technique where we can represent the text using vectors. The more popular forms of word embeddings are:

* **BoW**, which stands for Bag of Words
* **TF-IDF**, which stands for Term Frequency-Inverse Document Frequency
 

#### Bag of Words (BoW) Model
The BoW model is the simplest form of text representation in numbers.

*Example:*
* Review 1: This movie is very scary and long
* Review 2: This movie is not scary and is slow
* Review 3: This movie is spooky and good

We will first build a vocabulary from all the unique words in the above three reviews. The vocabulary consists of these 11 words: ‘This’, ‘movie’, ‘is’, ‘very’, ‘scary’, ‘and’, ‘long’, ‘not’,  ‘slow’, ‘spooky’,  ‘good’.

We can now take each of these words and mark their occurrence in the three movie reviews above with 1s and 0s. This will give us 3 vectors for 3 reviews:

* Vector of Review 1: [1 1 1 1 1 1 1 0 0 0 0]
* Vector of Review 2: [1 1 2 0 0 1 1 0 1 0 0]
* Vector of Review 3: [1 1 1 0 0 0 1 0 0 1 1]

And that’s the core idea behind a Bag of Words (BoW) model.

**Drawbacks of using a Bag-of-Words (BoW) Model**
1. If the new sentences contain new words, then our vocabulary size would increase and thereby, the length of the vectors would increase too.
2. Additionally, the vectors would also contain many 0s, thereby resulting in a sparse matrix (which is what we would like to avoid)
3. We are retaining no information on the grammar of the sentences nor on the ordering of the words in the text.

#### Term Frequency-Inverse Document Frequency (TF-IDF) Model
*Definition: “Term frequency–inverse document frequency, is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus.”*

**Term Frequency (TF)**
Let’s first understand Term Frequent (TF). It is a measure of how frequently a term, t, appears in a document, d:

Term Frequency (tf) formula:

$tf_{t,d} = \frac{n_{t,d}}{Number \: of \: terms \: in \: the \: document}$

Here, in the numerator, n is the number of times the term “t” appears in the document “d”. Thus, each document and term would have its own TF value.

Taking into consideration $\rightarrow$ Review 2: This movie is not scary and is slow

Here,

* Vocabulary: ‘This’, ‘movie’, ‘is’, ‘very’, ‘scary’, ‘and’, ‘long’, ‘not’,  ‘slow’, ‘spooky’,  ‘good’
* Number of words in Review 2 = 8
* TF for the word ‘this’ = (number of times ‘this’ appears in review 2)/(number of terms in review 2) = 1/8

* TF(‘movie’) = 1/8
* TF(‘is’) = 2/8 = 1/4
* TF(‘very’) = 0/8 = 0
* TF(‘scary’) = 1/8
* TF(‘and’) = 1/8
* TF(‘long’) = 0/8 = 0
* TF(‘not’) = 1/8
* TF(‘slow’) = 1/8
* TF( ‘spooky’) = 0/8 = 0
* TF(‘good’) = 0/8 = 0

Inverse Document Frequency (IDF)
IDF is a measure of how important a term is. We need the IDF value because computing just the TF alone is not sufficient to understand the importance of words:

**Inverser Document Frequency(IDF) formula**

$idf_{t} = \frac{Number \: of \: documnets}{Number \: of \: documents \: with \: term \: t}$

We can calculate the IDF values for the all the words in Review 2:

* IDF(‘this’) =  log(number of documents/number of documents containing the word ‘this’) = log(3/3) = log(1) = 0

Similarly,

* IDF(‘movie’, ) = log(3/3) = 0
* IDF(‘is’) = log(3/3) = 0
* IDF(‘not’) = log(3/1) = log(3) = 0.48
* IDF(‘scary’) = log(3/2) = 0.18
* IDF(‘and’) = log(3/3) = 0
* IDF(‘slow’) = log(3/1) = 0.48

Hence, we see that words like “is”, “this”, “and”, etc., are reduced to 0 and have little importance; while words like “scary”, “long”, “good”, etc. are words with more importance and thus have a higher value.

We can now compute the TF-IDF score for each word in the corpus. Words with a higher score are more important, and those with a lower score are less important:

**TF_IDF formula**

$tf-idf_{t,d} = tf_{t,d} * idf_t$

We can now calculate the TF-IDF score for every word in Review 2:

* TF-IDF(‘this’, Review 2) = TF(‘this’, Review 2) * IDF(‘this’) = 1/8 * 0 = 0

Similarly,

* TF-IDF(‘movie’, Review 2) = 1/8 * 0 = 0
* TF-IDF(‘is’, Review 2) = 1/4 * 0 = 0
* TF-IDF(‘not’, Review 2) = 1/8 * 0.48 = 0.06
* TF-IDF(‘scary’, Review 2) = 1/8 * 0.18 = 0.023
* TF-IDF(‘and’, Review 2) = 1/8 * 0 = 0
* TF-IDF(‘slow’, Review 2) = 1/8 * 0.48 = 0.06

### BoW

In [14]:
# Extract the book titles and tags into separate lists
titles = books['book_title'].tolist()
tags = books['tag'].str.strip().str.split(",").tolist()

# Create a bag of words representation of the book tags
def create_bow(tag_list):
    bow = {}
    if not isinstance(tag_list, float):
        for tag in tag_list:
            bow[tag] = 1
    return bow
     

# Create a list of bags of words representations of the book tags
bags_of_words = [create_bow(book_tags) for book_tags in tags]
bags_of_words

[{'Clara Callan': 1,
  ' Richard Bruce Wright': 1,
  ' 2001': 1,
  ' HarperFlamingo Canada': 1,
  ' 7.875': 1},
 {'Decision in Normandy': 1,
  " Carlo D'Este": 1,
  ' 1991': 1,
  ' HarperPerennial': 1,
  ' 7.5': 1},
 {'Flu: The Story of the Great Influenza Pandemic of 1918 and the Search for the Virus That Caused It': 1,
  ' Gina Bari Kolata': 1,
  ' 1999': 1,
  ' Farrar Straus Giroux': 1,
  ' 7.4': 1},
 {"The Kitchen God's Wife": 1,
  ' Amy Tan': 1,
  ' 1991': 1,
  ' Putnam Pub Group': 1,
  ' 7.785714285714286': 1},
 {"What If?: The World's Foremost Military Historians Imagine What Might Have Been": 1,
  ' Robert Cowley': 1,
  ' 2000': 1,
  ' Berkley Publishing Group': 1,
  ' 8.0': 1},
 {'PLEADING GUILTY': 1,
  ' Scott Turow': 1,
  ' 1993': 1,
  ' Audioworks': 1,
  ' 8.0': 1},
 {'Nights Below Station Street': 1,
  ' David Adams Richards': 1,
  ' 1988': 1,
  ' Emblem Editions': 1,
  ' 6.0': 1},
 {'The Middle Stories': 1,
  ' Sheila Heti': 1,
  ' 2004': 1,
  ' House of Anansi Press': 1,

In [15]:
# Create a dataframe to store the bags of words representation of the book tags
tag_df = pd.DataFrame(bags_of_words, index=titles).fillna(0)

# Calculate the cosine similarity matrix between the books
cos_similarity = cosine_similarity(tag_df)

# Create a dataframe with the cosine similarity scores
similarity_df = pd.DataFrame(cos_similarity, index=tag_df.index, columns=tag_df.index)
similarity_df

Unnamed: 0,Clara Callan,Decision in Normandy,Flu: The Story of the Great Influenza Pandemic of 1918 and the Search for the Virus That Caused It,The Kitchen God's Wife,What If?: The World's Foremost Military Historians Imagine What Might Have Been,...,Cloud Nine,News of the Spirit (Ballantine Reader's Circle),What They Don't Teach You at Harvard Business School,Aveda Rituals : A Daily Guide to Natural Health and Beauty,Three Black Skirts : All You Need To Survive
Clara Callan,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0
Decision in Normandy,0.0,1.0,0.0,0.2,0.0,...,0.0,0.0,0.0,0.0,0.0
Flu: The Story of the Great Influenza Pandemic of 1918 and the Search for the Virus That Caused It,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.2,0.0
The Kitchen God's Wife,0.0,0.2,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0
What If?: The World's Foremost Military Historians Imagine What Might Have Been,0.0,0.0,0.0,0.0,1.0,...,0.2,0.0,0.0,0.2,0.4
...,...,...,...,...,...,...,...,...,...,...,...
Cloud Nine,0.0,0.0,0.0,0.0,0.2,...,1.0,0.0,0.0,0.0,0.2
News of the Spirit (Ballantine Reader's Circle),0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0
What They Don't Teach You at Harvard Business School,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0
Aveda Rituals : A Daily Guide to Natural Health and Beauty,0.0,0.0,0.2,0.0,0.2,...,0.0,0.0,0.0,1.0,0.2


In [16]:
books[books['book_author'].str.contains('Shakespeare')]

Unnamed: 0,ISBN,book_title,book_author,year_of_publication,publisher,book_rating_x,book_rating_y,tag
369,198320264,Julius Caesar (Oxford School Shakespeare),William Shakespeare,2001,Oxford University Press,2.0,2.0,"Julius Caesar (Oxford School Shakespeare), Wil..."
1359,174434642,Othello (3rd Series),William Shakespeare,1996,Thomas Nelson Publishers,8.0,8.0,"Othello (3rd Series), William Shakespeare, 199..."
2084,671722727,King Lear,William Shakespeare,1993,Washington Square Press,"8.0, 5.0",6.5,"King Lear, William Shakespeare, 1993, Washingt..."
2178,671722816,Othello,William Shakespeare,1993,Washington Square Press,"7.0, 5.0",6.0,"Othello, William Shakespeare, 1993, Washington..."
3067,451521285,The Tragedy of Hamlet Prince of Denmark (Signe...,William Shakespeare,1993,Signet Classics,"8.0, 5.0, 7.0, 8.0, 9.0, 5.0, 5.0",6.714285714285714,The Tragedy of Hamlet Prince of Denmark (Signe...
3081,451521358,Macbeth,William Shakespeare,1989,New Amer Library (Mm),"8.0, 8.0",8.0,"Macbeth, William Shakespeare, 1989, New Amer L..."
3454,671531409,Midsummer Nights Dream,William Shakespeare,1984,Pocket Books,7.0,7.0,"Midsummer Nights Dream, William Shakespeare, 1..."
4522,671722948,TWELFTH NIGHT,William Shakespeare,1993,Washington Square Press,8.0,8.0,"TWELFTH NIGHT, William Shakespeare, 1993, Wash..."
4549,451526767,"Twelfth Night, Or, What You Will: With New and...",William Shakespeare,1998,Signet Book,9.0,9.0,"Twelfth Night, Or, What You Will: With New and..."
5654,671722840,RICHARD III,William Shakespeare,1996,Washington Square Press,"9.0, 5.0",7.0,"RICHARD III, William Shakespeare, 1996, Washin..."


In [17]:
# Ask the user for a book they like
book = input('Enter a book you like: ')

# Find the index of the book in the similarity dataframe
book_index = similarity_df.index.get_loc(book)

# Get the top 10 most similar books to the book
top_10 = similarity_df.iloc[book_index].sort_values(ascending=False)[1:11]

# Print the top 10 most similar books to the book
print(f'Top 10 similar books to {book}:')
top_10

Top 10 similar books to Romeo and Juliet:


ROMEO JULIET                      0.4
Children of the Night             0.4
Tickled Pink: A Comic Novel       0.4
Othello                           0.4
Anton Chekhov Selected Stories    0.4
The TEMPEST                       0.4
MIDSUMMER NIGHT'S DREAM           0.4
Voices After Midnight             0.4
Black Beauty                      0.4
TWELFTH NIGHT                     0.4
Name: Romeo and Juliet, dtype: float64

### TF-IDF

In [18]:
# Create a TfidfVectorizer object to transform the book tags into a Tf-idf representation
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(books['tag'])
tfidf_matrix

<15000x20825 sparse matrix of type '<class 'numpy.float64'>'
	with 154701 stored elements in Compressed Sparse Row format>

In [19]:
# Calculate the cosine similarity between the books
cos_similarity_tfidf = cosine_similarity(tfidf_matrix)

# Create a df with the cosine similarity scores
cos_similarity_tfidf_df = pd.DataFrame(cos_similarity_tfidf, index=books['book_title'], columns=books['book_title'])

In [20]:
# Find the index of the book in the similarity dataframe
book_index = cos_similarity_tfidf_df.index.get_loc(book)

# Get the top 10 most similar books to the book
top_10 = cos_similarity_tfidf_df.iloc[book_index].sort_values(ascending=False)[1:11]

# Print the top 5 most similar books to the book
print(f'Top 10 similar books to {book}:')
top_10

Top 10 similar books to Romeo and Juliet:


book_title
ROMEO JULIET                                     0.909870
Romeo and Juliet (The Pelican Shakespeare)       0.643615
Othello                                          0.545025
RICHARD III                                      0.538747
Romeo and Juliet (Dover Thrift Editions)         0.535767
Macbeth                                          0.529408
The TEMPEST                                      0.524989
Tragedy of Romeo and Juliet (Signet Classics)    0.524600
TWELFTH NIGHT                                    0.517744
King Lear                                        0.513229
Name: Romeo and Juliet, dtype: float64