In [15]:
import pandas as pd

In [16]:
# CMU dataset with plots (https://www.kaggle.com/ymaricar/cmu-book-summary-dataset)
cmu_data = pd.read_csv('data/booksummaries.txt',sep='\t',lineterminator='\n', 
                        names=["wiki_id", "firebase_id", "book_title","published", "author","genres",'summary'])

print ('Before Nan dropping: ', len(cmu_data))
cmu_data.dropna(subset=['book_title', 'summary'],inplace=True)
print ('After Nan dropping: ', len(cmu_data))
cmu_data.head(5)

Before Nan dropping:  16559
After Nan dropping:  16559


Unnamed: 0,wiki_id,firebase_id,book_title,published,author,genres,summary
0,620,/m/0hhy,Animal Farm,George Orwell,1945-08-17,"{""/m/016lj8"": ""Roman \u00e0 clef"", ""/m/06nbt"":...","Old Major, the old boar on the Manor Farm, ca..."
1,843,/m/0k36,A Clockwork Orange,Anthony Burgess,1962,"{""/m/06n90"": ""Science Fiction"", ""/m/0l67h"": ""N...","Alex, a teenager living in near-future Englan..."
2,986,/m/0ldx,The Plague,Albert Camus,1947,"{""/m/02m4t"": ""Existentialism"", ""/m/02xlf"": ""Fi...",The text of The Plague is divided into five p...
3,1756,/m/0sww,An Enquiry Concerning Human Understanding,David Hume,,,The argument of the Enquiry proceeds by a ser...
4,2080,/m/0wkt,A Fire Upon the Deep,Vernor Vinge,,"{""/m/03lrw"": ""Hard science fiction"", ""/m/06n90...",The novel posits that space around the Milky ...


In [17]:
# Goodreads dataset (https://github.com/zygmuntz/goodbooks-10k)
books = pd.read_csv('data/goodbooks-10k/books.csv')
book_tags = pd.read_csv('data/goodbooks-10k/book_tags.csv')
ratings = pd.read_csv('data/goodbooks-10k/ratings.csv')
tags = pd.read_csv('data/goodbooks-10k/tags.csv')
to_read = pd.read_csv('data/goodbooks-10k/to_read.csv')

print ('Before Nan dropping: ', len(books))
books.dropna(subset=['original_title'],inplace=True)
print ('After Nan dropping: ', len(books))
books.head()

Before Nan dropping:  10000
After Nan dropping:  9415


Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [18]:
# preprocessing
import re
def preprocess(sent):
    '''
    cleanses sentences
    '''
    sent = re.sub(r'\W+', '', sent.lower())    
    return sent

In [19]:
# merging interaction data with summaries
cmu_data['clean_title'] = cmu_data['book_title'].apply(lambda x: preprocess(x))
books['clean_title'] = books['original_title'].apply(lambda x: preprocess(x))

common = pd.merge(cmu_data, books ,how='inner')
print (len(common))

3239


In [20]:
common.columns

Index(['wiki_id', 'firebase_id', 'book_title', 'published', 'author', 'genres',
       'summary', 'clean_title', 'book_id', 'goodreads_book_id',
       'best_book_id', 'work_id', 'books_count', 'isbn', 'isbn13', 'authors',
       'original_publication_year', 'original_title', 'title', 'language_code',
       'average_rating', 'ratings_count', 'work_ratings_count',
       'work_text_reviews_count', 'ratings_1', 'ratings_2', 'ratings_3',
       'ratings_4', 'ratings_5', 'image_url', 'small_image_url'],
      dtype='object')

In [21]:
ls = [x for x in common['clean_title'] if 'harrypotter' in x]
ls

['harrypotterandthephilosophersstone',
 'harrypotterandtheorderofthephoenix',
 'harrypotterandthechamberofsecrets',
 'harrypotterandtheprisonerofazkaban',
 'harrypotterandthegobletoffire',
 'harrypotterandthedeathlyhallows',
 'harrypotterandthehalfbloodprince']

In [22]:
sum1 = list(common.loc[common['clean_title'] == 'harrypotterandthephilosophersstone']['summary'])[0]
sum2 = list(common.loc[common['clean_title'] == 'harrypotterandthegobletoffire']['summary'])[0]
sum3 = list(common.loc[common['clean_title'] == 'thethirtyninesteps']['summary'])[0]

In [23]:
# Embedding verification (https://www.shanelynn.ie/word-embeddings-in-python-with-spacy-and-gensim/)
import spacy
en_nlp = spacy.load('en')
doc_1 = en_nlp(sum1)
doc_2 = en_nlp(sum2)
doc_3 = en_nlp(sum3)

In [24]:
# cosine similarity
print ('Similarity: ',doc_1.similarity(doc_2))

# cosine similarity
print ('Similarity: ',doc_1.similarity(doc_3))


Similarity:  0.9926717587482563
Similarity:  0.9810914525943444


  "__main__", mod_spec)
  "__main__", mod_spec)


In [25]:
# Second embedding
import gensim
from multiprocessing import cpu_count
from nltk.tokenize import sent_tokenize

data = list()
for summary in cmu_data['summary']:
    sent = sent_tokenize(summary)
    data.extend([x.split() for x in sent])
    
# Training word2vec model
embedding_inst = gensim.models.Word2Vec(data, min_count=2, size=100, workers=cpu_count(), window=5)

In [29]:
import numpy as np
def doc_vec(word2vec, sentences):
    '''
    Gets list of sentences and returns mean vector
    '''
    return np.mean(np.array(
                [np.mean(
                    [word2vec[word] for word in document.split() 
                    if word in word2vec]
                or [np.zeros(self.dim)], axis=0)
            for document in sentences
        ]), axis=0)

In [30]:
doc_1_vec = doc_vec(embedding_inst, sent_tokenize(sum1))
doc_2_vec = doc_vec(embedding_inst, sent_tokenize(sum2))
doc_3_vec = doc_vec(embedding_inst, sent_tokenize(sum3))

  if __name__ == '__main__':
  if __name__ == '__main__':


In [28]:
from numpy import dot
from numpy.linalg import norm

cos_sim = dot(doc_1_vec, doc_2_vec)/(norm(doc_1_vec)*norm(doc_2_vec))
print (cos_sim)
cos_sim = dot(doc_1_vec, doc_3_vec)/(norm(doc_1_vec)*norm(doc_3_vec))
print (cos_sim)

0.9944545
0.983453
