In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('book_data.csv')

In [3]:
print(df.shape)
df.tail()

(54301, 12)


Unnamed: 0,book_authors,book_desc,book_edition,book_format,book_isbn,book_pages,book_rating,book_rating_count,book_review_count,book_title,genres,image_url
54296,Howard Megdal,"In this fearless and half-crazy story, Howard ...",,Hardcover,9781610000000.0,256 pages,3.37,27,9,Taking the Field: A Fan's Quest to Run the Tea...,Sports|Baseball|Sports and Games|Sports|Nonfic...,https://images.gr-assets.com/books/1312074392l...
54297,Howard Megdal,From the icons of the game to the players who ...,,Hardcover,9780060000000.0,256 pages,3.97,34,5,"The Baseball Talmud: Koufax, Greenberg, and th...",Nonfiction|Sports and Games|Sports,https://images.gr-assets.com/books/1348841629l...
54298,Howard Megdal,,,Kindle Edition,,,3.66,32,3,"Wilpon's Folly - The Story of a Man, His Fortu...",Sports|Baseball|Abandoned,https://images.gr-assets.com/books/1394277097l...
54299,Mimi Baird|Eve Claxton,"Soon to be a major motion picture, from Brad P...",,Hardcover,9780800000000.0,272 pages,3.82,867,187,He Wanted the Moon: The Madness and Medical Ge...,Nonfiction|Autobiography|Memoir|Biography|Psyc...,https://images.gr-assets.com/books/1403192135l...
54300,Leah Price,The Anthology and the Rise of the Novel brings...,,Paperback,9780520000000.0,236 pages,3.58,12,3,The Anthology and the Rise of the Novel: From ...,Criticism|Literary Criticism|Philosophy|Theory...,https://images.gr-assets.com/books/1349014225l...


In [4]:
df.describe()

Unnamed: 0,book_rating,book_rating_count,book_review_count
count,54301.0,54301.0,54301.0
mean,4.020027,43504.49,2011.60218
std,0.3621,212657.2,7627.07287
min,0.0,0.0,0.0
25%,3.83,407.0,35.0
50%,4.03,2811.0,188.0
75%,4.22,12745.0,822.0
max,5.0,5588580.0,160776.0


## Eliminating duplicates and rows with no book ratings

In [5]:
df = df[df['book_rating_count'] != 0]

In [6]:
df.isnull().sum()

book_authors             0
book_desc             1317
book_edition         48779
book_format           1649
book_isbn            12829
book_pages            2495
book_rating              0
book_rating_count        0
book_review_count        0
book_title               0
genres                3170
image_url              663
dtype: int64

In [7]:
df.shape

(54226, 12)

In [8]:
df = df.drop_duplicates(subset ="book_title")
print(df.shape)

(48412, 12)


In [9]:
df = df.drop_duplicates(subset ="book_desc")
print(df.shape)

(46712, 12)


In [10]:
df.isnull().sum()

book_authors             0
book_desc                1
book_edition         42204
book_format           1222
book_isbn            10339
book_pages            1891
book_rating              0
book_rating_count        0
book_review_count        0
book_title               0
genres                2625
image_url              377
dtype: int64

## Use book description as sole feature

In [11]:
df = df[['book_authors','book_desc','book_title']]
df.shape

(46712, 3)

In [12]:
df = df.dropna()
df.shape

(46711, 3)

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
from nltk.stem.porter import *
from nltk.corpus import wordnet
import string
import re

## Eliminate observations with non-English words

In [14]:
def make_english(text):
    english = ' '.join([w for w in text.split() if wordnet.synsets(w)])
    return english

In [15]:
df['book_desc'] = df['book_desc'].apply(lambda x: make_english(x))

In [16]:
df['num_words'] = df['book_desc'].apply(lambda x: len(x.split()))

In [17]:
df = df[df['num_words'] > 5]

In [18]:
df.isnull().sum()

book_authors    0
book_desc       0
book_title      0
num_words       0
dtype: int64

In [19]:
df = df[['book_authors','book_desc','book_title']]
df.shape

(44548, 3)

## Eliminate punctuation, numbers and capital letters

In [20]:
def preprocess(text):
    new_text = re.sub('<.*?>', '', text)   # remove HTML tags
    new_text = re.sub("[!@#$+%*:()'-]",'',new_text) # remove punc.
    new_text = re.sub(r'\d+','',new_text)# remove numbers
    new_text = new_text.lower() # lower case, .upper() for upper
    return new_text

In [21]:
df['book_desc'] = df['book_desc'].apply(lambda x: preprocess(x))

## Tokenize and remove stopwords

In [22]:
tokenizer = RegexpTokenizer(r'\w+')

In [23]:
df['book_desc'] = df['book_desc'].apply(lambda x: tokenizer.tokenize(x))

In [24]:
def remove_stopwords(text):
    words = [w for w in text if w not in stopwords.words('english')]
    return words

In [25]:
df['book_desc'] = df['book_desc'].apply(lambda x: remove_stopwords(x))

## Use lemmatizer and join result

In [26]:
lemmatizer = WordNetLemmatizer()
def word_lemmatizer(text):
    lem_text = [lemmatizer.lemmatize(i) for i in text]
    return lem_text

In [27]:
df['book_desc'] = df['book_desc'].apply(lambda x: word_lemmatizer(x))

In [28]:
stemmer = PorterStemmer()

In [29]:
def word_stemmer(text):
    stem_text = " ".join([stemmer.stem(i) for i in text])
    return stem_text

In [30]:
df['book_desc'] = df['book_desc'].apply(lambda x: word_stemmer(x))

## Vectorize and use cosine similarity to get top 5 results

In [31]:
documents = np.array(df['book_desc'])

In [46]:
tfidf_vectorizer = TfidfVectorizer()

In [47]:
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

## Create preprocessor function

In [37]:
def preprocessor(text):
    new_text = re.sub('<.*?>', '', text)   # remove HTML tags
    new_text = re.sub("[!@#$+%*:()'-]",'',new_text) # remove punc.
    new_text = re.sub(r'\d+','',new_text)# remove numbers
    new_text = new_text.lower() # lower case, .upper() for upper
    tokenized_text = tokenizer.tokenize(new_text)
    words = [w for w in tokenized_text if w not in stopwords.words('english')]
    lem_text = [lemmatizer.lemmatize(i) for i in words]
    stem_text = " ".join([stemmer.stem(i) for i in lem_text])
    return stem_text

In [38]:
preprocessor('Could you survive on your own, in the wild, with everyone out to make sure you dont live to see the morning? In the ruins of a place once known as North America lies the nation of Panem, a shining Capitol surrounded by twelve outlying districts. The Capitol is harsh and cruel and keeps the districts in line by forcing them all to send one boy and one girl between the ages of twelve and eighteen to participate in the annual Hunger Games, a fight to the death on live TV. Sixteen-year-old Katniss Everdeen, who lives alone with her mother and younger sister, regards it as a death sentence when she is forced to represent her district in the Games. But Katniss has been close to dead before - and survival, for her, is second nature. Without really meaning to, she becomes a contender. But if she is to win, she will have to start making choices that weigh survival against humanity and life against love. New York Times bestselling author Suzanne Collins delivers equal parts suspense and philosophy, adventure and romance, in this searing novel set in a future with unsettling parallels to our present.')

'could surviv wild everyon make sure dont live see morn ruin place known north america lie nation panem shine capitol surround twelv outli district capitol harsh cruel keep district line forc send one boy one girl age twelv eighteen particip annual hunger game fight death live tv sixteenyearold katniss everdeen life alon mother younger sister regard death sentenc forc repres district game katniss close dead surviv second natur without realli mean becom contend win start make choic weigh surviv human life love new york time bestsel author suzann collin deliv equal part suspens philosophi adventur romanc sear novel set futur unsettl parallel present'

## Create predictions function

In [51]:
def predictions(text):
    documents = [preprocessor(text)]
    tfidf_matrix_new = tfidf_vectorizer.transform(documents)
    array = cosine_similarity(tfidf_matrix_new, tfidf_matrix)[0]
    recommender = df.copy()
    recommender['cs'] = array
    recommender.sort_values(by=['cs'], ascending=False)
    return recommender.nlargest(5, 'cs')

In [64]:
predictions('I want to read a book about scientific discoveries and space exploration')

Unnamed: 0,book_authors,book_desc,book_title,cs
21738,Mary Roach,bestsel author stiff bonk explor irresist stra...,Packing for Mars: The Curious Science of Life ...,0.413578
2028,Carl Sagan|Ann Druyan,pulitz author trace explor space suggest survi...,Pale Blue Dot: A Vision of the Human Future in...,0.410082
38964,Piers Bizony,boldli go book gone explor come realiti person...,How To Build Your Own Spaceship: The Science O...,0.313127
53172,David Hitt|Owen Garriott|Joe Kerwin,unit state soviet union went explor space live...,Homesteading Space: The Skylab Story,0.294092
34648,Jon Butterworth,discoveri boson made headlin around two peter ...,Smashing Physics,0.283299
