In [37]:
import numpy as np
import pandas as pd

In [38]:
# Load the three source tables: book metadata, user profiles, and user-to-book ratings.
# low_memory=False asks pandas to parse books.csv in one pass; presumably this avoids
# mixed-dtype inference on one of its columns — TODO confirm which column needs it.
books = pd.read_csv('books.csv',low_memory=False)
users = pd.read_csv('users.csv')
ratings = pd.read_csv('ratings.csv')

In [39]:
# Sanity check: medium-size cover URL stored in the row labelled 1.
books.loc[1, 'Image-URL-M']

'http://images.amazon.com/images/P/0002005018.01.MZZZZZZZ.jpg'

In [40]:
# Preview the first rows of the users table.
users.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [41]:
# Preview the first rows of the ratings table.
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [42]:
# Report (rows, columns) for books, ratings and users, one tuple per line.
for table in (books, ratings, users):
    print(table.shape)

(271360, 8)
(1149780, 3)
(278858, 3)


In [43]:
# Count missing values in each column of books.
books.isnull().sum()

ISBN                   0
Book-Title             0
Book-Author            2
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            3
dtype: int64

In [44]:
# Count missing values in each column of users.
users.isnull().sum()

User-ID          0
Location         0
Age         110762
dtype: int64

In [45]:
# Count missing values in each column of ratings.
ratings.isnull().sum()

User-ID        0
ISBN           0
Book-Rating    0
dtype: int64

In [46]:
# Count rows of books that exactly repeat an earlier row (all columns compared).
books.duplicated().sum()

0

In [47]:
# Count rows of ratings that exactly repeat an earlier row (all columns compared).
ratings.duplicated().sum()

0

In [48]:
# Count rows of users that exactly repeat an earlier row (all columns compared).
users.duplicated().sum()

0

## Popularity Based Recommender System

In [49]:
# Attach book metadata to every rating. merge defaults to an inner join, so ratings
# whose ISBN has no row in books are dropped.
ratings_with_name = pd.merge(ratings, books, on='ISBN')
ratings_with_name.head(100)

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
1,2313,034545104X,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
2,6543,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
3,8680,034545104X,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
4,10314,034545104X,9,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
...,...,...,...,...,...,...,...,...,...,...
95,85993,0446520802,10,The Notebook,Nicholas Sparks,1996,Warner Books,http://images.amazon.com/images/P/0446520802.0...,http://images.amazon.com/images/P/0446520802.0...,http://images.amazon.com/images/P/0446520802.0...
96,91761,0446520802,6,The Notebook,Nicholas Sparks,1996,Warner Books,http://images.amazon.com/images/P/0446520802.0...,http://images.amazon.com/images/P/0446520802.0...,http://images.amazon.com/images/P/0446520802.0...
97,98153,0446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books,http://images.amazon.com/images/P/0446520802.0...,http://images.amazon.com/images/P/0446520802.0...,http://images.amazon.com/images/P/0446520802.0...
98,99085,0446520802,10,The Notebook,Nicholas Sparks,1996,Warner Books,http://images.amazon.com/images/P/0446520802.0...,http://images.amazon.com/images/P/0446520802.0...,http://images.amazon.com/images/P/0446520802.0...


In [50]:
# Number of rating rows per title. Only the Book-Rating column is counted (instead of
# counting every column and discarding the rest), and the result column is named in
# the same step rather than through an in-place rename.
num_rating_df = (
    ratings_with_name
    .groupby('Book-Title')['Book-Rating']
    .count()
    .reset_index(name='num_ratings')
)
num_rating_df

Unnamed: 0,Book-Title,num_ratings
0,A Light in the Storm: The Civil War Diary of ...,4
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1
4,Beyond IBM: Leadership Marketing and Finance ...,1
...,...,...
241066,Ã?Â?lpiraten.,2
241067,Ã?Â?rger mit Produkt X. Roman.,4
241068,Ã?Â?sterlich leben.,1
241069,Ã?Â?stlich der Berge.,3


In [51]:
# Mean Book-Rating per title, one row per title.
# NOTE(review): ratings.head() shows many 0 values; if 0 encodes "no explicit rating"
# these means are pulled towards zero — confirm against the dataset documentation.
avg_rating_df = (
    ratings_with_name
    .groupby('Book-Title')['Book-Rating']
    .mean()
    .reset_index(name='avg_rating')
)
avg_rating_df

Unnamed: 0,Book-Title,avg_rating
0,A Light in the Storm: The Civil War Diary of ...,2.250000
1,Always Have Popsicles,0.000000
2,Apple Magic (The Collector's series),0.000000
3,"Ask Lily (Young Women of Faith: Lily Series, ...",8.000000
4,Beyond IBM: Leadership Marketing and Finance ...,0.000000
...,...,...
241066,Ã?Â?lpiraten.,0.000000
241067,Ã?Â?rger mit Produkt X. Roman.,5.250000
241068,Ã?Â?sterlich leben.,7.000000
241069,Ã?Â?stlich der Berge.,2.666667


In [52]:
# Combine the per-title rating count and mean rating into one table.
popular_df = pd.merge(num_rating_df, avg_rating_df, on='Book-Title')
popular_df


Unnamed: 0,Book-Title,num_ratings,avg_rating
0,A Light in the Storm: The Civil War Diary of ...,4,2.250000
1,Always Have Popsicles,1,0.000000
2,Apple Magic (The Collector's series),1,0.000000
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1,8.000000
4,Beyond IBM: Leadership Marketing and Finance ...,1,0.000000
...,...,...,...
241066,Ã?Â?lpiraten.,2,0.000000
241067,Ã?Â?rger mit Produkt X. Roman.,4,5.250000
241068,Ã?Â?sterlich leben.,1,7.000000
241069,Ã?Â?stlich der Berge.,3,2.666667


In [53]:
# Keep titles with at least 100 ratings and a mean rating of at least 4.8,
# ordered from highest to lowest mean rating.
has_enough_ratings = popular_df['num_ratings'] >= 100
is_well_rated = popular_df['avg_rating'] >= 4.8
popular_df = popular_df.loc[has_enough_ratings & is_well_rated].sort_values('avg_rating', ascending=False)

In [54]:
# Add author and cover URL to each popular title, keeping the first matching books row
# per title. The three aggregate columns are selected before merging so the cell can be
# re-run: merging an already enriched popular_df would create suffixed duplicate columns
# and the final column selection would fail.
popular_df = (
    popular_df[['Book-Title', 'num_ratings', 'avg_rating']]
    .merge(books, on='Book-Title')
    .drop_duplicates('Book-Title')
    .loc[:, ['Book-Title', 'Book-Author', 'Image-URL-M', 'num_ratings', 'avg_rating']]
)

In [55]:
# Display the final popularity table.
popular_df

Unnamed: 0,Book-Title,Book-Author,Image-URL-M,num_ratings,avg_rating
0,Harry Potter and the Prisoner of Azkaban (Book 3),J. K. Rowling,http://images.amazon.com/images/P/0439136350.0...,428,5.852804
3,Harry Potter and the Goblet of Fire (Book 4),J. K. Rowling,http://images.amazon.com/images/P/0439139597.0...,387,5.824289
5,The Little Prince,Antoine de Saint-ExupÃ©ry,http://images.amazon.com/images/P/0156528207.0...,141,5.815603
10,Harry Potter and the Sorcerer's Stone (Book 1),J. K. Rowling,http://images.amazon.com/images/P/0590353403.0...,278,5.73741
14,Harry Potter and the Order of the Phoenix (Boo...,J. K. Rowling,http://images.amazon.com/images/P/043935806X.0...,347,5.501441
18,Ender's Game (Ender Wiggins Saga (Paperback)),Orson Scott Card,http://images.amazon.com/images/P/0312853238.0...,249,5.409639
22,Harry Potter and the Chamber of Secrets (Book 2),J. K. Rowling,http://images.amazon.com/images/P/0439064872.0...,556,5.183453
25,The Perks of Being a Wallflower,Stephen Chbosky,http://images.amazon.com/images/P/0671027344.0...,104,5.144231
27,The Hobbit : The Enchanting Prelude to The Lor...,J.R.R. TOLKIEN,http://images.amazon.com/images/P/0345339681.0...,281,5.007117
28,The Fellowship of the Ring (The Lord of the Ri...,J.R.R. TOLKIEN,http://images.amazon.com/images/P/0345339703.0...,368,4.94837


## Collaborative Filtering Based Recommender System

In [65]:
# Flag users with more than 100 rating rows (after the join with books),
# then keep the IDs of the flagged users.
ratings_per_user = ratings_with_name.groupby('User-ID')['Book-Rating'].count()
x = ratings_per_user > 100
nerd_users = x[x].index

User-ID
2         False
8         False
9         False
10        False
12        False
          ...  
278846    False
278849    False
278851    False
278852    False
278854    False
Name: Book-Rating, Length: 92106, dtype: bool

In [70]:
# Keep only the ratings made by the users in nerd_users (built above with a
# "more than 100 ratings" threshold), then flag the titles that received at least
# 10 ratings from those users.
filtered_rating = ratings_with_name[ratings_with_name['User-ID'].isin(nerd_users)]
y = filtered_rating.groupby('Book-Title')['Book-Rating'].count() >= 10
famous_books = y[y].index

In [71]:
# Inspect the ratings kept after restricting to users in nerd_users.
filtered_rating

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
2,6543,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
4,10314,034545104X,9,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
5,23768,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
7,28523,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
14,77480,034545104X,8,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
...,...,...,...,...,...,...,...,...,...,...
1031115,276680,0812881311,0,Streets of Gold,Evan Hunter,1985,Madison Books,http://images.amazon.com/images/P/0812881311.0...,http://images.amazon.com/images/P/0812881311.0...,http://images.amazon.com/images/P/0812881311.0...
1031116,276680,0865055041,0,Kitchen (Historic Communities (Paperback)),Bobbie Kalman,1990,Crabtree Publishing Company,http://images.amazon.com/images/P/0865055041.0...,http://images.amazon.com/images/P/0865055041.0...,http://images.amazon.com/images/P/0865055041.0...
1031117,276680,086505505X,0,Home Crafts (Historic Communities (Paperback)),Bobbie Kalman,1990,Crabtree Publishing Company,http://images.amazon.com/images/P/086505505X.0...,http://images.amazon.com/images/P/086505505X.0...,http://images.amazon.com/images/P/086505505X.0...
1031118,276680,1564407284,0,"The Old Sturbridge Village Cookbook, 2nd : Aut...",Caroline Sloat,1995,Globe Pequot,http://images.amazon.com/images/P/1564407284.0...,http://images.amazon.com/images/P/1564407284.0...,http://images.amazon.com/images/P/1564407284.0...


In [72]:
# Restrict the heavy-rater ratings to the titles listed in famous_books and report the size.
is_famous_title = filtered_rating['Book-Title'].isin(famous_books)
final_ratings = filtered_rating.loc[is_famous_title]
final_ratings.shape

(263876, 10)

In [77]:
# Peek at the first two rows of final_ratings.
final_ratings.head(2)

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
2,6543,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
4,10314,034545104X,9,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...


In [76]:
# Build the book-by-user rating matrix: one row per title, one column per user.
# pivot_table aggregates with the mean by default, so several rating rows for the
# same (title, user) pair — presumably different ISBNs of one title — become their average.
pt = final_ratings.pivot_table(index='Book-Title',columns='User-ID',values='Book-Rating')

In [77]:
# Title/user pairs without a rating become 0 so the matrix has no missing values.
pt = pt.fillna(0)

In [78]:
# Display the filled book-by-user matrix.
pt
# Disabled debugging helpers: find the row position of one title / list every title.
#np.where(pt.index=="2nd Chance")[0][0]
#print('\n'.join(str(x) for x in pt.index))

User-ID,254,507,882,1424,1435,1733,1903,2033,2110,2276,...,275020,275970,276463,276680,277427,277478,277639,278137,278188,278418
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Q-Space (Star Trek The Next Generation, Book 47)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Salem's Lot,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
01-01-00: The Novel of the Millennium,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 Lb. Penalty,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1001 Ways to Be Romantic,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"\O\"" Is for Outlaw""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"\Surely You're Joking, Mr. Feynman!\"": Adventures of a Curious Character""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
e,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
one hundred years of solitude,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [79]:
from sklearn.metrics.pairwise import cosine_similarity
# Rows of pt are books, so this is a square book-by-book matrix: entry [i, j] is the
# cosine similarity between the rating rows of pt.index[i] and pt.index[j].
similarity_scores = cosine_similarity(pt)

In [80]:
# NOTE(review): only the last expression of a cell is displayed, so the .shape line
# below produces no visible output; the matrix itself is what gets shown.
similarity_scores.shape
similarity_scores

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.12599988],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.22457581],
       [0.        , 0.        , 0.12599988, ..., 0.        , 0.22457581,
        1.        ]])

In [86]:
def recommend(book_name, top_n=199):
    """Return the titles most similar to ``book_name`` by rating-vector cosine similarity.

    Reads two module-level objects: ``pt`` (book-by-user matrix indexed by title)
    and ``similarity_scores`` (square matrix whose rows and columns follow
    ``pt.index``).

    Parameters
    ----------
    book_name : str
        Title to look up in ``pt.index``.
    top_n : int, default 199
        Maximum number of titles to return. The default matches the ``[1:200]``
        slice this function has always used.

    Returns
    -------
    list of str, or str
        Titles ordered from most to least similar, never containing
        ``book_name`` itself. The string ``'Invalid book name'`` is returned
        when the title is not in ``pt.index``.
    """
    if book_name not in pt.index:
        return 'Invalid book name'

    # Row position of the requested title in the similarity matrix.
    index = int(np.flatnonzero(pt.index == book_name)[0])

    # Exclude the query book by its position instead of assuming it sorts first.
    # A book whose rating row is all zeros has self-similarity 0, and ties at the
    # top score are possible; in both cases "skip the first sorted entry" would
    # drop a real match and leave the query book among its own recommendations.
    candidates = [
        (position, score)
        for position, score in enumerate(similarity_scores[index])
        if position != index
    ]
    # list.sort is stable, so equally scored books keep their pt.index order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    return [pt.index[position] for position, _ in candidates[:max(top_n, 0)]]

In [87]:
# Collaborative-filtering recommendations for one sample title, stored in `res`.
# The cell ends with an assignment followed by comments, so it displays nothing.
# NOTE(review): the saved output below appears to come from an earlier version of this cell.
res = recommend("A Time to Kill")
# for r in res:
#     print(r[0])
#res

The Pelican Brief


## Content-Based Filtering Recommender System

In [85]:
import re
# files = ['part0.txt', 'part2.txt', 'part4.txt', 'part6.txt', 'part8.txt', 'part10.txt', 'part12.txt', 'part14.txt', 'part16.txt', 'part18.txt', 'part20.txt', 'part22.txt', 'part24.txt', 'part26.txt', 'part28.txt', 'part30.txt', 'part32.txt', 'part34.txt', 'part36.txt', 'part38.txt', 'part40.txt', 'part42.txt', 'part44.txt', 'part46.txt']
# common_books = pt.index.values.tolist()
# common_books_with_desc = {}

def extract_title_description(input_file):
    """Collect title -> description pairs from one file under ``output_dir/``.

    Every line is searched for a ``"title": "..."`` field and a
    ``"description": "..."`` field. A pair is stored in the module-level dict
    ``common_books_with_desc`` only when both fields are present, the title is
    contained in the module-level ``common_books``, and the title is not already
    a key of the dict (so the first description seen for a title wins).

    Parameters
    ----------
    input_file : str
        File name, appended to the ``output_dir/`` prefix.

    Returns
    -------
    None
        The function works by mutating ``common_books_with_desc``.
    """
    title_pattern = r'"title":\s*"([^"]+)"'
    description_pattern = r'"description":\s*"([^"]+)"'
    with open("output_dir/"+input_file, "r") as f:
        for line in f:
            title_match = re.search(title_pattern, line)
            description_match = re.search(description_pattern, line)
            # Lines missing either field carry nothing usable.
            if not (title_match and description_match):
                continue
            title = title_match.group(1)
            # Skip titles already collected and titles outside the common set.
            if title in common_books_with_desc or title not in common_books:
                continue
            common_books_with_desc[title] = description_match.group(1)
                    
#extract_title_description('part10.txt')             
# for file in files :
#     extract_title_description(file)
# import pickle
# # Save the dictionary to a file
# with open("common_books_with_desc.pkl", "wb") as f:
#     pickle.dump(common_books_with_desc, f)

In [86]:
import pickle

# NOTE: pickle.load can execute arbitrary code, so only load a file this project wrote itself.
# The with-block closes the file handle as soon as the dictionary has been read.
with open('common_books_with_desc.pkl', 'rb') as pkl_file:
    common_books_with_desc = pickle.load(pkl_file)
print(len(common_books_with_desc))
# Drop entries whose description is shorter than 10 characters.
common_books_with_desc = {k: v for k, v in common_books_with_desc.items() if len(v) >= 10}
print(len(common_books_with_desc))
# Parallel lists: position i in desc_list belongs to position i in title_list
# (both follow the dictionary's insertion order).
desc_list = list(common_books_with_desc.values())
title_list= list(common_books_with_desc.keys())


5220
4996


In [87]:
# Print every title that has a usable description.
# NOTE(review): this prints the whole list (thousands of entries, going by the count
# printed in the previous cell); consider previewing a slice such as title_list[:10].
print(title_list)



In [88]:
from sklearn.feature_extraction import text
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# nltk.download('stopwords')
# nltk.download('punkt')

# Normalise every description: lower-case it, tokenise it, and keep only the
# alphanumeric tokens that are not English stop words.
stop_words = set(stopwords.words('english'))


def _clean_description(raw_text):
    """Return ``raw_text`` reduced to its lower-cased, alphanumeric, non-stop-word tokens."""
    kept_tokens = (
        word
        for word in word_tokenize(raw_text.lower())
        if word not in stop_words and word.isalnum()
    )
    return " ".join(kept_tokens)


preprocessed_descriptions = [_clean_description(raw_text) for raw_text in desc_list]

# Turn the cleaned descriptions into a TF-IDF matrix, one row per description.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(preprocessed_descriptions)

# Pairwise cosine similarity between descriptions; row/column i corresponds to desc_list[i].
similarity_matrix = cosine_similarity(tfidf_matrix)
print(similarity_matrix)
# feature =  preprocessed_descriptions
# tfidf = text.TfidfVectorizer(stop_words="english")
# tfidf_matrix = tfidf.fit_transform(feature)
# similarity2 = linear_kernel(tfidf_matrix, tfidf_matrix)
# print(similarity2)

[[1.         0.02300052 0.0182906  ... 0.00679929 0.00398115 0.02258059]
 [0.02300052 1.         0.01241738 ... 0.         0.         0.01029011]
 [0.0182906  0.01241738 1.         ... 0.00305571 0.00340659 0.02404887]
 ...
 [0.00679929 0.         0.00305571 ... 1.         0.         0.01695049]
 [0.00398115 0.         0.00340659 ... 0.         1.         0.01953752]
 [0.02258059 0.01029011 0.02404887 ... 0.01695049 0.01953752 1.        ]]


In [89]:
def content_based(title, similarity=None, top_n=199):
    """Return the titles whose descriptions are most similar to that of ``title``.

    Reads the module-level ``title_list``; position ``i`` of that list must
    correspond to row/column ``i`` of the similarity matrix.

    Parameters
    ----------
    title : str
        Title to look up in ``title_list``.
    similarity : square 2-D array-like, optional
        Pairwise similarity matrix. When omitted, the module-level
        ``similarity_matrix`` is read at call time, so a recomputed matrix is
        picked up without re-running this definition (a default bound in the
        signature would keep pointing at the old matrix).
    top_n : int, default 199
        Maximum number of titles to return. The default matches the ``[1:200]``
        slice this function has always used.

    Returns
    -------
    list of str, or str
        Titles ordered from most to least similar, never containing ``title``
        itself. The string ``'Invalid book name'`` is returned when the title
        is not in ``title_list``.
    """
    if similarity is None:
        similarity = similarity_matrix

    try:
        book_index = title_list.index(title)
    except ValueError:
        return 'Invalid book name'

    # Exclude the query book by its position instead of assuming it sorts first.
    # Another title with an identical description ties at the top score, and a
    # description that is empty after preprocessing has self-similarity 0; in
    # both cases "skip the first sorted entry" would drop a real match and keep
    # the query book in its own recommendations.
    ranked = [
        (position, score)
        for position, score in enumerate(similarity[book_index])
        if position != book_index
    ]
    # list.sort is stable, so equally scored books keep their title_list order.
    ranked.sort(key=lambda pair: pair[1], reverse=True)

    return [title_list[position] for position, _ in ranked[:max(top_n, 0)]]


In [92]:
from collections import Counter, OrderedDict

book_title ="1984"
res1 = content_based(book_title)
res2 = recommend(book_title)

# Both recommenders return the string 'Invalid book name' for a title they do not know.
# Treat that as "no candidates" so the lists can still be combined when only one
# recommender knows the book (concatenating a string with a list would raise).
res1 = res1 if isinstance(res1, list) else []
res2 = res2 if isinstance(res2, list) else []

all_books = res1 + res2
# Titles that occur more than once in the combined list, in order of first appearance.
# Counting once up front avoids calling list.count for every element.
occurrences = Counter(all_books)
common = list(OrderedDict.fromkeys(name for name in all_books if occurrences[name] > 1))


In [93]:
# Candidate pool: the top 15 collaborative picks, the top 10 content-based picks and up
# to 10 titles both recommenders agree on. dict.fromkeys removes duplicates while keeping
# that priority order; list(set(...)) returned the titles in an arbitrary order that
# could differ between kernel sessions.
collection = list(dict.fromkeys(res2[:15] + res1[:10] + common[:10]))

# Look up all candidate titles in a single pass over books, keeping the first row per title.
book_details = (
    books.loc[books['Book-Title'].isin(collection), ['Book-Title', 'Book-Author', 'Image-URL-M']]
    .drop_duplicates('Book-Title')
    .set_index('Book-Title')
)

# One [title, author, image URL] entry per candidate, in collection order.
# A title with no row in books yields an empty list, as it did before.
data = [
    [one_title, *book_details.loc[one_title]] if one_title in book_details.index else []
    for one_title in collection
]

In [95]:
# Print the de-duplicated candidate titles.
print(collection)

["Slaughterhouse Five or the Children's Crusade: A Duty Dance With Death", 'Baltasar and Blimunda', "Life, the Universe, and Everything (Hitchhiker's Trilogy (Paperback))", 'Seven Years in Tibet', 'Madame Bovary (Bantam Classics)', 'Eva Luna', 'The Forever War', 'Bloodstream', 'Long Time Coming', 'Fahrenheit 451', 'Vineland', 'Animal Farm', "Vinegar Hill (Oprah's Book Club (Hardcover))", 'La Sombra del Viento', "Cat's Eye", 'Dark of the Moon', 'What We Keep', 'My Dream of You', 'Trauma', 'S.', 'Waiting', 'Watchmen', 'Far from the Madding Crowd', 'Written on the Body', "The Restaurant at the End of the Universe (Hitchhiker's Trilogy (Paperback))", 'Brave New World', 'Lying Awake']


In [158]:
# Show books reduced to the first row per distinct title.
# The result is only displayed; it is not assigned, so books itself is unchanged.
books.drop_duplicates('Book-Title')

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...
...,...,...,...,...,...,...,...,...
271354,0449906736,Flashpoints: Promise and Peril in a New World,Robin Wright,1993,Ballantine Books,http://images.amazon.com/images/P/0449906736.0...,http://images.amazon.com/images/P/0449906736.0...,http://images.amazon.com/images/P/0449906736.0...
271356,0525447644,From One to One Hundred,Teri Sloat,1991,Dutton Books,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...
271357,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...
271358,0192126040,Republic (World's Classics),Plato,1996,Oxford University Press,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...


In [159]:
import pickle

# Persist the objects computed above. Each file is written inside a with-block so it
# is flushed and closed as soon as its object has been dumped.
artifacts = {
    'popular.pkl': popular_df,
    'pt.pkl': pt,
    'books.pkl': books,
    'similarity_scores.pkl': similarity_scores,
}
for artifact_path, artifact in artifacts.items():
    with open(artifact_path, 'wb') as out_file:
        pickle.dump(artifact, out_file)