In [1]:
import pandas as pd                                                                                                                     # data processing
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity 
import pickle

In [2]:
# Load the Goodreads book table from the working directory.
# low_memory=False lets pandas infer each column's dtype from the whole file
# rather than chunk by chunk.
books = pd.read_csv('GoodReads_35k_books.csv',low_memory=False) 

In [3]:
# Print the column names, non-null counts, dtypes and memory usage of the loaded frame.
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13270 entries, 0 to 13269
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    13270 non-null  int64  
 1   author        13270 non-null  object 
 2   desc          13270 non-null  object 
 3   genre         13270 non-null  object 
 4   img           13270 non-null  object 
 5   isbn          13270 non-null  object 
 6   link          13270 non-null  object 
 7   pages         13270 non-null  int64  
 8   rating        13270 non-null  float64
 9   reviews       13270 non-null  int64  
 10  title         13270 non-null  object 
 11  totalratings  13270 non-null  int64  
dtypes: float64(1), int64(4), object(7)
memory usage: 1.2+ MB


In [4]:
# (number of rows, number of columns) of the loaded frame.
books.shape 

(13270, 12)

In [5]:
# head(0) returns zero rows, so this displays only the column labels.
books.head(0)

Unnamed: 0.1,Unnamed: 0,author,desc,genre,img,isbn,link,pages,rating,reviews,title,totalratings


In [6]:
# Keep only the real data columns; this drops the leftover 'Unnamed: 0' column
# that came in with the CSV.
# .copy() makes `books` an independent frame, so the column assignments and
# row drops performed on it afterwards do not operate on a slice of the frame
# returned by read_csv (the pattern that triggers SettingWithCopyWarning).
books = books[['author','desc','genre','img','isbn','link','pages','rating','reviews','title','totalratings']].copy()
books.head(0)

Unnamed: 0,author,desc,genre,img,isbn,link,pages,rating,reviews,title,totalratings


In [7]:
books.duplicated().sum()  # number of rows that exactly repeat an earlier row

0

In [8]:
books.isnull().sum()  # per-column count of missing values

author          0
desc            0
genre           0
img             0
isbn            0
link            0
pages           0
rating          0
reviews         0
title           0
totalratings    0
dtype: int64

In [9]:
# Remove rows that contain a missing value, then rows that exactly duplicate
# an earlier row. The result is reassigned instead of using inplace=True so
# that no frame that may be a slice of another one is mutated.
# reset_index(drop=True) renumbers the rows 0..n-1 so that index labels keep
# matching row positions even when rows were removed; arrays built later from
# this frame are addressed by position.
books = books.dropna().drop_duplicates().reset_index(drop=True)

In [10]:
# Save the cleaned table. index=False keeps pandas from writing the row index
# as an extra, unnamed first column, which would come back as 'Unnamed: 0'
# the next time the file is read.
books.to_csv('filtered_book.csv', index=False)
# Display the first record as a quick sanity check.
books.iloc[0]

author                                             Gladys M. Hunt
desc            Family favorite now revised and updated, inclu...
genre           Parenting,Education,Nonfiction,Writing,Books A...
img             https://i.gr-assets.com/images/S/compressed.ph...
isbn                                                    310242460
link            https://goodreads.com/book/show/126830.Honey_f...
pages                                                         251
rating                                                       4.44
reviews                                                       341
title           Honey for a Child's Heart: The Imaginative Use...
totalratings                                                 3814
Name: 0, dtype: object

In [11]:
books.head(1)

Unnamed: 0,author,desc,genre,img,isbn,link,pages,rating,reviews,title,totalratings
0,Gladys M. Hunt,"Family favorite now revised and updated, inclu...","Parenting,Education,Nonfiction,Writing,Books A...",https://i.gr-assets.com/images/S/compressed.ph...,310242460,https://goodreads.com/book/show/126830.Honey_f...,251,4.44,341,Honey for a Child's Heart: The Imaginative Use...,3814


In [12]:
# books['desc'][0]

In [13]:
# Turn the three text columns into lists of tokens:
# genre and author are comma-separated, desc is split on single spaces.
books['genre'] = books['genre'].str.split(',')
books['author'] = books['author'].str.split(',')
books['desc'] = books['desc'].str.split(' ')
books.head(1)

Unnamed: 0,author,desc,genre,img,isbn,link,pages,rating,reviews,title,totalratings
0,[Gladys M. Hunt],"[Family, favorite, now, revised, and, updated,...","[Parenting, Education, Nonfiction, Writing, Bo...",https://i.gr-assets.com/images/S/compressed.ph...,310242460,https://goodreads.com/book/show/126830.Honey_f...,251,4.44,341,Honey for a Child's Heart: The Imaginative Use...,3814


In [14]:
def _without_spaces(tokens):
    """Return a new list in which every item of ``tokens`` has its spaces removed."""
    return [token.replace(" ", "") for token in tokens]


# Collapse multi-word names into a single token to avoid tag ambiguity:
# e.g. "Sam Altman" and "Sam Ross" would otherwise share the token "sam"
# and look related even though they are different people.
books['genre'] = books['genre'].map(_without_spaces)
books['author'] = books['author'].map(_without_spaces)
books['desc'] = books['desc'].map(_without_spaces)
books.head(1)

Unnamed: 0,author,desc,genre,img,isbn,link,pages,rating,reviews,title,totalratings
0,[GladysM.Hunt],"[Family, favorite, now, revised, and, updated,...","[Parenting, Education, Nonfiction, Writing, Bo...",https://i.gr-assets.com/images/S/compressed.ph...,310242460,https://goodreads.com/book/show/126830.Honey_f...,251,4.44,341,Honey for a Child's Heart: The Imaginative Use...,3814


In [15]:
# Row by row, concatenate the author, genre and description token lists
# (in that order) into a single 'tags' list.
books['tags'] = books['author'].add(books['genre']).add(books['desc'])
# Show the tags of the first book as a raw array.
books['tags'].head(1).to_numpy()

array([list(['GladysM.Hunt', 'Parenting', 'Education', 'Nonfiction', 'Writing', 'BooksAboutBooks', 'Reference', 'Christian', 'Childrens', 'Family', 'Literature', 'Teaching', 'Family', 'favorite', 'now', 'revised', 'and', 'updated,', 'including', 'an', 'annotated', 'list', 'of', 'books', 'for', 'ages', '0-12,Everything', 'parents', 'need', 'to', 'know', 'to', 'find', 'the', 'best', 'books', 'for', 'their', 'children,Since', 'its', 'publication', 'in', '1969,', 'this', 'has', 'been', 'an', 'essential', 'guide', 'for', 'parents', 'wanting', 'to', 'find', 'the', 'best', 'books', 'for', 'their', 'children.', 'Now', 'in', 'its', 'fourth', 'edition,', ',Honey', 'for', 'a', 'Childâ€™s', 'Heart,', 'discusses', 'everything', 'from', 'the', 'ways', 'reading', 'affects', 'both', 'childrenâ€™s', 'view', 'of', 'the', 'world', 'and', 'their', 'imagination', 'to', 'how', 'to', 'choose', 'good', 'books.', 'Illustrated', 'with', 'drawings', 'from', 'dozens', 'of', 'favorites,', 'it', 'includes', 'an', '

In [16]:
# Working frame for the recommender: every book column plus the combined tags.
# .copy() makes new_df independent of `books`, so the assignments made to
# new_df['tags'] afterwards do not write into a slice of `books`
# (the pattern that triggers SettingWithCopyWarning).
# A slimmer alternative would keep only title, author, isbn and tags.
new_df = books[['author','desc','genre','img','isbn','link','pages','rating','reviews','title','totalratings','tags']].copy()
new_df.head(1)

Unnamed: 0,author,desc,genre,img,isbn,link,pages,rating,reviews,title,totalratings,tags
0,[GladysM.Hunt],"[Family, favorite, now, revised, and, updated,...","[Parenting, Education, Nonfiction, Writing, Bo...",https://i.gr-assets.com/images/S/compressed.ph...,310242460,https://goodreads.com/book/show/126830.Honey_f...,251,4.44,341,Honey for a Child's Heart: The Imaginative Use...,3814,"[GladysM.Hunt, Parenting, Education, Nonfictio..."


In [17]:
# Flatten each tag list into one space-separated, lower-cased string.
# Run this cell only once: joining an already-joined string again would put a
# space between every character.
# The assignment goes through .loc because plain new_df['tags'] = ... raised
# an error/warning here.
new_df.loc[:, 'tags'] = new_df['tags'].str.join(" ").str.lower()
new_df.head(1)

Unnamed: 0,author,desc,genre,img,isbn,link,pages,rating,reviews,title,totalratings,tags
0,[GladysM.Hunt],"[Family, favorite, now, revised, and, updated,...","[Parenting, Education, Nonfiction, Writing, Bo...",https://i.gr-assets.com/images/S/compressed.ph...,310242460,https://goodreads.com/book/show/126830.Honey_f...,251,4.44,341,Honey for a Child's Heart: The Imaginative Use...,3814,gladysm.hunt parenting education nonfiction wr...


In [18]:
# Display the full tags string of the first book.
new_df.iloc[0].tags

'gladysm.hunt parenting education nonfiction writing booksaboutbooks reference christian childrens family literature teaching family favorite now revised and updated, including an annotated list of books for ages 0-12,everything parents need to know to find the best books for their children,since its publication in 1969, this has been an essential guide for parents wanting to find the best books for their children. now in its fourth edition, ,honey for a childâ€™s heart, discusses everything from the ways reading affects both childrenâ€™s view of the world and their imagination to how to choose good books. illustrated with drawings from dozens of favorites, it includes an indexed and updated list of the best new books on the market and the classics that you want your children to enjoy. author gladys huntâ€™s tastes are broad, her advice is rooted in experience, and her suggestions will enrich the cultural and spiritual life of any home.'

In [19]:
def stem(text, stemmer=None):
    """Stem every whitespace-separated token of ``text`` and re-join them.

    Parameters
    ----------
    text : str
        Text to process. It is tokenised with ``str.split()``, so any run of
        whitespace counts as one separator.
    stemmer : object, optional
        Any object exposing ``stem(token) -> str``. When omitted, a single
        ``PorterStemmer`` is created for this call.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces ("" if ``text`` has no tokens).
    """
    if stemmer is None:
        # Build the stemmer once per call instead of once per token.
        stemmer = PorterStemmer()
    return " ".join(stemmer.stem(token) for token in text.split())

In [20]:
# new_df['tags'].apply(stem)
# Replace every tags string with its stemmed version.
new_df.loc[:, 'tags'] = new_df['tags'].map(stem)

In [21]:
# Display the tags string of the first book again, now after stemming.
new_df.iloc[0].tags

'gladysm.hunt parent educ nonfict write booksaboutbook refer christian children famili literatur teach famili favorit now revis and updated, includ an annot list of book for age 0-12,everyth parent need to know to find the best book for their children,sinc it public in 1969, thi ha been an essenti guid for parent want to find the best book for their children. now in it fourth edition, ,honey for a childâ€™ heart, discuss everyth from the way read affect both childrenâ€™ view of the world and their imagin to how to choos good books. illustr with draw from dozen of favorites, it includ an index and updat list of the best new book on the market and the classic that you want your children to enjoy. author gladi huntâ€™ tast are broad, her advic is root in experience, and her suggest will enrich the cultur and spiritu life of ani home.'

In [22]:
# Bag-of-words vectoriser: vocabulary capped at 5000 terms, with scikit-learn's
# built-in English stop-word list filtered out.
# NOTE(review): the tags are stemmed before vectorising, so stop words changed
# by the stemmer (e.g. "thi", "ha", "ani" in the stemmed output) presumably no
# longer match the stop-word list - confirm whether that is intended.
cv = CountVectorizer(max_features=5000,stop_words='english')

In [23]:
# Learn the vocabulary from the tags and build the dense document-term count
# matrix: one row per book, one column per vocabulary term.
vector = cv.fit_transform(new_df['tags']).toarray()
# Shape check: (number of books, vocabulary size).
vector.shape

(13270, 5000)

In [24]:
# Pairwise cosine similarity between every pair of book vectors; entry [i, j]
# compares the book at row i with the book at row j.
# NOTE(review): the result is a dense books-by-books matrix - for 13270 books
# that is roughly 1.4 GB if stored as float64; confirm this fits in memory.
similarity = cosine_similarity(vector)
similarity.shape

(13270, 13270)

In [25]:
# similarity

In [26]:
# new_df[new_df['title'] == 'Seductive Fantasy'].index[0]

In [27]:
# Persist the processed book table and the similarity matrix.
# The files are opened in `with` blocks so each handle is flushed and closed
# as soon as its dump finishes, instead of being left open.
with open('book_list.pkl', 'wb') as book_list_file:
    pickle.dump(new_df, book_list_file)
with open('similarity_matrix.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)
# pickle.dump(books,open('original_book_list.pkl','wb'))

In [28]:
# books.iloc[4[0]].img
# books.head(1)

In [29]:
# books.iloc[6].index[0]
# Index label(s) of the row(s) whose title is exactly 'Grandpa Green'.
books.index[books['title'].eq('Grandpa Green')]

Index([11017], dtype='int64')