In [70]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
from collections import Counter
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

import warnings
warnings.filterwarnings("ignore")

In [71]:
# Load the raw CSV exports (semicolon-delimited, ISO-8859-1 encoded).
# `error_bad_lines=False, warn_bad_lines=False` was deprecated in pandas 1.3 and
# removed in pandas 2.0; `on_bad_lines='skip'` is the equivalent spelling:
# rows with too many fields are dropped without a warning.
books = pd.read_csv("Books.csv", delimiter=';', on_bad_lines='skip', encoding='ISO-8859-1')
ratings = pd.read_csv("Book-Ratings.csv", delimiter=';', on_bad_lines='skip', encoding='ISO-8859-1')


In [72]:
print(" Book Data:  ",books.shape)

 Book Data:   (271360, 8)


In [73]:
print(" Columns : " , list(books.columns))

 Columns :  ['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher', 'Image-URL-S', 'Image-URL-M', 'Image-URL-L']


In [74]:
books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [75]:
books.drop(['Image-URL-S','Image-URL-M','Image-URL-L'], axis=1,inplace=True)
books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company


In [76]:
## checking for null values
books.isnull().sum()

ISBN                   0
Book-Title             0
Book-Author            1
Year-Of-Publication    0
Publisher              2
dtype: int64

In [77]:
books.loc[books['Book-Author'].isnull(),:]

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
187689,9627982032,The Credit Suisse Guide to Managing Your Perso...,,1995,Edinburgh Financial Publishing


In [78]:
books.loc[books['Publisher'].isnull(),:]

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
128890,193169656X,Tyrant Moon,Elaine Corvidae,2002,
129037,1931696993,Finders Keepers,Linnea Sinclair,2001,


In [79]:
# Replace missing authors/publishers with the placeholder 'Other'.
# Filling by column instead of by hard-coded row label keeps this cell correct
# even if the row numbering changes (e.g. a different set of malformed lines is
# skipped while parsing the CSV).
books['Book-Author'] = books['Book-Author'].fillna('Other')
books['Publisher'] = books['Publisher'].fillna('Other')

In [80]:
## Checking the year of publication
books['Year-Of-Publication'].unique()

array([2002, 2001, 1991, 1999, 2000, 1993, 1996, 1988, 2004, 1998, 1994,
       2003, 1997, 1983, 1979, 1995, 1982, 1985, 1992, 1986, 1978, 1980,
       1952, 1987, 1990, 1981, 1989, 1984, 0, 1968, 1961, 1958, 1974,
       1976, 1971, 1977, 1975, 1965, 1941, 1970, 1962, 1973, 1972, 1960,
       1966, 1920, 1956, 1959, 1953, 1951, 1942, 1963, 1964, 1969, 1954,
       1950, 1967, 2005, 1957, 1940, 1937, 1955, 1946, 1936, 1930, 2011,
       1925, 1948, 1943, 1947, 1945, 1923, 2020, 1939, 1926, 1938, 2030,
       1911, 1904, 1949, 1932, 1928, 1929, 1927, 1931, 1914, 2050, 1934,
       1910, 1933, 1902, 1924, 1921, 1900, 2038, 2026, 1944, 1917, 1901,
       2010, 1908, 1906, 1935, 1806, 2021, '2000', '1995', '1999', '2004',
       '2003', '1990', '1994', '1986', '1989', '2002', '1981', '1993',
       '1983', '1982', '1976', '1991', '1977', '1998', '1992', '1996',
       '0', '1997', '2001', '1974', '1968', '1987', '1984', '1988',
       '1963', '1956', '1970', '1985', '1978', '1973', '1980'

In [81]:
books.loc[books['Year-Of-Publication'] == 'DK Publishing Inc',:]

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
209538,078946697X,"DK Readers: Creating the X-Men, How It All Beg...",2000,DK Publishing Inc,http://images.amazon.com/images/P/078946697X.0...
221678,0789466953,"DK Readers: Creating the X-Men, How Comic Book...",2000,DK Publishing Inc,http://images.amazon.com/images/P/0789466953.0...


In [82]:
books.loc[books['Year-Of-Publication'] == 'Gallimard',:]

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
220731,2070426769,"Peuple du ciel, suivi de 'Les Bergers\"";Jean-M...",2003,Gallimard,http://images.amazon.com/images/P/2070426769.0...


In [83]:
# Three rows have a publisher name in 'Year-Of-Publication' (see the two lookups
# above): their fields are shifted, so publisher, year and author are restored
# by hand here.
# Each row's author is written to that same row; previously all three author
# assignments targeted row 209538, leaving the other two rows with a wrong author.
# NOTE(review): 'Book-Title' of these rows is not repaired here and may still
# contain the embedded author text - confirm against the raw file.
books.at[209538, 'Publisher'] = 'DK Publishing Inc'
books.at[209538, 'Year-Of-Publication'] = 2000
books.at[209538, 'Book-Author'] = 'Michael Teitbaum'

books.at[221678, 'Publisher'] = 'DK Publishing Inc'
books.at[221678, 'Year-Of-Publication'] = 2000
books.at[221678, 'Book-Author'] = 'James Buckley'

books.at[220731, 'Publisher'] = 'Gallimard'
books.at[220731, 'Year-Of-Publication'] = '2003'
books.at[220731, 'Book-Author'] = 'Jean-Marie Gustave Le ClÃ?Â©zio'


In [84]:
## Converting year of publication to integers
books['Year-Of-Publication'] = books['Year-Of-Publication'].astype(int)

In [85]:
print(sorted(list(books['Year-Of-Publication'].unique())))


[0, 1376, 1378, 1806, 1897, 1900, 1901, 1902, 1904, 1906, 1908, 1909, 1910, 1911, 1914, 1917, 1919, 1920, 1921, 1922, 1923, 1924, 1925, 1926, 1927, 1928, 1929, 1930, 1931, 1932, 1933, 1934, 1935, 1936, 1937, 1938, 1939, 1940, 1941, 1942, 1943, 1944, 1945, 1946, 1947, 1948, 1949, 1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2008, 2010, 2011, 2012, 2020, 2021, 2024, 2026, 2030, 2037, 2038, 2050]


In [86]:
## most repeated year
books['Year-Of-Publication'].mode()

0    2002
Name: Year-Of-Publication, dtype: int64

In [87]:
# Years beyond 2023 or equal to 0 are treated as invalid and overwritten with
# 2002, the modal publication year computed in the previous cell.
invalid_year = (books['Year-Of-Publication'] > 2023) | (books['Year-Of-Publication'] == 0)
books.loc[invalid_year, 'Year-Of-Publication'] = 2002

In [88]:
## Uppercasing all alphabets in ISBN
books['ISBN'] = books['ISBN'].str.upper()

In [89]:
## Drop duplicate rows
books.drop_duplicates(keep='last', inplace=True)
books.reset_index(drop = True, inplace = True)

In [90]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271047 entries, 0 to 271046
Data columns (total 5 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ISBN                 271047 non-null  object
 1   Book-Title           271047 non-null  object
 2   Book-Author          271047 non-null  object
 3   Year-Of-Publication  271047 non-null  int64 
 4   Publisher            271047 non-null  object
dtypes: int64(1), object(4)
memory usage: 10.3+ MB


In [91]:
books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company


In [92]:
print("Columns: ", list(ratings.columns))
ratings.head()

Columns:  ['User-ID', 'ISBN', 'Book-Rating']


Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [93]:
## Checking for null values
ratings.isnull().sum()

User-ID        0
ISBN           0
Book-Rating    0
dtype: int64

In [94]:
ratings.loc[ratings["ISBN"].isnull(),:]

Unnamed: 0,User-ID,ISBN,Book-Rating


In [95]:
ratings.dropna(inplace=True)

In [96]:
ratings.isnull().sum()

User-ID        0
ISBN           0
Book-Rating    0
dtype: int64

In [97]:
## Checking ISBN: prints "True" only if every ratings ISBN is purely alphanumeric
reg = "[^A-Za-z0-9]"

# Vectorized search for any character outside [A-Za-z0-9]; replaces a
# Python-level loop over every row. `flag` is 1 when at least one ISBN
# contains such a character.
flag = int(ratings['ISBN'].str.contains(reg, regex=True).any())

if flag == 1:
    print("False")
else:
    print("True")

False


In [98]:
## Removing extra characters from ratings ISBNs when the cleaned value exists in the books dataset
bookISBN = books['ISBN'].tolist()
# Set membership is O(1); scanning the list for every candidate row is not.
known_isbns = set(bookISBN)
reg = "[^A-Za-z0-9]"

# Strip every non-alphanumeric character in one vectorized pass.
cleaned = ratings['ISBN'].str.replace(reg, "", regex=True)

# Only rewrite rows that (a) actually contained extra characters and (b) map to
# a known book after cleaning. The comparison is done on the upper-cased value
# because books ISBNs were upper-cased earlier while ratings ISBNs are only
# upper-cased in the next cell; otherwise a lowercase check digit would never match.
fixable = (cleaned != ratings['ISBN']) & cleaned.str.upper().isin(known_isbns)
ratings.loc[fixable, 'ISBN'] = cleaned[fixable]

In [99]:
## Uppercasing all alphabets in ISBN
ratings['ISBN'] = ratings['ISBN'].str.upper()

In [100]:
## Drop duplicate rows
ratings.drop_duplicates(keep='last', inplace=True)
ratings.reset_index(drop=True, inplace=True)

In [101]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149777 entries, 0 to 1149776
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1149777 non-null  int64 
 1   ISBN         1149777 non-null  object
 2   Book-Rating  1149777 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 26.3+ MB


## Merging the books and ratings datasets

In [102]:
dataset = pd.merge(books, ratings, on='ISBN', how='inner')

In [103]:

# Rows with a non-zero 'Book-Rating', re-indexed from 0.
has_rating = dataset['Book-Rating'] != 0
dataset1 = dataset.loc[has_rating].reset_index(drop=True)
dataset1.shape

(384074, 7)

In [104]:
# Rows whose 'Book-Rating' is 0, re-indexed from 0.
zero_rating = dataset['Book-Rating'] == 0
dataset2 = dataset.loc[zero_rating].reset_index(drop=True)
dataset2.shape

(647535, 7)

In [146]:
dataset1.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,User-ID,Book-Rating
0,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,8,5
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,11676,8
2,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,67544,8
3,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,116866,9
4,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,123629,9


In [147]:
dataset2.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,User-ID,Book-Rating,content,final
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,2,0,Classical Mythology Mark P. O. Morford Oxford ...,classical mythology mark morford oxford univer...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,11400,0,Clara Callan Richard Bruce Wright HarperFlamin...,clara callan richard bruce wright harperflamin...
2,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,41385,0,Clara Callan Richard Bruce Wright HarperFlamin...,clara callan richard bruce wright harperflamin...
3,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,85526,0,Clara Callan Richard Bruce Wright HarperFlamin...,clara callan richard bruce wright harperflamin...
4,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,96054,0,Clara Callan Richard Bruce Wright HarperFlamin...,clara callan richard bruce wright harperflamin...


# Recommendation System

In [148]:
bookName = input("Enter a book name: ")
number = int(input("Enter number of books to recommend: "))

#Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))

Enter a book name: Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))
Enter number of books to recommend: 5


## Popularity Based (Top in the Whole Collection)

In [149]:
def popularity_based(dataframe, n):
    """Return the `n` ISBNs with the most ratings, joined with their book details.

    Parameters
    ----------
    dataframe : DataFrame with at least 'ISBN' and 'Book-Rating' columns.
    n : number of books requested; must satisfy 1 <= n <= len(dataframe).

    Returns
    -------
    DataFrame of the top `n` ISBNs ('Book-Rating' holds the rating count) merged
    with the module-level `books` frame on 'ISBN', or the string
    "Invalid number of books entered!!" when `n` is out of range.
    """
    if not (1 <= n <= len(dataframe)):
        return "Invalid number of books entered!!"

    # Number of rating rows per ISBN, most-rated first.
    rating_counts = pd.DataFrame(dataframe.groupby('ISBN')['Book-Rating'].count())
    top_rated = rating_counts.sort_values('Book-Rating', ascending=False).head(n)
    return pd.merge(top_rated, books, on='ISBN')

In [150]:
print("Top", number, "Popular books are: ")
popularity_based(dataset1,number)

Top 5 Popular books are: 


Unnamed: 0,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,316666343,707,The Lovely Bones: A Novel,Alice Sebold,2002,"Little, Brown"
1,971880107,581,Wild Animus,Rich Shapero,2004,Too Far
2,385504209,488,The Da Vinci Code,Dan Brown,2003,Doubleday
3,312195516,383,The Red Tent (Bestselling Backlist),Anita Diamant,1998,Picador USA
4,60928336,320,Divine Secrets of the Ya-Ya Sisterhood: A Novel,Rebecca Wells,1997,Perennial


## Books by the same author and publisher as the given book

In [151]:
def printBook(k, n):
    """Print up to `n` distinct book titles from `k`, in order of first appearance.

    Parameters
    ----------
    k : DataFrame with a 'Book-Title' column.
    n : maximum number of titles to print. Nothing is printed when `n <= 0`
        (previously one title was printed even then).
    """
    for position, title in enumerate(k['Book-Title'].unique()):
        # Stop before printing once `n` titles have been emitted.
        if position >= n:
            break
        print(title)

In [154]:
printBook(dataset,5)

Classical Mythology
Clara Callan
Decision in Normandy
Flu: The Story of the Great Influenza Pandemic of 1918 and the Search for the Virus That Caused It
The Mummies of Urumchi


In [155]:
def _print_related(candidates, column, value, n):
    """Print up to `n` titles from `candidates` whose `column` equals `value`, sorted by 'Book-Rating'."""
    # An empty selection simply prints nothing, instead of relying on a variable
    # that is only assigned when a match exists.
    related = candidates[candidates[column] == value].sort_values(by=['Book-Rating'])
    printBook(related, n)


def get_books(dataframe, name, n):
    """Print other books by the same author and by the same publisher as a given book.

    Parameters
    ----------
    dataframe : rows of the chosen book; the first unique 'Book-Author' and
        'Publisher' values are used as the reference author/publisher.
    name : title of the chosen book; rows with this title are excluded.
    n : maximum number of titles printed per section.

    Candidates come from the module-level `dataset1`. Within each section rows
    are ordered by ascending 'Book-Rating' before titles are printed.
    NOTE(review): ascending order lists the lowest-rated rows first - confirm
    this is intended for a recommendation list.
    """
    data = dataset1[dataset1['Book-Title'] != name]

    print("\nBooks by same Author:\n")
    _print_related(data, 'Book-Author', dataframe['Book-Author'].unique()[0], n)

    print("\n\nBooks by same Publisher:\n")
    _print_related(data, 'Publisher', dataframe['Publisher'].unique()[0], n)

In [156]:
# Look up the requested title among the rated books and show related books.
title_matches = dataset1['Book-Title'] == bookName
if title_matches.any():
    d = dataset1[title_matches]
    get_books(d, bookName, number)
else:
    print("Invalid Book Name!!")


Books by same Author:

Harry Potter and the Goblet of Fire (Book 4)
Harry Potter and the Chamber of Secrets (Book 2)
Harry Potter y el cÃ¡liz de fuego
Harry Potter and the Order of the Phoenix (Book 5)
Harry Potter and the Prisoner of Azkaban (Book 3)


Books by same Publisher:

The Seeing Stone
The Slightly True Story of Cedar B. Hartley: Who Planned to Live an Unusual Life
The Story of the Seagull and the Cat Who Taught Her To Fly
Harry Potter and the Chamber of Secrets (Harry Potter)
The Mouse and His Child


## Recommendation system for Dataset2

In [157]:
dataset2['content'] = dataset2['Book-Title'] +' '+dataset2['Book-Author'] + ' ' + dataset2['Publisher']


In [158]:
# defining own Stopwords with the help of stopwords in nltk
stop = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',    'yourself', 'more', 'if', 'on', 'don', "don't", 'once', 'this', 'being', 'as', 'there', 'should', 'its', 'been', 'didn', "you're", 'me', 'but', 'than', 'just', 'an', 'when', 'after', 'now', 'your', "needn't", 'aren', 'that', 'during', 've', 'yourselves', "you'll", 'who', 'whom', "hadn't", "weren't", "shouldn't", 'at', 'the', 'can', 'herself', 'from', 'those', "she's", 'hers', 'so', 'for', 'll', 'do', "didn't", 't', 's', 'to', 'wouldn', "hasn't", 'before', "shan't", 'too', "mightn't", 'above', 'most', 'him', 'theirs', 'has', 'she', 'i', 'here', 'be', 'because', 'd', 'y', "couldn't", 'doesn', "wouldn't", 'ma', 'all', 'doing', 'himself', 'are', 'o', "doesn't", 'what', 'my', 'up', "you've", 'nor', 'couldn', 'ours', 'his', 'themselves', 'which', 'in', 'having', "isn't", 'while', 'shan', 'below', 'ain', "won't", 'these', 'needn', "it's", 'why', 'were', 'mightn', 'won', 'itself', 'mustn', 'was', 'against', 'of', 'then', 'both', "wasn't", 'by', 'or', 'myself', 'a', 'only', 'we', 'down', 'no', 'between', 'some', 'hadn', 'where', 'until', 'other', 'did', 'they', 'have', "haven't", 'further', 'you', 'had', 'yours', 'through', 'same', "should've", 'he', 'off', 'will', 'few', 'ourselves', 'how', "aren't", 'wasn', 'our', 'isn', 'them', 'into', "that'll", "mustn't", 'such', 'their', 'hasn', 'm', 'weren', 'about', 'under', 'it', 'does', 'not', 'haven', "you'd", 'over', 'out', 'any', 'and', 'each', 'very', 'her', 'with', 'own', 're', 'shouldn', 'am', 'again', 'is']


In [159]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [160]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [161]:
# Normalize every 'content' string: keep word characters only, lowercase,
# tokenize, drop stop words and lemmatize. The result list `clear` is aligned
# with the row order of dataset2.
lemmatizer = WordNetLemmatizer()
stop_words = set(stop)  # O(1) membership instead of scanning the list per token
clear = []
# Iterate the column directly rather than doing a `.loc[i, ['content']][0]`
# lookup per row, which is slow and relies on deprecated positional indexing
# of a label-indexed Series.
for text in dataset2['content']:
    tokens = word_tokenize(' '.join(re.findall(r'\w+|\d+', text)).lower())
    clear.append(' '.join(lemmatizer.lemmatize(word) for word in tokens if word not in stop_words))

In [162]:
dataset2['final'] = clear

In [163]:
# Fit a unigram count vectorizer on the cleaned text and keep the resulting
# document-term matrix.
countvector = CountVectorizer(ngram_range=(1,1))
books_bow = countvector.fit_transform(dataset2['final'])
# Fit a unigram TF-IDF vectorizer on the same text; `books_tfidf` is the matrix
# the recommendation functions compare queries against.
tfidf = TfidfVectorizer(ngram_range=(1,1))
books_tfidf = tfidf.fit_transform(dataset2['final'])
def preprocess(user_text):
    """Clean a single query string the same way the 'content' column is cleaned.

    Keeps word characters only, lowercases, tokenizes, removes stop words and
    lemmatizes. Returns a one-element list holding the cleaned string, the
    shape the vectorizers' `transform` expects.
    """
    normalized = ' '.join(re.findall(r'\w+|\d+', user_text)).lower()
    kept = [lemmatizer.lemmatize(word) for word in word_tokenize(normalized) if word not in stop]
    return [" ".join(kept)]

In [164]:
def recommend(user_text, n=5):
    """Return the `n` rows of dataset2 most similar to `user_text`.

    Parameters
    ----------
    user_text : free-text query; cleaned with `preprocess` before vectorizing.
    n : number of rows to return (default 5, the previously hard-coded value).

    Returns
    -------
    DataFrame with 'ISBN', 'Book-Title', 'Book-Author', 'Publisher',
    'Book-Rating' and a 'score' column (cosine similarity), best match first.

    The query is vectorized with the fitted `tfidf` vectorizer, i.e. in the same
    space as `books_tfidf`, and a single similarity ranking provides both the
    selected rows and their scores. Previously rows were selected with a
    count-vector query while the scores came from a separate TF-IDF ranking, so
    a score could belong to a different row than the one it was shown next to.
    """
    query = tfidf.transform(preprocess(user_text))
    similarity = pd.Series(cosine_similarity(books_tfidf, query).flatten())
    top = similarity.nlargest(n)  # already ordered from highest to lowest

    result = dataset2.loc[top.index, ['ISBN', 'Book-Title', 'Book-Author', 'Publisher',
                                      'Book-Rating']].copy()
    result['score'] = top.values
    return result

In [165]:
def recommendBIG(user_text):
    """Recommend books for a short query by expanding it to its best-matching book.

    The query is matched against `books_tfidf`; the cleaned text ('final') of
    the single best-matching row is then passed to `recommend`.

    Returns the DataFrame produced by `recommend`, or None (after printing
    'No Match Found') when the best similarity is 0.
    """
    # NOTE(review): the query is vectorized with `countvector` while the corpus
    # matrix is TF-IDF weighted - confirm whether `tfidf.transform` was intended.
    query = countvector.transform(preprocess(user_text))
    # Computed once; the same ranking used to be recomputed to read the top score.
    similarity = pd.Series(cosine_similarity(books_tfidf, query).flatten())
    top_index = similarity.idxmax()
    top_score = similarity.max()

    if top_score == 0:
        print('No Match Found')
        return None

    input_2 = dataset2.loc[top_index, 'final']
    return recommend(input_2)

In [166]:
recommendBIG('christianity religion')['Book-Title'].values.tolist()


['Religion in Society: A Sociology of Religion',
 'Religion &amp; the Decline of Magic',
 'Philosophy of Religion (4th Edition)',
 'Philosophy of Religion (4th Edition)',
 'Women and World Religions, Second Edition']

In [167]:
 recommendBIG('christianity religion')['Book-Author'].values.tolist()

['Ronald L. Johnstone',
 'Keith Thomas',
 'John H. Hick',
 'John H. Hick',
 'Denise Carmody']

In [168]:
recommendBIG('o')

No Match Found


In [169]:
import pickle

# Persist the fitted document-term matrices. The `with` blocks make sure each
# file handle is flushed and closed, even if pickling raises.
with open('books_bow.pkl', 'wb') as bow_file:
    pickle.dump(books_bow, bow_file)
with open('books_tfidf.pkl', 'wb') as tfidf_file:
    pickle.dump(books_tfidf, tfidf_file)

In [174]:
# Persist both dataset splits. The `with` blocks make sure each file handle is
# flushed and closed, even if pickling raises.
with open('dataset1.pkl', 'wb') as dataset1_file:
    pickle.dump(dataset1, dataset1_file)
with open('dataset2.pkl', 'wb') as dataset2_file:
    pickle.dump(dataset2, dataset2_file)
