In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('books_new.csv')
df.head()

Unnamed: 0,Title,Author,Genre,SubGenre,Height,Publisher
0,Fundamentals of Wavelets,"Goswami, Jaideva",tech,signal_processing,228,Wiley
1,Data Smart,"Foreman, John",tech,data_science,235,Wiley
2,God Created the Integers,"Hawking, Stephen",tech,mathematics,197,Penguin
3,Superfreakonomics,"Dubner, Stephen",science,economics,179,HarperCollins
4,Orientalism,"Said, Edward",nonfiction,history,197,Penguin


In [3]:
df.shape

(211, 6)

In [4]:
df.isnull().sum()

Title         0
Author       24
Genre         0
SubGenre      0
Height        0
Publisher    96
dtype: int64

In [5]:
df.fillna(df['Author'].mode()[0], inplace = True)
df.fillna(df['Publisher'].mode()[0], inplace = True)

In [6]:
df.isnull().sum()

Title        0
Author       0
Genre        0
SubGenre     0
Height       0
Publisher    0
dtype: int64

In [7]:
df.duplicated().sum()

0

In [8]:
df['tags'] = df['Genre'] + df['SubGenre'] + df['Publisher'] + df['Author']
df

Unnamed: 0,Title,Author,Genre,SubGenre,Height,Publisher,tags
0,Fundamentals of Wavelets,"Goswami, Jaideva",tech,signal_processing,228,Wiley,"techsignal_processingWileyGoswami, Jaideva"
1,Data Smart,"Foreman, John",tech,data_science,235,Wiley,"techdata_scienceWileyForeman, John"
2,God Created the Integers,"Hawking, Stephen",tech,mathematics,197,Penguin,"techmathematicsPenguinHawking, Stephen"
3,Superfreakonomics,"Dubner, Stephen",science,economics,179,HarperCollins,"scienceeconomicsHarperCollinsDubner, Stephen"
4,Orientalism,"Said, Edward",nonfiction,history,197,Penguin,"nonfictionhistoryPenguinSaid, Edward"
...,...,...,...,...,...,...,...
206,Structure and Randomness,"Tao, Terence",science,mathematics,252,"Steinbeck, John","sciencemathematicsSteinbeck, JohnTao, Terence"
207,Image Processing with MATLAB,"Eddins, Steve",tech,signal_processing,241,"Steinbeck, John","techsignal_processingSteinbeck, JohnEddins, Steve"
208,Animal Farm,"Orwell, George",fiction,classic,180,"Steinbeck, John","fictionclassicSteinbeck, JohnOrwell, George"
209,"Idiot, The","Dostoevsky, Fyodor",fiction,classic,197,"Steinbeck, John","fictionclassicSteinbeck, JohnDostoevsky, Fyodor"


In [9]:
df.drop(columns=['Author','Genre','SubGenre','Height','Publisher'], inplace=True)
df.head()

Unnamed: 0,Title,tags
0,Fundamentals of Wavelets,"techsignal_processingWileyGoswami, Jaideva"
1,Data Smart,"techdata_scienceWileyForeman, John"
2,God Created the Integers,"techmathematicsPenguinHawking, Stephen"
3,Superfreakonomics,"scienceeconomicsHarperCollinsDubner, Stephen"
4,Orientalism,"nonfictionhistoryPenguinSaid, Edward"


In [10]:
df.shape

(211, 2)

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=211,stop_words='english')

In [12]:
vector = cv.fit_transform(df['tags']).toarray()

In [13]:
vector.shape

(211, 211)

In [14]:
from sklearn.metrics.pairwise import cosine_similarity

In [15]:
similarity = cosine_similarity(vector)

In [16]:
similarity[0:3]

array([[1.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.  

In [17]:
similarity

array([[1. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 1. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 1. , ..., 0. , 0. , 0. ],
       ...,
       [0. , 0. , 0. , ..., 1. , 0.5, 0.5],
       [0. , 0. , 0. , ..., 0.5, 1. , 0.5],
       [0. , 0. , 0. , ..., 0.5, 0.5, 1. ]])

In [18]:
df[df['Title'] == 'God Created the Integers'].index[0]

2

In [19]:
def recommend(book):
    index = df[df['Title'] == book].index[0]
    distances = sorted(list(enumerate(similarity[index])),reverse=True,key = lambda x: x[1])
    for i in distances[1:6]:
        print(df.iloc[i[0]].Title)

In [20]:
recommend('God Created the Integers')

Superfreakonomics
Freakonomics
Theory of Everything, The
Fundamentals of Wavelets
Data Smart


In [21]:
recommend('Orientalism')

Fundamentals of Wavelets
Data Smart
God Created the Integers
Superfreakonomics
Nature of Statistical Learning Theory, The


In [22]:
recommend('Let Us C')

Pointers in C
Data Structures Using C & C++
Fundamentals of Wavelets
Data Smart
God Created the Integers


In [25]:
recommend('Data Structures Using C & C++')

Let Us C
Fundamentals of Wavelets
Data Smart
God Created the Integers
Superfreakonomics


In [26]:
recommend('Data Structures Using C & C++')

Let Us C
Fundamentals of Wavelets
Data Smart
God Created the Integers
Superfreakonomics


In [23]:
import pickle

In [24]:
pickle.dump(df,open('book_list.pkl','wb'))
pickle.dump(similarity,open('similarity.pkl','wb'))