In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

**Load the data from csv files**

In [None]:
# Book metadata, decoded as ISO-8859-1 (Latin-1); preview the first rows.
books = pd.read_csv(
    '../input/books.csv',
    encoding="ISO-8859-1",
)
books.head(n=5)

In [None]:
books.shape

In [None]:
books.columns

In [None]:
# User ratings, decoded as ISO-8859-1 (Latin-1); preview the first rows.
# NOTE(review): `ratings` is not used by any later cell visible in this notebook.
ratings = pd.read_csv(
    '../input/ratings.csv',
    encoding="ISO-8859-1",
)
ratings.head(n=5)

In [None]:
# Book-to-tag assignments, decoded as ISO-8859-1 (Latin-1); joined to the tag
# names via `tag_id` and to the books via `goodreads_book_id` further down.
book_tags = pd.read_csv(
    '../input/book_tags.csv',
    encoding="ISO-8859-1",
)
book_tags.head(n=5)

In [None]:
# Tag dictionary (read with pandas' default encoding); preview the last rows.
tags = pd.read_csv(
    '../input/tags.csv',
)
tags.tail(n=5)

In [None]:
# Resolve every book-tag assignment to its tag row by joining on the shared
# `tag_id` column; assignments whose tag_id is missing from `tags` are dropped.
tags_join_DF = book_tags.merge(tags, on='tag_id', how='inner')
tags_join_DF.head(n=5)

In [None]:
# "To read" lists (read with pandas' default encoding); preview the first rows.
# NOTE(review): `to_read` is not used by any later cell visible in this notebook.
to_read = pd.read_csv(
    '../input/to_read.csv',
)
to_read.head(n=5)

**TfidfVectorizer** is a scikit-learn class that transforms **text into feature vectors** that can be used as input to an estimator.

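**Cosine similarity** is then used to calculate a numeric value that denotes the similarity between two books. Because `TfidfVectorizer` L2-normalises every row by default, the plain dot product computed by `linear_kernel` is exactly the cosine similarity.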

In [None]:
# TF-IDF over the author strings (unigrams and bigrams, English stop words removed).
# min_df must be an int >= 1 (or a float in [0, 1]) in current scikit-learn; an
# integer 0 is rejected by parameter validation. min_df=1 keeps every term, which
# is what min_df=0 did on older versions.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
# A missing author would make the vectorizer raise, so treat it as an empty document.
tfidf_matrix = tf.fit_transform(books['authors'].fillna(''))
# Row i / column j holds the similarity between book rows i and j of `books`.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [None]:
cosine_sim

A function that returns the 20 most similar books based on the cosine similarity score.

In [None]:
# Lookup tables for the author recommender: `titles` holds the title of every
# row of `books`, and `indices` maps a title back to that row's index label.
titles = books.loc[:, 'title']
indices = pd.Series(data=books.index, index=titles)

# Get book recommendations based on the cosine similarity score of book authors.
def authors_recommendations(title):
    """Return the titles of the books whose authors are most similar to `title`'s.

    Parameters
    ----------
    title : str
        Exact book title; looked up in the module-level `indices` Series.

    Returns
    -------
    pandas.Series
        Up to 20 entries of the module-level `titles`, most similar first.
        The queried book itself is never part of the result.

    Raises
    ------
    KeyError
        If `title` is not a label of `indices`.
    """
    idx = indices[title]
    # Several books can share one title, in which case the lookup yields a
    # Series of rows instead of a scalar; use the first matching row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    # Exclude the queried book by row number, not by rank: books with identical
    # author vectors tie with it on score, so it is not guaranteed to sort first.
    sim_scores = [(i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx]
    # list.sort is stable, so tied books keep their original row order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    book_indices = [i for i, _ in sim_scores[:20]]
    return titles.iloc[book_indices]

In [None]:
authors_recommendations('The Hobbit').head(20)

Recommend books using the tags provided to the books.

In [None]:
# Attach the tag rows to their books: `book_id` from `books` is matched against
# `goodreads_book_id` from the tag table, keeping only keys present on both sides.
books_with_tags = books.merge(
    tags_join_DF,
    how='inner',
    left_on='book_id',
    right_on='goodreads_book_id',
)

In [None]:
# books_with_tags[(books_with_tags.goodreads_book_id==18710190)]['tag_name']

In [None]:
# Build ONE document per book (all of its tag names joined by spaces), ordered
# like the rows of `books`. tags_recommendations indexes cosine_sim1 by book row,
# so vectorising individual book-tag rows (as `.head(10000)` of the merged frame
# did) compared tag rows with each other instead of books.
# Books without any tag get an empty document.
tag_docs = (
    books_with_tags.groupby('book_id')['tag_name']
    .apply(' '.join)
    .reindex(books['book_id'])
    .fillna('')
)
# min_df must be an int >= 1 in current scikit-learn; 1 keeps every term.
tf1 = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix1 = tf1.fit_transform(tag_docs)
# Row i / column j holds the tag similarity between book rows i and j of `books`.
cosine_sim1 = linear_kernel(tfidf_matrix1, tfidf_matrix1)

In [None]:
cosine_sim1

In [None]:
# Lookup tables for the tag recommender: `titles1` holds the title of every row
# of `books`, and `indices1` maps a title back to that row's index label.
titles1 = books.loc[:, 'title']
indices1 = pd.Series(data=books.index, index=titles1)

# Get book recommendations based on the cosine similarity score of book tags.
def tags_recommendations(title):
    """Return the titles of the books whose tags are most similar to `title`'s.

    Parameters
    ----------
    title : str
        Exact book title; looked up in the module-level `indices1` Series.

    Returns
    -------
    pandas.Series
        Up to 20 entries of the module-level `titles1`, most similar first.
        The queried book itself is never part of the result.

    Raises
    ------
    KeyError
        If `title` is not a label of `indices1`.
    """
    idx = indices1[title]
    # Several books can share one title, in which case the lookup yields a
    # Series of rows instead of a scalar; use the first matching row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    # Exclude the queried book by row number, not by rank: a book with an
    # identical tag vector ties with it on score and may sort ahead of it.
    sim_scores = [(i, score) for i, score in enumerate(cosine_sim1[idx]) if i != idx]
    # list.sort is stable, so tied books keep their original row order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    book_indices = [i for i, _ in sim_scores[:20]]
    # Use titles1 (built together with indices1), not `titles`, which another
    # cell rebinds to a differently ordered frame.
    return titles1.iloc[book_indices]

In [None]:
tags_recommendations('The Hobbit').head(20)

Recommend books using both the authors and the tags for better results.
Create a corpus that combines these attributes and compute TF-IDF on it to get better recommendations.

In [None]:
# Collapse the book-tag rows into one row per book whose `tag_name` is the
# space-separated concatenation of all of that book's tag names.
temp_df = (
    books_with_tags
    .groupby('book_id')['tag_name']
    .agg(' '.join)
    .reset_index()
)
temp_df.head(n=5)

In [None]:
# Add the per-book tag string to `books`.
# Dropping a `tag_name` column left over from a previous run keeps this cell
# re-runnable: merging twice would otherwise create `tag_name_x`/`tag_name_y`
# and break the corpus cell that reads `books['tag_name']`.
# NOTE: the inner merge drops books without tags and returns a freshly numbered
# frame, so row lookups built from the earlier `books` may no longer line up.
books = pd.merge(
    books.drop(columns=['tag_name'], errors='ignore'),
    temp_df,
    on='book_id',
    how='inner',
)

In [None]:
books.head()

In [None]:
# Text corpus per book: the author string and the tag string separated by one
# space; missing values count as empty strings.
books['corpus'] = pd.Series([
    ' '.join(author_and_tags)
    for author_and_tags in books[['authors', 'tag_name']].fillna('').values.tolist()
])

In [None]:
# TF-IDF over the combined authors + tags corpus (unigrams and bigrams, English
# stop words removed). min_df must be an int >= 1 in current scikit-learn; an
# integer 0 is rejected by parameter validation, and 1 keeps every term anyway.
tf_corpus = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix_corpus = tf_corpus.fit_transform(books['corpus'])
# Row i / column j holds the similarity between rows i and j of the merged `books`.
cosine_sim_corpus = linear_kernel(tfidf_matrix_corpus, tfidf_matrix_corpus)

# Rebuild the lookup tables from the merged `books`: `titles` holds the title of
# every row, and `indices` maps a title back to that row's index label.
# NOTE: this rebinds the globals that authors_recommendations also reads.
titles = books.loc[:, 'title']
indices = pd.Series(data=books.index, index=titles)

# Get book recommendations based on the cosine similarity score of the combined
# authors + tags corpus.
def corpus_recommendations(title):
    """Return the titles of the books whose authors+tags corpus is most similar to `title`'s.

    Parameters
    ----------
    title : str
        Exact book title; looked up in the module-level `indices` Series.

    Returns
    -------
    pandas.Series
        Up to 20 entries of the module-level `titles`, most similar first.
        The queried book itself is never part of the result.

    Raises
    ------
    KeyError
        If `title` is not a label of `indices`.
    """
    # Use `indices`, which is built from the same frame as cosine_sim_corpus and
    # `titles`; `indices1` was built before the tag merge and may point at other rows.
    idx = indices[title]
    # Several books can share one title, in which case the lookup yields a
    # Series of rows instead of a scalar; use the first matching row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    # Exclude the queried book by row number, not by rank: a book with an
    # identical corpus vector ties with it on score and may sort ahead of it.
    sim_scores = [(i, score) for i, score in enumerate(cosine_sim_corpus[idx]) if i != idx]
    # list.sort is stable, so tied books keep their original row order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    book_indices = [i for i, _ in sim_scores[:20]]
    return titles.iloc[book_indices]

corpus_recommendations("The Hobbit")

In [None]:
corpus_recommendations("Twilight (Twilight, #1)")

In [None]:
corpus_recommendations("Romeo and Juliet")