In [1]:
import numpy as np 
import pandas as pd 
import re
import string
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [3]:
#goodreads data
books_data = pd.read_csv('books.csv',error_bad_lines = False)
tags_data = pd.read_csv('book_tags.csv')
ratings_data = pd.read_csv('ratings.csv')
book_tags = pd.read_csv('tags.csv')

# book crossing data
user_cols = ['user_id', 'location', 'age']
cross_users_data = pd.read_csv('BX-Users.csv', sep=';', names=user_cols, encoding='latin-1', low_memory=False, skiprows=1)
book_cols = ['isbn', 'book_title' ,'book_author','year_of_publication', 'publisher', 'img_s', 'img_m', 'img_l']
cross_books_data = pd.read_csv('BX_Books.csv', sep=';', names=book_cols, encoding='latin-1', low_memory=False, skiprows=1)
rating_cols = ['user_id', 'isbn', 'rating']
cross_ratings_data = pd.read_csv('BX-Book-Ratings.csv', sep=';', names=rating_cols, encoding='latin-1', low_memory=False, skiprows=1)

In [4]:
books_data.head()

Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [5]:
cross_books_data.head()

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher,img_s,img_m,img_l
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton & Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [6]:
books_data = books_data.drop(columns=['id', 'best_book_id', 'work_id', 'isbn', 'isbn13', 'title','work_ratings_count',
                                   'work_text_reviews_count', 'ratings_1', 'ratings_2', 'ratings_3', 'ratings_4', 'ratings_5', 
                                    'image_url','small_image_url'])

In [7]:
#drop unnecessary data
books_data = books_data.dropna()
cross_books_data = cross_books_data.drop(columns=['img_s', 'img_m', 'img_l'])

In [8]:
#Drop Duplicates from all the dataset
ratings_data = ratings_data.sort_values("user_id")
ratings_data.drop_duplicates(subset =["user_id","book_id"], keep = False, inplace = True) 
books_data.drop_duplicates(subset='original_title',keep=False,inplace=True)
book_tags.drop_duplicates(subset='tag_id',keep=False,inplace=True)
tags_data.drop_duplicates(subset=['tag_id','goodreads_book_id'],keep=False,inplace=True)
cross_ratings_data.drop_duplicates(subset =["user_id","isbn"], keep = False, inplace = True) 
cross_books_data.drop_duplicates(subset='book_title',keep=False,inplace=True)

In [9]:
#clean the text
def clean_text(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

In [10]:
cross_books_data['book_title'] = cross_books_data['book_title'].apply(lambda x:clean_text(x))

In [11]:
merge_data = pd.merge(cross_books_data, cross_ratings_data, on='isbn')
merge_data =  merge_data.sort_values('isbn', ascending=True)
merge_data.head()

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher,user_id,rating
626418,0000913154,the way things work an illustrated encyclopedi...,C. van Amerongen (translator),1967,Simon & Schuster,171118,8
587195,0001010565,mogs christmas,Judith Kerr,1992,Collins,209516,0
587194,0001010565,mogs christmas,Judith Kerr,1992,Collins,86123,0
441049,0001046713,twopence to cross the mersey,Helen Forrester,1992,HarperCollins Publishers,196149,0
263949,000104687X,ts eliot reading the wasteland and other poems,T.S. Eliot,1993,HarperCollins Publishers,23902,6


Фильтрация на основе содержания

In [12]:
content_data = books_data[['original_title','authors','average_rating']]
content_data = content_data.astype(str)

In [13]:
content_data['content'] = content_data['original_title'] + ' ' + content_data['authors'] + ' ' + content_data['average_rating']

In [14]:
content_data = content_data.reset_index()
indices = pd.Series(content_data.index, index=content_data['original_title'])

 content based recommodation author

In [18]:
#removing stopwords
tfidf = TfidfVectorizer(stop_words='english')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(content_data['authors'])
tfidf_matrix.shape

(8175, 5484)

In [19]:
#Compute the cosine similarity matrix
cosine_sim_author = linear_kernel(tfidf_matrix, tfidf_matrix)

In [20]:
#author wise recommodation
def get_recommendations_books(title, cosine_sim=cosine_sim_author):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim_author[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    book_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar books
    return list(content_data['original_title'].iloc[book_indices])

In [21]:
def author_book_shows(book):
    for book in book:
        print(book)

In [22]:
books1 = get_recommendations_books('The Hobbit', cosine_sim_author)
author_book_shows(books1)

The Hobbit or There and Back Again
 The Fellowship of the Ring
The Two Towers
The Return of the King
The Lord of the Rings
The Hobbit and The Lord of the Rings
Nikola Tesla: Imagination and the Man That Invented the 20th Century
The Children of Húrin
Entwined
The 7 Habits Of Highly Effective Teens


In [23]:
books2 =get_recommendations_books('Shadow Kiss', cosine_sim_author)
author_book_shows(books2)

Shadow Kiss
Spirit Bound
Blood Promise
Last Sacrifice 
Bloodlines
The Golden Lily
The Indigo Spell
The Fiery Heart
Succubus Blues
Silver Shadows


Коллаборативная фильтрация

In [24]:
merge_data = merge_data[:40000]

In [25]:
book_rating = pd.pivot_table(merge_data, index='user_id', values='rating', columns='book_title', fill_value=0)
book_rating

book_title,Unnamed: 1_level_0,the year china discovered america,a beginners guide,a space odyssey a novel by arthur c clarke,allamerican favorites,an action plan to protect yourself your family your assets and your community on january,backup recovery,barbary lane a tales of the city omnibus,beers and a chinese meal,black chicks review flicks a film and video guide with flava,...,zone food blocks the quick and easy mixandmatch counter for staying in the zone,zone perfect meals in minutes fast and simple healthy recipes from the bestselling authorof the zone and mastering the zone,zoo animals a smithsonian guide smithsonian guides,zoology coloring book,zoyas story an afghan womans struggle for freedom,zucchini out west,zuleika dobson penguin modern classics,zulu dawn,â¡dã­melo tu,â¡trato hecho spanish for real life edition
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
53,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
73,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
86,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278692,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
278771,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
278818,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
278843,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
book_corr = np.corrcoef(book_rating.T)

  c /= stddev[:, None]
  c /= stddev[None, :]


In [27]:
book_corr.shape

(12020, 12020)

In [28]:
book_list=  list(book_rating)
book_titles =[] 
for i in range(len(book_list)):
    book_titles.append(book_list[i])

In [29]:
#Define Recommendation function
def get_recommendation_collabarative(books_list):
    similar_books = np.zeros(book_corr.shape[0])
    
    for book in books_list:    
        book_index = book_titles.index(book)
        similar_books += book_corr[book_index] 
    book_preferences = []
    for i in range(len(book_titles)):
        book_preferences.append((book_titles[i],similar_books[i]))
        
    return sorted(book_preferences, key= lambda x: x[1], reverse=True)

In [30]:
# make a book list
list_of_books = ['one hundred years of solitude',
                 'stardust',
                 'mogs christmas',
                 'dragonmede',
                 'twopence to cross the mersey',
                 'the candywine development']

In [31]:
books3 = get_recommendation_collabarative(list_of_books)

In [33]:
#top similar books collabarative
i=0
n =0
while n < 9:
    similar_books_to_read= books3[i][0]
    i += 1
    if similar_books_to_read in list_of_books:
        continue
    else:
        print(similar_books_to_read)
        n += 1


  the year china discovered america
 a beginners guide
 a space odyssey  a novel by arthur c clarke
 allamerican favorites
 an action plan to protect yourself your family your assets  and your community on january  
 backup  recovery
 barbary lane  a tales of the city omnibus
 beers and a chinese meal
