## Importing the libraries and data

In [1]:
import pandas as pd
import numpy as np
import os
pd.options.display.max_rows=350
pd.options.display.max_columns=40


import warnings
warnings.filterwarnings('ignore')

In [2]:
#df=pd.read_csv('CHILDREN_TOTAL_FOR_RECOMMENDER.csv')
df=pd.read_csv('FINAL_COMPLETE_DF.csv')

In [3]:
df.columns

Index(['user_id', 'book_id', 'review_id', 'rating', 'review_text', 'isbn',
       'text_reviews_count', 'is_ebook', 'average_rating', 'description',
       'format', 'publisher', 'num_pages', 'isbn13', 'publication_year',
       'ratings_count', 'title', 'descriptiondetect', 'titledetect',
       'review_text_detect'],
      dtype='object')

## Transforming the data and creating the sparse matrix

In [4]:
from scipy.sparse import csr_matrix

def create_X(df):
    """
    Generates a sparse matrix from ratings dataframe.
    
    Args:
        df: pandas dataframe
    
    Returns:
        X: sparse matrix
        user_mapper: dict that maps user id's to user indices
        user_inv_mapper: dict that maps user indices to user id's
        book_mapper: dict that maps book id's to book indices
        book_inv_mapper: dict that maps book indices to book id's
    """
    N = df['user_id'].nunique()
    M = df['book_id'].nunique()

    user_mapper = dict(zip(np.unique(df["user_id"]), list(range(N))))
    book_mapper = dict(zip(np.unique(df["book_id"]), list(range(M))))
    
    user_inv_mapper = dict(zip(list(range(N)), np.unique(df["user_id"])))
    book_inv_mapper = dict(zip(list(range(M)), np.unique(df["book_id"])))
    
    user_index = [user_mapper[i] for i in df['user_id']]
    book_index = [book_mapper[i] for i in df['book_id']]
    print(len(user_index))
    X = csr_matrix((df["rating"], (book_index, user_index)), shape=(M, N))
    
    return X, user_mapper, book_mapper, user_inv_mapper, book_inv_mapper

In [5]:
df['book_id'].nunique()

60004

In [6]:
df['user_id'].nunique()

67843

In [7]:
X, user_mapper, book_mapper,user_inv_mapper, book_inv_mapper = create_X(df)

496634


In [8]:
X

<60004x67843 sparse matrix of type '<class 'numpy.int64'>'
	with 496634 stored elements in Compressed Sparse Row format>

In [9]:
book_mapper

{5: 0,
 50: 1,
 61: 2,
 93: 3,
 236: 4,
 241: 5,
 244: 6,
 314: 7,
 329: 8,
 330: 9,
 333: 10,
 334: 11,
 335: 12,
 378: 13,
 881: 14,
 883: 15,
 903: 16,
 1163: 17,
 1165: 18,
 1166: 19,
 1852: 20,
 1931: 21,
 2086: 22,
 2231: 23,
 2257: 24,
 2316: 25,
 2384: 26,
 2391: 27,
 2404: 28,
 2718: 29,
 2841: 30,
 2848: 31,
 2851: 32,
 2862: 33,
 2998: 34,
 3008: 35,
 3012: 36,
 3014: 37,
 3119: 38,
 3120: 39,
 3143: 40,
 3144: 41,
 3150: 42,
 3155: 43,
 3156: 44,
 3325: 45,
 3408: 46,
 3540: 47,
 3562: 48,
 3570: 49,
 3579: 50,
 3580: 51,
 3636: 52,
 3638: 53,
 3685: 54,
 3742: 55,
 3743: 56,
 3782: 57,
 3783: 58,
 3791: 59,
 3797: 60,
 3799: 61,
 3802: 62,
 3839: 63,
 3884: 64,
 3894: 65,
 3987: 66,
 4117: 67,
 4352: 68,
 4357: 69,
 4479: 70,
 4523: 71,
 4528: 72,
 4536: 73,
 4538: 74,
 4540: 75,
 4541: 76,
 4600: 77,
 4606: 78,
 4735: 79,
 4821: 80,
 4825: 81,
 4947: 82,
 4948: 83,
 4967: 84,
 5019: 85,
 5202: 86,
 5209: 87,
 5212: 88,
 5291: 89,
 5321: 90,
 5370: 91,
 5371: 92,
 5401: 93

Let's check out the sparsity of our X matrix

In [10]:
X.shape
#[0]
X.shape[1]

67843

In [11]:
X.count_nonzero()

481022

In [12]:
X.shape[0]*X.shape[1]

4070851372

In [13]:
sparsity = X.count_nonzero()/(X.shape[0]*X.shape[1])

print("Matrix sparsity:", round(sparsity*100,2))

Matrix sparsity: 0.01


Only 0.01% of cells in our user-item matrix are populated with ratings.

### Writing our matrix to a file

We're going to save our user-item matrix. Since our matrix is represented as a scipy sparse matrix, we can use the [scipy.sparse.save_npz](https://docs.scipy.org/doc/scipy-1.1.0/reference/generated/scipy.sparse.load_npz.html) method to write the matrix to a file. 

In [14]:
from scipy.sparse import save_npz

save_npz('user_item_matrix_books.npz', X)

##  Finding similar books using k-Nearest Neighbours

This approach looks for the $k$ nearest neighbours of a given book by identifying $k$ points in the dataset that are closest to book $m$. 
kNN makes use of distance metrics such as:

1. Cosine similarity
2. Euclidean distance
3. Manhattan distance
4. Pearson correlation 

We are working in a M-dimensional space where M represents the number of movies in our X matrix. 

###  Let's fit our model: NearestNeighbors

In [15]:
from sklearn.neighbors import NearestNeighbors

kNN = NearestNeighbors(n_neighbors=10,  metric='cosine')
kNN
kNN.fit(X)

NearestNeighbors(metric='cosine', n_neighbors=10)

### Let's define the book that we want to assess

In [16]:
book_mapper;

In [17]:
book_titles = dict(zip(df['book_id'], df['title']))
book_titles

{3636: 'The Giver (The Giver, #1)',
 11387515: 'Wonder (Wonder #1)',
 5: 'Harry Potter and the Prisoner of Azkaban (Harry Potter, #3)',
 157993: 'The Little Prince',
 38709: 'Holes (Holes, #1)',
 370493: 'The Giving Tree',
 389627: 'Diary of a Wimpy Kid (Diary of a Wimpy Kid, #1)',
 9673436: 'The Invention of Hugo Cabret',
 24178: "Charlotte's Web",
 78411: 'The Bad Beginning (A Series of Unfortunate Events, #1)',
 8127: 'Anne of Green Gables (Anne of Green Gables, #1)',
 11594337: 'The One and Only Ivan',
 2998: 'The Secret Garden',
 50: "Hatchet (Brian's Saga, #1)",
 30119: 'Where the Sidewalk Ends',
 19543: 'Where the Wild Things Are',
 10365: 'Where the Red Fern Grows',
 83369: 'The Mysterious Benedict Society (The Mysterious Benedict Society, #1)',
 37190: 'The Tale of Despereaux',
 378: 'The Phantom Tollbooth',
 23772: 'Green Eggs and Ham',
 37186: 'The Miraculous Journey of Edward Tulane',
 6319: 'The BFG',
 357664: 'Because of Winn-Dixie',
 236093: 'The Wonderful Wizard of Oz (

In [18]:
book_title = book_titles[6310]
book_title

'Charlie and the Chocolate Factory (Charlie Bucket, #1)'

In [19]:
book_id = 6310
book_ind = book_mapper[book_id]
book_ind #the value of the book id in our sparse matrix

116

In [20]:
book_to_assess = X[book_ind]
book_to_assess

<1x67843 sparse matrix of type '<class 'numpy.int64'>'
	with 730 stored elements in Compressed Sparse Row format>

In [21]:
 neighbour = kNN.kneighbors(book_to_assess, return_distance=False) #Finds the K-neighbors of a point.

In [22]:
neighbour

array([[  116,   135,   136,   118,   139,   117,   537,   710,   131,
        21045]], dtype=int64)

### Coming back to the original id from the book

In [23]:
book_inv_mapper[116]

6310

In [24]:
neighbour_ids = []
for i in range(1,10):
    n = neighbour.item(i) 
    neighbour_ids.append(book_inv_mapper[n])
    
neighbour_ids

[6687, 6689, 6327, 6693, 6319, 24178, 31456, 6670, 1728744]

In [25]:
print("Because you read:",book_title)
print('******************************************')
for i in neighbour_ids:
    print(book_titles[i])

Because you read: Charlie and the Chocolate Factory (Charlie Bucket, #1)
******************************************
Charlie and the Great Glass Elevator (Charlie Bucket, #2)
James and the Giant Peach
The Witches
Fantastic Mr. Fox
The BFG
Charlotte's Web
The Twits
The Magic Finger
Sleeping Beauty (Disney Princess, 5)


## Creating a function to get the recommendations

In [26]:
def get_recommendations(book_id, X, k,
                        metric='cosine'):
    """
    Finds k-nearest neighbours for a given movie id.
    
    Args:
        book_id: id of the movie of interest
        X: user-item utility matrix
        k: number of similar books to retrieve
        metric: distance metric for kNN calculations
    
    Returns:
        list of k similar book ID's
    """
    neighbour_ids = []
    
    book_ind = book_mapper[book_id]
    book_to_assess = X[book_ind]
    kNN = NearestNeighbors(n_neighbors=k, 
                           metric=metric)
    kNN.fit(X)

    neighbour = kNN.kneighbors(book_to_assess, return_distance=False)
    
    # We need to map each neighbour id with the right book_id
    for i in range(1,k):
        n = neighbour.item(i) #
        neighbour_ids.append(book_inv_mapper[n])

    
    
    book_titles = dict(zip(df['book_id'], df['title']))
    book_title = book_titles[book_id]
    
    print(f"Because you read", book_title, "you would love:")
    print('*********************************************************')
    for i in neighbour_ids:
        print(book_titles[i])
    
    #return neighbour_ids

## Getting some recommendations

In [28]:
book_id = 6310

get_recommendations(book_id, X, k=6)

Because you read Charlie and the Chocolate Factory (Charlie Bucket, #1) you would love:
*********************************************************
Charlie and the Great Glass Elevator (Charlie Bucket, #2)
James and the Giant Peach
The Witches
Fantastic Mr. Fox
The BFG


In [29]:
book_id = 19543

get_recommendations(book_id, X, k=6)

Because you read Where the Wild Things Are you would love:
*********************************************************
The Snowy Day
The Very Hungry Caterpillar
Green Eggs and Ham
Goodnight Moon
The Cat in the Hat


In [30]:
book_id = 13194740

get_recommendations(book_id, X, k=6)

Because you read Gnarbunga you would love:
*********************************************************
The Invisible Princess
Ramadan Moon
Cousin Ruth's Tooth
Dark Night
One North Star: A Counting Book


In [33]:
book_id = 275325

get_recommendations(book_id, X, k=6)

Because you read The Butter Battle Book you would love:
*********************************************************
Little Bird's ABC
The Diary of Stuffles B. Snippet
Sunshine & Storm
A Manatee Morning
Raven and River


In [34]:
book_id = 3636

get_recommendations(book_id, X, k=6)

Because you read The Giver (The Giver, #1) you would love:
*********************************************************
Wonder (Wonder #1)
Holes (Holes, #1)
Harry Potter and the Prisoner of Azkaban (Harry Potter, #3)
The Invention of Hugo Cabret
The Bad Beginning (A Series of Unfortunate Events, #1)


In [35]:
book_id = 20549446

get_recommendations(book_id, X, k=6)

Because you read Mix It Up! you would love:
*********************************************************
Telephone
Flashlight
My Grandfather's Coat
Hello, My Name is Ruby
What's New? The Zoo!: A Zippy History of Zoos


In [36]:
book_id = 11233988

get_recommendations(book_id, X, k=6)

Because you read I Want My Hat Back you would love:
*********************************************************
This is Not My Hat
Extra Yarn
The Dark
Mr. Tiger Goes Wild
Sam and Dave Dig a Hole


In [37]:
book_id = 296944
get_recommendations(book_id, X, k=6)

Because you read Clocks and More Clocks you would love:
*********************************************************
Toolbox Twins
Clocks and More Clocks
Polar Star
Dudley and the Toy Keeper's Chest
The Story of Robin Hood


In [38]:
book_id = book_id = 19321

get_recommendations(book_id, X, k=6)

Because you read The Tale of Peter Rabbit you would love:
*********************************************************
The Tale of Squirrel Nutkin
The Tale of Benjamin Bunny
The Tale of Jemima Puddle-Duck
The Tale of Two Bad Mice
The Tale of Tom Kitten


In [39]:
book_id = 28507895

get_recommendations(book_id, X, k=6)

Because you read Ada Twist, Scientist you would love:
*********************************************************
Maybe Something Beautiful: How Art Transformed a Neighborhood
The Noisy Paint Box
The Wheels on the Tuk Tuk
Are We There Yet?
Ideas Are All Around
