# Using KNNBaseline for new user recommendations

In [2]:
# Import libraries

import pandas as pd
from surprise import KNNBaseline
from surprise import Dataset, Reader
from surprise.accuracy import rmse

In [3]:
# Load the sampled book reviews

sample_path = '../data/books_reviews_sample.csv'
sample = pd.read_csv(sample_path)

In [6]:
# Keep only the user, item and rating columns (in that order) for surprise

rating_columns = ['user_id', 'book_id', 'rating']
ratings = sample.loc[:, rating_columns]

In [7]:
# Preview the first five rows of the ratings frame
ratings.head(5)

Unnamed: 0,user_id,book_id,rating
0,84f866eb6dae54d7ac52d45a4c9b4d1f,5,4
1,f1b86bf7c103c46fcb854e1fb711b1ec,5,5
2,832c59bc39f8c9a2bc79459ae302b517,5,5
3,f8bf8e54d6de45b52d2286e733271e34,5,5
4,1f7257e13807ad631d90386772e857fa,5,5


In [8]:
# (number of ratings, number of columns)
ratings.shape

(91567, 3)

In [9]:
# Create reader

rating_bounds = (1, 5)  # lowest and highest rating value
reader = Reader(rating_scale=rating_bounds)

In [10]:
# Load dataset
# NOTE(review): surprise is expected to read the three columns as
# user, item, rating in that order, which is how `ratings` was assembled;
# confirm against the Dataset.load_from_df documentation.

dataset = Dataset.load_from_df(ratings,reader)

In [11]:
# Build the training data and fit the KNNBaseline model

# Use every available rating for training; the fitted model is what the
# recommendation helpers further down query.
trainset = dataset.build_full_trainset()

# Rebuild the same ratings as a testset.
# NOTE: these are the training ratings themselves, so any error computed
# on them is in-sample and not an estimate of generalisation error.
testset = trainset.build_testset()

# Baseline estimates are fitted with SGD
bsl_options = dict(method='sgd', reg=.08, learning_rate=.005, n_epochs=40)

# user_based=False: similarities are computed between items (books), using 'msd'
sim_options = dict(name='msd', min_support=1, user_based=False)

algo_knn = KNNBaseline(k=40, min_k=2, sim_options=sim_options, bsl_options=bsl_options)

# Train on the trainset (dataset)
algo_knn.fit(trainset)

Estimating biases using sgd...
Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x12f2d31f0>

In [12]:
# Predict every rating in testset with the fitted model
# (testset was built from the trainset, so these are in-sample predictions)
preds = algo_knn.test(testset)

In [13]:
# RMSE of those predictions; in-sample, since testset comes from the trainset
rmse(preds)

RMSE: 0.4740


0.47396721637185607

In [14]:
# Map each book title to its book ID

title_to_bookID = sample.set_index('title')['book_id'].to_dict()

In [15]:
def getBookID(booktitle):
    """Looks up the book ID stored for a title
    Args: title of book (converted to str before the lookup)
    Output: book ID from title_to_bookID, or "" when the title has no entry
    """
    return title_to_bookID.get(str(booktitle), "")

In [23]:
# Load the per-book metadata (title, author, publication year)

book_info_columns = ['book_id', 'Title', 'Author', 'Publication Year']
book_info = pd.read_csv(
    '../data/books_info.csv',
    index_col=None,
    usecols=book_info_columns,
    # nullable integer dtype for the year column
    dtype={'Publication Year': 'Int64'},
)

In [24]:
# Column dtypes and non-null counts of the book metadata
book_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3512 entries, 0 to 3511
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   book_id           3512 non-null   int64 
 1   Title             3512 non-null   object
 2   Author            3512 non-null   object
 3   Publication Year  3136 non-null   Int64 
dtypes: Int64(1), int64(1), object(2)
memory usage: 113.3+ KB


In [25]:
# Preview the first two rows of the book metadata
book_info.head(2)

Unnamed: 0,book_id,Title,Author,Publication Year
0,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling,2004
1,330,On Beyond Zebra!,Dr. Seuss,1955


In [26]:
def book_top_neighbors(book_title, k=10):
    
    """
    Returns the most similar books to a given book

    Args:
        book_title: title of the book; resolved to a raw book ID with getBookID
        k: number of neighbors to request from the fitted model (default 10)

    Output:
        DataFrame with one row per neighbor returned by the model, indexed
        from 1 in the order the model returned them, holding the book_info
        columns other than book_id. Neighbors missing from book_info keep a
        row with empty values (left join).

    Raises:
        ValueError: if the title has no book ID, or the book ID is not part
        of the trainset
    """
    
    # convert title to book id; getBookID signals "not found" with ""
    raw_book_id = getBookID(book_title)
    if isinstance(raw_book_id, str) and raw_book_id == "":
        # fail here with the title in the message instead of passing ""
        # on to the trainset lookup
        raise ValueError(f"Unknown book title: {book_title!r}")
    
    # get inner ID of book ID
    target_inner_id = trainset.to_inner_iid(raw_book_id)
        
    # get list of inner IDs of neighbors
    neighbor_inner_ids = algo_knn.get_neighbors(target_inner_id, k=k)
    
    # get list of raw IDs
    neighbor_raw_ids = [trainset.to_raw_iid(inner_id) for inner_id in neighbor_inner_ids]

    # make a dataframe; no explicit index here because the merge below
    # renumbers the rows from 0 anyway
    neighbors_df = pd.DataFrame({'book_id': neighbor_raw_ids})
    merged_df = pd.merge(left=neighbors_df, right=book_info, how='left', on='book_id')
    merged_df = merged_df.drop(columns='book_id')

    # rank neighbors starting at 1 instead of 0
    merged_df.index = merged_df.index + 1
    
    return merged_df

In [27]:
# Example lookups: each call returns a table of the nearest neighbors

book_top_neighbors("The Line")

Unnamed: 0,Title,Author,Publication Year
1,Junie B. Jones and a Little Monkey Business (J...,Barbara Park,
2,"Heaven to Betsy (Betsy-Tacy, #5)",Maud Hart Lovelace,1945.0
3,Mercy Watson to the Rescue (Mercy Watson #1),Kate DiCamillo,2005.0
4,A Long Way from Chicago (A Long Way from Chica...,Richard Peck,2004.0
5,"The Meanest Doll in the World (Doll People, #2)",Ann M. Martin,2005.0
6,When Hitler Stole Pink Rabbit (Out of the Hitl...,Judith Kerr,
7,The Five Chinese Brothers,Claire Huchet Bishop,1996.0
8,"Ivy and Bean (Ivy and Bean, #1)",Annie Barrows,2007.0
9,Tar Beach,Faith Ringgold,1996.0
10,Flotsam,David Wiesner,2006.0


In [28]:
# Example: neighbors of "Mr. Wuffles!"
book_top_neighbors("Mr. Wuffles!")

Unnamed: 0,Title,Author,Publication Year
1,The Complete Anne of Green Gables Boxed Set (A...,L.M. Montgomery,1998.0
2,Teach Your Child to Read in 100 Easy Lessons,Siegfried Engelmann,1986.0
3,"On the Banks of Plum Creek (Little House, #4)",Laura Ingalls Wilder,2007.0
4,"By the Shores of Silver Lake (Little House, #5)",Laura Ingalls Wilder,2007.0
5,"Little Town on the Prairie (Little House, #7)",Laura Ingalls Wilder,2007.0
6,The Eleventh Hour,Graeme Base,1993.0
7,"Jane on Her Own (Catwings, #4)",Ursula K. Le Guin,2003.0
8,"The Voyage of the ""Dawn Treader"" (The Chronicl...",C.S. Lewis,1970.0
9,"Ronia, the Robber's Daughter",Astrid Lindgren,1985.0
10,The Care & Keeping of You: The Body Book for G...,Valorie Schaefer,


In [45]:
# Example: neighbors of "The Eleventh Hour"
book_top_neighbors("The Eleventh Hour")

Unnamed: 0,Title,Author,Publication Year
1,"Emily of New Moon (Emily, #1)",L.M. Montgomery,1983.0
2,The Complete Anne of Green Gables Boxed Set (A...,L.M. Montgomery,1998.0
3,The Story of Holly and Ivy,Rumer Godden,2006.0
4,"On the Banks of Plum Creek (Little House, #4)",Laura Ingalls Wilder,2007.0
5,"By the Shores of Silver Lake (Little House, #5)",Laura Ingalls Wilder,2007.0
6,"Little Town on the Prairie (Little House, #7)",Laura Ingalls Wilder,2007.0
7,Midnight for Charlie Bone (The Children of the...,Jenny Nimmo,
8,"Homecoming (Tillerman Cycle, #1)",Cynthia Voigt,2002.0
9,Life Doesn't Frighten Me,Maya Angelou,1996.0
10,Stranger in the Woods: A Photographic Fantasy,Carl R. Sams II,2000.0
