In [1]:
# Dependencies and Setup
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

In [2]:
# Read the books and the ratings data
books = pd.read_csv("./Resources/Books.csv", low_memory = False)
ratings = pd.read_csv("./Resources/Ratings.csv", low_memory = False)

#Combine both into a single DataFrame
book_ratings = books.merge(ratings, how = "outer", on = "ISBN")

# Display data for preview
book_ratings.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,User-ID,Book-Rating
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,,
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,8.0,5.0
2,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,11676.0,8.0
3,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,67544.0,8.0
4,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,116866.0,9.0


In [3]:
book_ratings.dtypes

ISBN                    object
Book-Title              object
Book-Author             object
Year-Of-Publication     object
Publisher               object
User-ID                float64
Book-Rating            float64
dtype: object

In [4]:
book_ratings.count()

ISBN                   555195
Book-Title             505366
Book-Author            505365
Year-Of-Publication    505366
Publisher              505364
User-ID                433671
Book-Rating            433671
dtype: int64

In [5]:
book_ratings = book_ratings.dropna(how = "any")

In [6]:
book_ratings.count()

ISBN                   383839
Book-Title             383839
Book-Author            383839
Year-Of-Publication    383839
Publisher              383839
User-ID                383839
Book-Rating            383839
dtype: int64

In [7]:
book_ratings

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,User-ID,Book-Rating
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,8.0,5.0
2,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,11676.0,8.0
3,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,67544.0,8.0
4,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,116866.0,9.0
5,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,123629.0,9.0
...,...,...,...,...,...,...,...
505352,0395264707,Dreamsnake,Vonda N. McIntyre,1978,Houghton Mifflin,275318.0,10.0
505358,1845170423,Cocktail Classics,David Biggs,2004,Connaught,275970.0,7.0
505360,0449906736,Flashpoints: Promise and Peril in a New World,Robin Wright,1993,Ballantine Books,276313.0,5.0
505361,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm),276463.0,7.0


In [8]:
book_ratings["User-ID"] = pd.to_numeric(book_ratings["User-ID"], errors='coerce')

In [9]:
book_ratings["User-ID"] = book_ratings["User-ID"].astype(int)

In [10]:
book_ratings

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,User-ID,Book-Rating
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,8,5.0
2,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,11676,8.0
3,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,67544,8.0
4,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,116866,9.0
5,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,123629,9.0
...,...,...,...,...,...,...,...
505352,0395264707,Dreamsnake,Vonda N. McIntyre,1978,Houghton Mifflin,275318,10.0
505358,1845170423,Cocktail Classics,David Biggs,2004,Connaught,275970,7.0
505360,0449906736,Flashpoints: Promise and Peril in a New World,Robin Wright,1993,Ballantine Books,276313,5.0
505361,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm),276463,7.0


In [11]:
book_ratings = book_ratings.reset_index()

In [12]:
book_ratings.nunique()

index                  383839
ISBN                   149833
Book-Title             135565
Book-Author             62112
Year-Of-Publication       106
Publisher               11574
User-ID                 68091
Book-Rating                10
dtype: int64

In [13]:
# Use Label Encoding from scikit-learn to encode ISBNs which are categorical
from sklearn.preprocessing import LabelEncoder

isbn_encoder = LabelEncoder()
book_ratings["ISBN_encoded"] = isbn_encoder.fit_transform(book_ratings["ISBN"])

# Reverse the encoding - we will use this to reverse the encoding later if needed
# book_ratings['ISBN_decoded'] = isbn_encoder.inverse_transform(book_ratings['ISBN_encoded'])

In [34]:
book_ratings.dtypes

index                    int64
ISBN                    object
Book-Title              object
Book-Author             object
Year-Of-Publication     object
Publisher               object
User-ID                  int32
Book-Rating            float64
ISBN_encoded             int32
dtype: object

In [30]:
display(book_ratings.loc(["ISBN_encoded"] == "074322678X"))


<pandas.core.indexing._LocIndexer at 0x16759b1dfe0>

In [15]:
book_ratings.to_csv("Output/book_ratings.csv", index = False)

In [None]:
n_users = book_ratings["User-ID"].unique().shape[0]
n_items = book_ratings["ISBN_encoded"].unique().shape[0]
n_items = book_ratings["ISBN_encoded"].max()
A = np.zeros((n_users, n_items))

for line in dataset.itertuples():
    A[line[1] - 1, line[2] - 1] = line[3]


In [32]:
non_integer_rows = book_ratings[~book_ratings['ISBN_encoded'].apply(lambda x: isinstance(x, int))]

In [33]:
non_integer_rows

Unnamed: 0,index,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,User-ID,Book-Rating,ISBN_encoded
