In [22]:
import pandas as pd
import numpy as np
import json

### Exploration of the dataset

In [142]:
# The file is read with header=None, so its first line is treated as data
# and the column names are supplied explicitly.
df = pd.read_csv(
    "data_books.csv",
    header=None,
    names=["ReviewerID", "Book", "Rating"],
)

In [143]:
# Preview the first five rows of the ratings frame.
df.head(5)

Unnamed: 0,ReviewerID,Book,Rating
0,276726,Rites of Passage,5
1,276729,Help!: Level 1,3
2,276729,The Amsterdam Connection : Level 4 (Cambridge ...,6
3,276744,A Painted House,7
4,276747,Little Altars Everywhere,9


In [146]:
# Number of ratings per book title, most-rated first.
# The ratings frame is `df`; the Series method replaces the deprecated
# top-level `pd.value_counts`.
top_books = df.Book.value_counts()
top_books.head()

The Lovely Bones: A Novel     707
Wild Animus                   581
The Da Vinci Code             494
The Secret Life of Bees       406
The Nanny Diaries: A Novel    393
dtype: int64

In [145]:
# Number of ratings per reviewer, most active first.
# The Series method replaces the deprecated top-level `pd.value_counts`.
top_reviewers = df.ReviewerID.value_counts()
top_reviewers.head()

11676     6943
98391     5691
189835    1899
153662    1845
23902     1180
dtype: int64

### Testing with two books

In [33]:
# The pair of titles used for the two-book test.
book_1 = "Harry Potter and the Chamber of Secrets (Book 2)"
book_2 = "Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))"

### Calculating Correlation

In [53]:
# The correlation between two books is computed from the ratings that their
# common reviewers gave to both books.
# `scipy.stats.stats` is a deprecated private namespace; import from the
# public `scipy.stats` module instead.
from scipy.stats import pearsonr

In [151]:
def get_corr(book_1, book_2, df):
    """Return the Pearson correlation between the ratings of two books.

    Only reviewers who rated both books contribute. If a reviewer rated the
    same book more than once, only their first rating (in `df` row order)
    is used.

    Parameters
    ----------
    book_1, book_2 : values compared against the ``Book`` column.
    df : DataFrame with ``ReviewerID``, ``Book`` and ``Rating`` columns.

    Returns
    -------
    float
        Pearson's r over the paired ratings, or NaN when fewer than two
        reviewers rated both books (the coefficient is undefined there).
    """
    # One rating per reviewer for each book (first occurrence wins).
    ratings_1 = (
        df.loc[df.Book == book_1, ['ReviewerID', 'Rating']]
        .drop_duplicates(subset='ReviewerID')
    )
    ratings_2 = (
        df.loc[df.Book == book_2, ['ReviewerID', 'Rating']]
        .drop_duplicates(subset='ReviewerID')
    )

    # An inner join keeps only the common reviewers and pairs their two
    # ratings explicitly, instead of relying on two separately sorted
    # frames happening to line up.
    paired = ratings_1.merge(ratings_2, on='ReviewerID', suffixes=('_1', '_2'))

    # pearsonr needs at least two observations.
    if len(paired) < 2:
        return float('nan')

    return pearsonr(paired.Rating_1, paired.Rating_2)[0]

### Many books

In [154]:
# All books ranked by number of ratings, most-rated first.
# The ratings frame is `df`, and `sort_values` replaces the removed
# `DataFrame.sort`.
top_books_all = (
    df.groupby("Book")
    .size()
    .reset_index(name="count")
    .sort_values("count", ascending=False)
)

In [174]:
# Titles in the first 20 rows of the ranked table.
top_20 = top_books_all["Book"].iloc[:20]

In [260]:
# One [book, other book, correlation] row for every ordered pair of distinct
# titles in top_20.
corr = [
    [first, second, get_corr(first, second, df)]
    for first in top_20
    for second in top_20
    if first != second
]

In [261]:
# Tabulate the pairwise correlations, strongest first.
# `sort_values` replaces the removed `DataFrame.sort`.
colNames = ["Book1", "Book2", "Correlation"]
correlations = pd.DataFrame(corr, columns=colNames).sort_values('Correlation', ascending=False)

In [262]:
# Show the five rows at the top of the sorted correlation table.
correlations.head(5)

Unnamed: 0,Book1,Book2,Correlation
336,Harry Potter and the Prisoner of Azkaban (Book 3),The Summons,0.963087
263,The Summons,Harry Potter and the Prisoner of Azkaban (Book 3),0.963087
370,The Pilot's Wife : A Novel,Harry Potter and the Chamber of Secrets (Book 2),0.874818
189,Harry Potter and the Chamber of Secrets (Book 2),The Pilot's Wife : A Novel,0.874818
237,Harry Potter and the Sorcerer's Stone (Harry P...,Harry Potter and the Chamber of Secrets (Book 2),0.869647


### Build Recommendation

In [294]:
def getRec(book, correlations, books):
    """Rank candidate books by their correlation with `book`.

    Parameters
    ----------
    book : the title to recommend for; matched against ``Book1``.
    correlations : DataFrame with ``Book1``, ``Book2`` and ``Correlation``
        columns.
    books : iterable of candidate titles; `book` itself is skipped.

    Returns
    -------
    list of [title, correlation] lists, highest correlation first.
    Candidates whose correlation is NaN, or that have no row for
    (`book`, title) in `correlations`, get NaN and are placed at the end in
    their original order. If a pair appears more than once, the last row
    is used.
    """
    # Filter once for the query book instead of re-scanning the whole frame
    # for every candidate.
    for_book = correlations[correlations.Book1 == book]
    lookup = dict(zip(for_book.Book2, for_book.Correlation))

    nan = float('nan')
    recs = [[b, float(lookup.get(b, nan))] for b in books if b != book]

    # NaN cannot be ordered (NaN != NaN), so keep it out of the sort.
    rated = [rec for rec in recs if rec[1] == rec[1]]
    unrated = [rec for rec in recs if rec[1] != rec[1]]
    return sorted(rated, key=lambda rec: rec[1], reverse=True) + unrated

In [295]:
# Recommendations for one title, drawn from the top_20 candidates.
getRec(
    book="Harry Potter and the Chamber of Secrets (Book 2)",
    correlations=correlations,
    books=top_20,
)

[["The Pilot's Wife : A Novel", 0.8748177652797063],
 ["Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))",
  0.8696473664533657],
 ['The Summons', 0.862441481918926],
 ['The Notebook', 0.7938566201357354],
 ['Harry Potter and the Prisoner of Azkaban (Book 3)', 0.614027793363417],
 ['Divine Secrets of the Ya-Ya Sisterhood: A Novel', 0.5446056340098738],
 ['Snow Falling on Cedars', 0.39270020452103155],
 ['The Lovely Bones: A Novel', 0.3777910514099126],
 ['A Painted House', 0.3547874375934496],
 ['Wild Animus', 0.25045121865651904],
 ['The Secret Life of Bees', 0.19561519910898786],
 ['The Red Tent (Bestselling Backlist)', 0.1401379636678026],
 ['Girl with a Pearl Earring', 0.12570343830542532],
 ['The Da Vinci Code', 0.10891980142278976],
 ['The Nanny Diaries: A Novel', 0.003192949577478907],
 ["Where the Heart Is (Oprah's Book Club (Paperback))", -0.0448227403937221],
 ["Bridget Jones's Diary", -0.07572126413865359],
 ['Angels &amp', -0.1246190951022249],
 ['Life of Pi