In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score
import requests
import zipfile
import os
import io

### Data Load

In [2]:
data_path = "./data/book_crossing/"

if not all(os.path.exists(data_path+file) for file in ["BX-Books.csv", "BX-Users.csv", "BX-Book-Ratings.csv"]):
    uni-freiburg.de/~czieg
    url = "http://www2.informatik.ler/BX/BX_CSV-Dump.zip"
    response = requests.get(url)
    zip_file = zipfile.ZipFile(io.BytesIO(response.content))
    zip_file.extractall(data_path)
else:
    print("Data Available. Just Load It!")


books = pd.read_csv(data_path+"BX-Books.csv", sep=";", error_bad_lines=False, warn_bad_lines=False, encoding="latin-1")
users = pd.read_csv(data_path+"BX-Users.csv", sep=";", error_bad_lines=False, warn_bad_lines=False, encoding="latin-1")
ratings = pd.read_csv(data_path+"BX-Book-Ratings.csv", sep=";", error_bad_lines=False, warn_bad_lines=False, encoding="latin-1")


Data Available. Just Load It!




  exec(code_obj, self.user_global_ns, self.user_ns)


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
users.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [4]:
books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [5]:
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [6]:
users['User-ID'].nunique()

278858

In [7]:
ratings['User-ID'].nunique()

105283

In [8]:
ratings['ISBN'].nunique()

340556

In [9]:
books.ISBN.nunique()

271360

### Filtering

In [10]:
# Filter books with at least min_book_ratings ratings
min_book_ratings = 10
book_rating_counts = ratings['ISBN'].value_counts()
filtered_books = book_rating_counts[book_rating_counts>=min_book_ratings].index
ratings = ratings[ratings['ISBN'].isin(filtered_books)]

In [11]:
book_rating_counts.head()

0971880107    2502
0316666343    1295
0385504209     883
0060928336     732
0312195516     723
Name: ISBN, dtype: int64

In [12]:
ratings.ISBN.nunique()

18319

In [13]:
# Filter users who rated at least min_user_ratings books
min_user_ratings = 5
user_rating_counts = ratings['User-ID'].value_counts()
filtered_users = user_rating_counts[user_rating_counts >=min_user_ratings].index
ratings = ratings[ratings['User-ID'].isin(filtered_users)]

In [14]:
user_rating_counts.head()

11676     5399
35859     2445
153662    2098
76352     1934
198711    1607
Name: User-ID, dtype: int64

In [15]:
ratings['User-ID'].nunique()

13808

In [16]:
# create user-item matrix
user_item_matrix = ratings.pivot_table(index = "User-ID", columns="ISBN", values="Book-Rating").fillna(0)

In [17]:
user_item_matrix

ISBN,000000000,0002005018,0002251760,0002255081,0002257203,0002259001,0002259834,0002558122,0006172768,0006374921,...,9724119378,9726101794,9726106141,9726116902,9727591965,9727722458,9770390107900,9871138016,9871138148,B00009EF82
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
114,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
165,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
242,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278723,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
278771,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
278843,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
278851,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
user_item_matrix.shape

(13808, 18318)

In [19]:
ratings.shape

(456182, 3)

### Split Dataset

In [20]:
user_indices = np.arange(user_item_matrix.shape[0])
train_user_indices, test_user_indices = train_test_split(user_indices, test_size=0.3, random_state=42)

### Recommendation

#### Random Recommendation

In [21]:
def random_recommendation(ratings, n=10):
    unique_books = ratings['ISBN'].unique()
    random_books = np.random.choice(unique_books, size=n, replace=True)
    return random_books



In [22]:
random_books = random_recommendation(ratings, n=10)
print(random_books)

['0425175367' '0449910830' '3499230933' '0425098680' '0380710722'
 '0316143464' '0553272616' '0515117153' '0553104489' '0380791021']


#### Popularity Recommendation

In [23]:
def popularity_recommendation(ratings, n=10):
    popular_books = ratings.groupby('ISBN')['Book-Rating'].count().sort_values(ascending=False).head(n).index
    return popular_books

In [24]:
popular_books = popularity_recommendation(ratings,n=10)
print(popular_books)


for i in popular_books:
    print(books[books['ISBN']==i]['Book-Title'].values)

Index(['0971880107', '0316666343', '0385504209', '0060928336', '0312195516',
       '0142001740', '0679781587', '044023722X', '0671027360', '067976402X'],
      dtype='object', name='ISBN')
['Wild Animus']
['The Lovely Bones: A Novel']
['The Da Vinci Code']
['Divine Secrets of the Ya-Ya Sisterhood: A Novel']
['The Red Tent (Bestselling Backlist)']
['The Secret Life of Bees']
[]
['A Painted House']
['Angels &amp; Demons']
['Snow Falling on Cedars']


<hr>

In [25]:
def recoomend_books(user_index, strategy, k=10):
    
    if strategy == "random":
        top_k_books = random_recommendation(ratings, n=k)
    elif strategy == "popularity":
        top_k_books = popularity_recommendation(ratings, n=k)
    else:
        raise ValueError("Invalid recommendation strategy")
    
    return top_k_books


In [26]:
def evaluate_model(strategy, k=10):
    true_positive = 0
    false_positive = 0
    false_negative = 0
    
    for user_index in test_user_indices:
        true_books = set(user_item_matrix.iloc[user_index][user_item_matrix.iloc[user_index] > 0].index)
        # user_item_matrix.iloc[user_index] > 0 : 평점을 준 책들 
        recommended_books = set(recoomend_books(user_index, strategy, k))
        
        tp = len(true_books.intersection(recommended_books))
        fp = len(recommended_books - true_books)
        fn = len(true_books - recommended_books)
        
        true_positive += tp
        false_positive += fp
        false_negative += fn
    
    precision = true_positive / (true_positive + false_positive)
    recall = true_positive / (true_positive + false_negative)
    
    return precision, recall

In [27]:
random_precision, random_recall = evaluate_model(strategy="random")

In [28]:
print(random_precision, random_recall)

0.0005793882625594476 0.0005186385737439222


In [29]:
popularity_precision, popularity_recall = evaluate_model(strategy="popularity")

In [30]:
print(popularity_precision, popularity_recall)

0.018802799903451604 0.01683414370610481
