In [93]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix

In [94]:
path = 'merged_df.csv'
df = pd.read_csv(path)
user_book_mat = df.pivot_table(columns='isbn', index='user_id', values='rating')
user_book_mat.fillna(0, inplace=True)

In [95]:
from sklearn.neighbors import NearestNeighbors
model = NearestNeighbors(algorithm='auto')
# inside cse_matrix so that it doesn't calculate 0's as a rating
user_book_csr = csr_matrix(user_book_mat)
model.fit(user_book_csr)


In [96]:
distance, suggestions = model.kneighbors(user_book_mat.iloc[0,:].values.reshape(1,-1), n_neighbors=10 )
print(distance)
print(suggestions)
print("Similar users: ")
for i in range(len(suggestions)):
    print(user_book_mat.index[suggestions[i]])


[[ 0.         26.19160171 28.08914381 29.52964612 29.76575213 29.79932885
  29.86636905 30.29851482 30.33150178 30.49590136]]
[[   0 1061  733 1196 1038    7 1373 1180  432  662]]
Similar users: 
Index([254, 183088, 130554, 208141, 179978, 2033, 239423, 205735, 79186,
       118533],
      dtype='int64', name='user_id')


In [97]:
def pearson_correlation_coefficients(user_book_pivot_table, user_id):
    user_rating = user_book_mat.loc[user_id]
    user_rating = user_rating - user_rating.mean()

    correlations = []
    for other_user_id, other_user_ratings in user_book_pivot_table.iterrows():
        if other_user_id != user_id:
            # getting the  isbns of common ratings 
            common_ratings = user_rating.notna() & other_user_ratings.notna()
            # print(common_ratings)
            if len(common_ratings) > 0:
                other_user_ratings = other_user_ratings - other_user_ratings[common_ratings].mean()
                numerator = sum(user_rating[common_ratings] * other_user_ratings[common_ratings] )
                denominator = ((sum((user_rating)**2))**0.5) * ((sum((other_user_ratings)**2))**0.5)
                correlation = numerator / denominator if denominator != 0 else 0

                correlations.append([user_id, other_user_id, correlation])

    columns = ['input_user_id', 'other_user_id', 'correlation']
    correlation_df = pd.DataFrame(correlations, columns=columns)

    return correlation_df
print(pearson_correlation_coefficients(user_book_mat, 3363))

      input_user_id  other_user_id  correlation
0              3363            254    -0.010136
1              3363            507    -0.004776
2              3363            882    -0.008582
3              3363           1424    -0.007868
4              3363           1435     0.161068
...             ...            ...          ...
1610           3363         277478     0.000000
1611           3363         277639    -0.007317
1612           3363         278137    -0.005573
1613           3363         278188    -0.006695
1614           3363         278418    -0.006798

[1615 rows x 3 columns]


In [98]:
df.columns

Index(['Unnamed: 0', 'user_id', 'location', 'age', 'isbn', 'rating',
       'book_title', 'book_author', 'year_of_publication', 'publisher',
       'Language', 'Category', 'num_rating'],
      dtype='object')

In [99]:
df.drop('Unnamed: 0', axis=1, inplace=True)

In [100]:
ratings = df[['user_id', 'isbn', 'rating']]

In [101]:
def get_recomendable_books(closest_neighbor_ratings, user_id):
    closest_neighbor_ratings['weighted_rating'] = closest_neighbor_ratings['correlation'] * closest_neighbor_ratings['rating']
    weighted_avg_rating = (
        closest_neighbor_ratings.groupby('isbn')['weighted_rating'].sum() /
        closest_neighbor_ratings.groupby('isbn')['correlation'].sum()
    ).round(2)
    target_user_ratings = df[df['user_id'] == user_id]
    readed_books = target_user_ratings['isbn'].unique()
    books_recommendable = weighted_avg_rating[~weighted_avg_rating.index.isin(readed_books)].to_frame()
    # converting series to dataframe
    books_recommendable = books_recommendable.reset_index()
    books_recommendable.columns = ['isbn', 'weighted_avg_rating']
    return books_recommendable

In [102]:
def recommend_10_books(correlations, n,  user_id):
    closest_neighbors = correlations.nlargest(n*3, 'correlation')
    closest_neighbor_ratings = pd.merge(closest_neighbors, df, left_on='other_user_id', right_on='user_id')
    
    books_recommendable = get_recomendable_books(closest_neighbor_ratings, user_id) 
    # print(books_recommendable.head())
    
    top_10_recommendations = pd.merge(books_recommendable.nlargest(n, 'weighted_avg_rating'), df.drop_duplicates(subset='isbn'), on='isbn')
    
    return top_10_recommendations

# 10 Recommendations for User

In [103]:
user_id = int(input("Enter the user_id: "))
correlations = pearson_correlation_coefficients(user_book_mat, user_id)
top_10_recommendations = recommend_10_books(correlations, 10, user_id)
top_10_recommendations[['isbn', 'book_title','book_author', 'weighted_avg_rating']].head(15)

Unnamed: 0,isbn,book_title,book_author,weighted_avg_rating
0,0140042598,On the Road,Jack Kerouac,10.0
1,0380731851,Mystic River,Dennis Lehane,10.0
2,043935806X,Harry Potter and the Order of the Phoenix (Boo...,J. K. Rowling,10.0
3,0451526341,Animal Farm,George Orwell,10.0
4,0553582755,One Door Away from Heaven,Dean R. Koontz,10.0
5,0671042262,The Blue Nowhere : A Novel,Jeffery Deaver,10.0
6,0312422156,Middlesex: A Novel,Jeffrey Eugenides,9.0
7,0312995423,Digital Fortress : A Thriller,Dan Brown,9.0
8,0316788228,The Pilot's Wife,Anita Shreve,9.0
9,0380727501,Notes from a Small Island,Bill Bryson,9.0


# Rating Prediction From UserID and ISBN

In [104]:
def filter_users_by_isbn(target_isbn, whole_df):
    target_ratings = whole_df[(whole_df['isbn'] == target_isbn) & (whole_df['rating'] > 0)]

    if target_ratings.empty:
        return whole_df

    unique_user_ids = target_ratings['user_id'].unique()
    filtered_df = whole_df[whole_df['user_id'].isin(unique_user_ids)]
    return filtered_df

In [187]:
def predict_rating(user_id, isbn, filt_user_book_mat, correlations_df, n):
    top_correlations = correlations_df[correlations_df['input_user_id'] == user_id].nlargest(n, 'correlation')
    print(top_correlations)
    # length = 
    numerator = sum((top_correlations['correlation']*(filt_user_book_mat.loc[top_correlations['other_user_id'], isbn]).values))
    denominator = sum(top_correlations['correlation'])
    print((filt_user_book_mat.loc[top_correlations['other_user_id'], isbn]))
    if denominator == 0:
        return None
    print(numerator, denominator)
    predicted_rating = numerator / denominator
    return predicted_rating

In [191]:
user_id = 278418
isbn = '0060392452'
n = 5
filtered_df = filter_users_by_isbn(isbn, df)
if filtered_df.equals(df):
    print("There are no users who has rated this book.")
    
else:
    filtered_df.head()
    filt_user_book_mat = filtered_df.pivot_table(columns='isbn', index='user_id', values='rating')
    filt_user_book_mat.fillna(0, inplace=True)
correlations = pearson_correlation_coefficients(filt_user_book_mat, user_id)
predicted_rating = predict_rating(user_id, isbn, filt_user_book_mat, correlations, n)
if predicted_rating is not None: 
    print("Predicted Rating of user",user_id,"for",isbn,"is",predicted_rating)
else:
    print(predicted_rating)
    

    input_user_id  other_user_id  correlation
1          278418           6251     0.088112
47         278418         245410     0.060076
14         278418          75591     0.044103
11         278418          60244     0.036891
46         278418         234828     0.022668
user_id
6251      10.0
245410     8.0
75591     10.0
60244      8.0
234828    10.0
Name: 0060392452, dtype: float64
2.3245643321345293 0.25184970454787514
Predicted Rating of user 278418 for 0060392452 is 9.229966484604882


In [192]:
filtered_df.columns

Index(['user_id', 'location', 'age', 'isbn', 'rating', 'book_title',
       'book_author', 'year_of_publication', 'publisher', 'Language',
       'Category', 'num_rating'],
      dtype='object')