In [1]:
!pip install happybase



In [2]:
import happybase
import pandas as pd

In [3]:
# Connect to the HBase server on localhost:9090 through happybase.
connection = happybase.Connection(host='localhost', port=9090)

# One table handle per dataset, in the order: books, users, ratings.
books, users, ratings = (
    connection.table(table_name) for table_name in ('books', 'users', 'ratings')
)

In [4]:
def _scan_to_records(table, family_prefix='info:'):
    """Scan a happybase table and return its rows as a list of plain dicts.

    Parameters
    ----------
    table
        Object exposing ``scan()`` that yields ``(row_key, columns)`` pairs,
        where the key and the column names/values are ``bytes``.
    family_prefix : str
        Column-family prefix removed from the start of each column name.

    Returns
    -------
    list of dict
        One dict per row: the decoded row key under ``'row_key'`` plus one
        entry per column, all decoded as UTF-8.
    """
    records = []
    for key, data in table.scan():
        record = {'row_key': key.decode('utf-8')}
        for col, value in data.items():
            name = col.decode('utf-8')
            # Strip the family only when it is a real prefix; a blanket
            # str.replace would also remove the text from inside a qualifier.
            if name.startswith(family_prefix):
                name = name[len(family_prefix):]
            record[name] = value.decode('utf-8')
        records.append(record)
    return records


# Pull every table fully into memory; the same decoding applies to all three.
books_data = _scan_to_records(books)
users_data = _scan_to_records(users)
ratings_data = _scan_to_records(ratings)

In [5]:
# Inspect the third decoded book record to check the field names and values.
books_data[2]

{'row_key': '10',
 'Book-Author': 'David Adams Richards',
 'Book-Title': 'Nights Below Station Street',
 'ISBN': '0771074670',
 'Image-URL-L': 'http://images.amazon.com/images/P/0771074670.01.LZZZZZZZ.jpg',
 'Image-URL-M': 'http://images.amazon.com/images/P/0771074670.01.MZZZZZZZ.jpg',
 'Image-URL-S': 'http://images.amazon.com/images/P/0771074670.01.THUMBZZZ.jpg',
 'Publisher': 'Emblem Editions',
 'Year-Of-Publication': '1988'}

In [6]:
# Build one DataFrame per table from the lists of row dicts collected above.
df_books, df_users, df_ratings = (
    pd.DataFrame(rows) for rows in (books_data, users_data, ratings_data)
)

In [8]:
# Preview the first rows of the books frame.
df_books.head()

Unnamed: 0,row_key,Book-Author,Book-Title,ISBN,Image-URL-L,Image-URL-M,Image-URL-S,Publisher,Year-Of-Publication
0,0,Mark P. O. Morford,Classical Mythology,195153448,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,Oxford University Press,2002
1,1,Richard Bruce Wright,Clara Callan,2005018,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,HarperFlamingo Canada,2001
2,10,David Adams Richards,Nights Below Station Street,771074670,http://images.amazon.com/images/P/0771074670.0...,http://images.amazon.com/images/P/0771074670.0...,http://images.amazon.com/images/P/0771074670.0...,Emblem Editions,1988
3,100,William Abrahams,"Prize Stories, 1987: The O'Henry Awards",385235941,http://images.amazon.com/images/P/0385235941.0...,http://images.amazon.com/images/P/0385235941.0...,http://images.amazon.com/images/P/0385235941.0...,Doubleday Books,1987
4,1000,Meg Cabot,All-American Girl,64472779,http://images.amazon.com/images/P/0064472779.0...,http://images.amazon.com/images/P/0064472779.0...,http://images.amazon.com/images/P/0064472779.0...,HarperTrophy,2003


In [None]:
# Every value read from HBase was decoded to a string; cast the rating column
# to int64 so it can be averaged later.
df_ratings = df_ratings.astype({'Book-Rating': 'int64'})

In [None]:
# Join every rating to its book; the inner join drops ratings whose ISBN is
# not present in the books frame.
merged_data = pd.merge(df_ratings, df_books, on='ISBN', how='inner')

# Per book: number of ratings (count of User-ID) and mean rating.
book_stats = (
    merged_data
    .groupby('ISBN')
    .agg({'User-ID': 'count', 'Book-Rating': 'mean'})
    .reset_index()
)

# Attach the statistics to every book (left join keeps unrated books) and give
# the statistic columns descriptive names.
# Only the two statistic columns are filled with 0: a blanket fillna(0) would
# also overwrite missing book metadata (author, publisher, ...) with 0.
final_data = (
    pd.merge(df_books, book_stats, on='ISBN', how='left')
    .rename(columns={'User-ID': 'num_users', 'Book-Rating': 'average_rating'})
    .fillna({'num_users': 0, 'average_rating': 0})
)

# Display the first few rows of the final DataFrame
final_data.head()

In [None]:
# Show the second row (position 1) of final_data as a Series.
final_data.iloc[1]

In [None]:
final_data_sorted = (
    final_data
    # Most-rated books first.
    .sort_values(by='num_users', ascending=False)
    # Drop rows whose rating count is 0 or whose average rating is 0.
    .loc[lambda frame: ~((frame['num_users'] == 0) | (frame['average_rating'] == 0))]
)

# Display the sorted DataFrame from position 149830 onward.
# NOTE(review): the offset is hard-coded, presumably to show the tail of the
# frame; confirm, and consider .tail() instead.
final_data_sorted.iloc[149830:]

In [None]:
# C: mean of the per-book average ratings over the cleaned frame.
C = final_data_sorted.loc[:, 'average_rating'].mean()
# m: 90th percentile of the per-book rating counts.
m = final_data_sorted.loc[:, 'num_users'].quantile(q=0.90)
print(C, m)

In [None]:
# Keep only the books whose rating count reaches the threshold m, as an
# independent copy of those rows.
q_books = final_data_sorted[final_data_sorted['num_users'] >= m].copy()
q_books.shape

In [None]:
def weighted_rating(x, m=m, C=C):
    """Return the weighted rating of a book (IMDB-style formula).

    ``x`` must support lookup of ``'num_users'`` (rating count, v) and
    ``'average_rating'`` (mean rating, R). The result is
    ``v/(v+m) * R + m/(m+v) * C``.

    The defaults for ``m`` and ``C`` are taken from the notebook globals at the
    moment this cell is executed; re-run this cell after changing them.
    """
    votes = x['num_users']
    mean_rating = x['average_rating']
    total = votes + m
    own_part = (votes / total) * mean_rating
    prior_part = (m / total) * C
    return own_part + prior_part

In [None]:
# Add a 'score' column holding each book's weighted rating.
# weighted_rating() only performs column lookups and arithmetic, so it is
# called once on the whole frame (vectorised) instead of once per row through
# DataFrame.apply(axis=1); the resulting values are the same.
q_books['score'] = weighted_rating(q_books)

In [None]:
# Order the qualified books from highest to lowest score.
q_books = q_books.sort_values(by='score', ascending=False)

# Show the 20 best-scoring books with the columns relevant to the ranking.
summary_columns = ['Book-Title', 'num_users', 'average_rating', 'score']
q_books.head(20)[summary_columns]


In [None]:
# Write the ranked books to a pipe-delimited CSV file, without the index column.
q_books.to_csv("recomender_result.csv", sep="|", index=False)