In [0]:
# Google Cloud setup: authenticate this Colab session and copy the input CSVs
# from the project's Cloud Storage bucket onto the local disk.

project_id = 'cs229-project-258114'
bucket_name = 'goodreads_reviews'

from google.colab import auth
auth.authenticate_user()

# Point the gcloud CLI at project_id for the commands below.
!gcloud config set project {project_id}

# Download the input files from the Cloud Storage bucket into /tmp.
# Note that finalratings.csv is stored locally under the name ratings.csv.
!gsutil cp gs://{bucket_name}/finalbooks.csv /tmp/finalbooks.csv
!gsutil cp gs://{bucket_name}/finalratings.csv /tmp/ratings.csv 
!gsutil cp gs://{bucket_name}/train.csv /tmp/train.csv 
!gsutil cp gs://{bucket_name}/test.csv /tmp/test.csv 
#!gsutil cp gs://{bucket_name}/Bayes.csv /tmp/Bayes.csv 
## !gsutil cp gs://{bucket_name}/Popularity/goodreads-best-books.zip /tmp/best.zip

# basics
import pandas as pd
import numpy as np
import collections

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# tools
import scipy
import math
import random
import sklearn
import string
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

import nltk
nltk.download('stopwords')

# Read the CSVs from the absolute /tmp paths that the gsutil commands above
# wrote to. The previous '../tmp/...' form only resolved to /tmp when the
# kernel's working directory happened to sit exactly one level below '/'.
finalbooks = pd.read_csv('/tmp/finalbooks.csv')
ratings = pd.read_csv('/tmp/ratings.csv')
train = pd.read_csv('/tmp/train.csv')
test = pd.read_csv('/tmp/test.csv')
#bayes = pd.read_csv('/tmp/Bayes.csv')

In [0]:
## NDCG & RMSE

def dcg_k(r, k):
    """Discounted Cumulative Gain (DCG) of the first ``k`` results.

    Args:
        r: True ratings in predicted rank order (1st element is the top
            recommendation). Any array-like of numbers.
        k: Number of results to consider; shorter inputs are used in full.
    Returns:
        DCG as a float: the sum of ``2**r_i / log2(i + 1)`` over the ranks
        ``i = 1..k``. An empty input yields 0.0.
    """
    # np.asfarray was removed in NumPy 2.0; asarray with an explicit float
    # dtype is the drop-in replacement and works on older versions too.
    r = np.asarray(r, dtype=float)[:k]
    # NOTE(review): the gain is 2**r rather than the more common 2**r - 1, so
    # a rating of 0 still contributes. Kept unchanged so existing scores stay
    # comparable -- confirm this is intended.
    return np.sum(2**r / np.log2(np.arange(2, r.size + 2)))



def ndcg_k(r, k):
    """Normalized Discounted Cumulative Gain (NDCG).

    The DCG of the given ordering is divided by the DCG of the same ratings
    sorted from highest to lowest (the best achievable ordering).

    Args:
        r: True Ratings in Predicted Rank Order (1st element is top recommendation)
        k: Number of results to consider
    Returns:
        NDCG, or 0. when the ideal DCG is zero (e.g. no ratings at all).
    """
    ideal_order = sorted(r, reverse=True)
    best_dcg = dcg_k(ideal_order, k)
    if best_dcg:
        return dcg_k(r, k) / best_dcg
    return 0.

def mean_ndcg(rs):
    """Average NDCG over all users, each evaluated on their full rating list.

    Args:
        rs: Iterable with one entry per user: that user's true ratings in
            predicted rank order.
    Returns:
        Mean of the per-user NDCG values.
    """
    per_user_scores = []
    for user_ratings in rs:
        per_user_scores.append(ndcg_k(user_ratings, len(user_ratings)))
    return np.mean(per_user_scores)

def rmse(y, h):
    """Root mean squared error between true and predicted values.

    Args:
        y: real y (array-like of numbers: list, NumPy array, pandas Series)
        h: predicted y (array-like with the same length as y)
    Returns:
        RMSE as a float.
    Raises:
        ValueError: if the inputs are empty.
    """
    # Convert to float arrays so plain lists are accepted and the reduction
    # runs inside NumPy instead of a Python-level sum(). Values are compared
    # position by position.
    err = np.asarray(y, dtype=float) - np.asarray(h, dtype=float)
    if err.size == 0:
        raise ValueError("rmse requires at least one observation")
    return np.sqrt(np.mean(err ** 2))

In [0]:
## DEFINING THE TAIL
# Count ratings per book, most-rated first.
tailcomp = (
    ratings
    .groupby(by='newbookid', as_index=False)
    .agg({'rating': pd.Series.count})
    .sort_values(by='rating', ascending=False)
)
tot = tailcomp['rating'].sum()

# Cumulative share of all ratings, accumulated from the most-rated book down.
tailcomp['popshare'] = (tailcomp['rating'] / tot).cumsum()

# Books reached before the cumulative share hits 95% form the "Head";
# everything from that point on is the "Tail".
tailcomp['category'] = np.where(tailcomp['popshare'] < 0.95, 'Head', "Tail")

tail = tailcomp[tailcomp['popshare'] >= 0.95]
tail

In [0]:
def get_words(message):
    """Split a text into lowercase words.

    The text is split on single space characters only (consecutive spaces
    therefore yield empty strings) and every piece is lowercased.

    Args:
        message: A string of text

    Returns:
       The list of lowercased pieces, in their original order.
    """
    return [piece.lower() for piece in message.split(" ")]



def create_dictionary(messages, min_count=25):
    """Create a dictionary mapping words to integer indices.

    A word is kept when it appears in at least ``min_count`` different
    messages, is longer than one character and is not an English stopword.
    Indices are assigned 0, 1, 2, ... in order of first appearance, so the
    mapping is the same on every run.

    Args:
        messages: An iterable of strings
        min_count: Minimum number of messages a word must appear in
            (defaults to 25).

    Returns:
        A python dict mapping words to integers.
    """
    word_counts = collections.defaultdict(int)

    for message in messages:
        # dict.fromkeys removes duplicates while keeping first-seen order;
        # iterating a set() here would make the final indices depend on
        # string hash randomisation.
        for word in dict.fromkeys(get_words(message)):
            word_counts[word] += 1

    # Load the stopword list once, as a set, instead of re-reading it and
    # scanning a list for every candidate word.
    english_stopwords = frozenset(stopwords.words('english'))

    resulting_dictionary = {}
    for word, count in word_counts.items():
        if count >= min_count and len(word) > 1 and word not in english_stopwords:
            resulting_dictionary[word] = len(resulting_dictionary)

    return resulting_dictionary




def transform_text(messages, word_dictionary):
    """Build the message-by-word count matrix.

    Args:
        messages: A sized iterable of strings.
        word_dictionary: A python dict mapping words to integer column indices.

    Returns:
        A float numpy array of shape (len(messages), len(word_dictionary))
        where entry (i, j) is the number of occurrences of the j-th
        vocabulary word in the i-th message. Words missing from the
        dictionary are ignored.
    """
    counts = np.zeros((len(messages), len(word_dictionary)))

    for row, text in enumerate(messages):
        for token in get_words(text):
            col = word_dictionary.get(token)
            if col is not None:
                counts[row, col] += 1

    return counts



def fit_naive_bayes_model(matrix, labels):
    """Fit a multinomial naive Bayes model.

    Args:
        matrix: Array-like of word counts, one row per training example.
        labels: Array-like of binary (0 or 1) labels, one per row of matrix
            (list, NumPy array or pandas Series).

    Returns:
        A dict with the log class priors ('logphi_0', 'logphi_1') and the
        per-word log likelihoods ('logtheta_0', 'logtheta_1').

    Raises:
        ValueError: if there are no training examples or the number of
            labels differs from the number of matrix rows.
    """
    # Work on NumPy arrays: a plain list would turn `labels == 0` into a
    # single bool and silently mis-index the matrix, and an integer matrix
    # would break the in-place normalisation below.
    matrix = np.asarray(matrix, dtype=float)
    labels = np.asarray(labels)

    if labels.shape[0] != matrix.shape[0]:
        raise ValueError("labels must have one entry per row of matrix")
    if labels.shape[0] == 0:
        raise ValueError("cannot fit a naive Bayes model without training examples")

    model = {}

    # Class prior, shrunk 5% towards 0.5 so that it never reaches 0 or 1 and
    # both logarithms stay finite even when every label is identical.
    phi = labels.mean() * 0.95 + 0.05 * 0.5
    model['logphi_0'] = np.log(1. - phi)
    model['logphi_1'] = np.log(phi)

    # Per-class word totals with add-one (Laplace) smoothing, normalised
    # into probability distributions over the vocabulary.
    theta_0 = matrix[labels == 0].sum(axis=0) + 1
    theta_1 = matrix[labels == 1].sum(axis=0) + 1
    theta_0 /= theta_0.sum()
    theta_1 /= theta_1.sum()
    model['logtheta_0'] = np.log(theta_0)
    model['logtheta_1'] = np.log(theta_1)

    return model



def predict_from_naive_bayes_model(model, matrix):
    """Use a Naive Bayes model to score a target matrix.

    Args:
        model: A trained model from fit_naive_bayes_model
        matrix: Array-like of word counts, one row per example

    Returns:
        A numpy array with one value per row: the posterior probability of
        class 1, in [0, 1]. Higher means class 1 is more likely.
    """
    matrix = np.asarray(matrix, dtype=float)

    # Joint log-probabilities log p(x, y) for each class. The matrix-vector
    # product avoids materialising a full rows x vocabulary temporary.
    logprobs_0 = matrix @ model['logtheta_0'] + model['logphi_0']
    logprobs_1 = matrix @ model['logtheta_1'] + model['logphi_1']

    # Posterior p(y=1 | x) = exp(l1) / (exp(l0) + exp(l1)), evaluated in log
    # space so long documents do not underflow. The previous ratio
    # l1 / (l1 + l0) of two negative log-probabilities was inverted: the
    # more likely class 1 was, the smaller the returned score.
    return np.exp(logprobs_1 - np.logaddexp(logprobs_0, logprobs_1))



def get_top_five_naive_bayes_words(model, dictionary):
    """Return the five words with the lowest logtheta_0 - logtheta_1.

    These are the words whose class-1 log likelihood exceeds their class-0
    log likelihood by the widest margin, strongest first.

    Args:
        model: The Naive Bayes model returned from fit_naive_bayes_model
        dictionary: A mapping of word to integer ids

    Returns: A list of five words, most indicative first
    """
    log_ratio = model['logtheta_0'] - model['logtheta_1']
    word_for_id = dict(zip(dictionary.values(), dictionary.keys()))
    return [word_for_id[idx] for idx in np.argsort(log_ratio)[:5]]
  
 


In [0]:
import collections

# Books without a snippet fall back to their title.
finalbooks['snippet'] = finalbooks['snippet'].fillna(finalbooks['title'])
# Strip punctuation. regex=True must be explicit: from pandas 2.0 on,
# str.replace treats the pattern as a literal string by default, which would
# leave every punctuation character in place.
finalbooks['snippet'] = finalbooks['snippet'].str.replace(r'[^\w\s]', "", regex=True)
# Second fallback for rows that are still empty: the tag cloud.
finalbooks['snippet'] = finalbooks['snippet'].fillna(finalbooks['tag_cloud'])
# Rows with no snippet, title or tag cloud become empty strings so that
# get_words() is never handed a NaN float.
finalbooks['snippet'] = finalbooks['snippet'].fillna('')
#finalbooks['tag_cloud'] = finalbooks['tag_cloud'].str.replace('-'," ")
#finalbooks['words'] = finalbooks['snippet'] +" "+finalbooks['tag_cloud']+" "+finalbooks['first_author']
dico = create_dictionary(finalbooks['snippet'])
dico

In [0]:
# Vocabulary size: number of words kept by create_dictionary.
len(dico)

In [0]:
# Word-count matrix: one row per book (in finalbooks row order), one column
# per vocabulary word.
A = transform_text(finalbooks['snippet'], dico)

# Binary "liked" flags: 1 when the rating is at least 4, else 0.
finalbooks['binary'] = (finalbooks['average_rating'] >= 4).astype('int64')
ratings['binary'] = (ratings['rating'] >= 4).astype('int64')

In [0]:
# Fit one naive Bayes model per user on that user's training ratings and
# score every book in the catalogue with it.
allpreds = []   # one (newbookid, newuser_id, pred) frame per user
topwords = []   # per-user top words; collection is currently disabled
indicators = np.zeros(len(dico))   # running sum of logtheta_0 - logtheta_1

n_users = 15000
# Group once instead of re-scanning the whole training frame for every user.
# Every id in 1..n_users is expected to have training rows; get_group raises
# KeyError otherwise.
train_by_user = train.groupby('newuser_id')

for i in range(n_users):
    user_train = train_by_user.get_group(i + 1)
    # A rating of 4 or more counts as a "like" (label 1).
    user_labels = (user_train['rating'] >= 4).astype(int).to_numpy()
    # Row of A for a book is newbookid - 1
    # (assumes finalbooks is ordered by newbookid starting at 1 -- TODO confirm).
    user_rows = user_train['newbookid'].to_numpy() - 1

    model = fit_naive_bayes_model(A[user_rows, :], user_labels)
    result = predict_from_naive_bayes_model(model, A)

    # Column order matters: a later cell re-labels the stacked array as
    # ['newbookid', 'newuser_id', 'pred'].
    allpreds.append(finalbooks[['newbookid']].assign(newuser_id=i + 1, pred=result))

    indicators = indicators + (model['logtheta_0'] - model['logtheta_1'])
    if (i+1)%1000 == 0: print("done: ", i+1)

In [0]:
# NOTE(review): this rescales `indicators` in place, so re-running the cell
# multiplies it again. The ranking below does not depend on the scale.
indicators = 15000 * indicators

# Words with the five largest accumulated logtheta_0 - logtheta_1 values.
reverse_dictionary = dict(zip(dico.values(), dico.keys()))
ids = np.argsort(-indicators)[:5]
[reverse_dictionary[idx] for idx in ids]

In [0]:
# Five smallest accumulated indicator values, multiplied by 1e12
# (presumably only to make the displayed magnitudes easier to read).
np.sort(indicators*1000000000000)[:5]

In [0]:
from collections import Counter

# `topwords` stays empty unless per-user top words are collected in the
# training loop; np.concatenate([]) would raise ValueError, so fall back to
# an empty array and say so instead of crashing the run.
fivewords = np.concatenate(topwords, axis=0) if topwords else np.array([], dtype=str)
if fivewords.size == 0:
    print("No per-user top words were collected; nothing to count.")

# How often each word appears among the users' top words, most frequent first.
for key, value in Counter(fivewords).most_common():
    print("%s: %s" % (key, value))


#Bayes =pd.DataFrame(predictions, columns=['newbookid', 'newuser_id', 'pred']) 

In [0]:
# Stack every user's prediction frame row-wise into one NumPy array.
# Column labels are lost here; the next cell re-attaches them.
predictions = np.concatenate(allpreds, axis=0 )

In [0]:
bayes = pd.DataFrame(predictions, columns=['newbookid', 'newuser_id', 'pred'])
# Stacking frames that mix integer ids with float predictions through NumPy
# yields a single float array, so the ids come back as 5.0 rather than 5.
# Restore integer ids so that joins and string keys built from them line up
# with the ids in train/test.
bayes = bayes.astype({'newbookid': 'int64', 'newuser_id': 'int64'})
bayes

In [0]:
# Attach each test pair's prediction (inner join on book and user), then
# order by user and, within a user, by predicted score -- both descending.
bayesrank = pd.merge(test, bayes, on=['newbookid', 'newuser_id'])
bayesrank = bayesrank.sort_values(by=['newuser_id', 'pred'], ascending=[False, False])
bayesrank.head()

In [0]:
# Map the prediction through x -> 4x + 1 (0 becomes 1, 1 becomes 5).
# NOTE(review): this overwrites 'pred' in place, so running the cell twice
# rescales twice; re-run the merge cell first if it needs repeating.
bayesrank['pred'] = 1 + 4 * bayesrank['pred']
bayesrank.head()

In [0]:
# Smallest prediction among the test pairs after rescaling
# (presumably a sanity check on the prediction range).
bayesrank['pred'].min()

In [0]:
# String key identifying a (user, book) pair. Ids are cast to integers first
# so that a float id such as 5.0 produces the same text as 5, and joined with
# '_' so that e.g. user 1 / book 23 and user 12 / book 3 no longer both
# collapse to "123".
train['conc'] = (
    train['newuser_id'].astype('int64').astype(str)
    + '_'
    + train['newbookid'].astype('int64').astype(str)
)
bayes['conc'] = (
    bayes['newuser_id'].astype('int64').astype(str)
    + '_'
    + bayes['newbookid'].astype('int64').astype(str)
)

In [0]:
# Keep only predictions for (user, book) pairs that are not in the training set.
not_in_train = ~bayes['conc'].isin(train['conc'])
bayesfin = bayes.loc[not_in_train]
bayesfin.describe()

In [0]:
# For every user 1..15000: the true test ratings in the order the rows appear
# in bayesrank (i.e. by descending prediction). One groupby pass replaces a
# full-frame boolean scan per user; groupby keeps the row order inside each
# group, and users with no test rows get an empty list as before.
ratings_by_user = {
    user_id: user_ratings.tolist()
    for user_id, user_ratings in bayesrank.groupby('newuser_id', sort=False)['rating']
}
bayeslist = [ratings_by_user.get(user_id, []) for user_id in range(1, 15000 + 1)]

In [0]:
# Per-user NDCG over each user's complete list of test ratings.
b = np.array([ndcg_k(r, len(r)) for r in bayeslist])

# Draw on the explicit Axes object rather than through pyplot's implicit
# "current axes", and label the plot so it can be read on its own.
facet, axes = plt.subplots(1, 1, figsize=(10, 3))
n, bins, patches = axes.hist(b, 200, facecolor='blue', alpha=0.5) #, log = True)
axes.set_title('Distribution of NDCG among Users for the Bayes model')
axes.set_xlabel('NDCG')
axes.set_ylabel('Number of users')
plt.show()

# [ndcg_k(r, len(r)) for r in poplista]

In [0]:
# Users whose NDCG is exactly 1, as a fraction of the 15000 users.
d = b[b == 1]
d.sum() / 15000

In [0]:
#top10 = bayesfin.sort_values('pred',ascending = False).groupby('newuser_id').head(10)
#top50 = bayesfin.sort_values('pred',ascending = False).groupby('newuser_id').head(50)

# Test-set summary: each metric is rounded to three decimals when printed.
bayes_test_report = [
    ('(1) Bayes Model RMSE: ', rmse(bayesrank['pred'],bayesrank['rating'])),
    ('(2) Bayes Model NDCG: ', mean_ndcg(bayeslist)),
    ("(2) Median NDCG: ", np.median(b)),
    ("(2) Share of NDCG =1 among Users: ", sum(d)/15000),
]
for label, value in bayes_test_report:
    print(label, np.round(value, decimals=3))
#print('(3) Bayes Model Div10 Score: ',np.round(sum(np.in1d(top10.newbookid, tail.newbookid))/len(top10), decimals=3))
#print('(3) Bayes Model Div50 Score: ',np.round(sum(np.in1d(top50.newbookid, tail.newbookid))/len(top50), decimals=3))

In [0]:
# Same pipeline as for the test pairs, applied to the training pairs: join
# the predictions, order by user and predicted score (both descending), then
# map the prediction through x -> 4x + 1.
bayesranktrain = (
    train.merge(bayes, on=['newbookid', 'newuser_id'])
    .sort_values(by=['newuser_id', 'pred'], ascending=False)
    .assign(pred=lambda ranked: ranked['pred'] * 4 + 1)
)


In [0]:
# For every user 1..15000: the true training ratings in the order the rows
# appear in bayesranktrain (i.e. by descending prediction). One groupby pass
# replaces a full-frame boolean scan per user; groupby keeps the row order
# inside each group, and users without rows get an empty list as before.
train_ratings_by_user = {
    user_id: user_ratings.tolist()
    for user_id, user_ratings in bayesranktrain.groupby('newuser_id', sort=False)['rating']
}
bayeslisttrain = [train_ratings_by_user.get(user_id, []) for user_id in range(1, 15000 + 1)]

In [0]:


# print('Popularity Model MAP: ', mean_average_precision(poplistb))
# Training-set summary: each metric is rounded to three decimals when printed.
bayes_train_report = [
    ('(1) Bayes Model Train RMSE: ', rmse(bayesranktrain['pred'],bayesranktrain['rating'])),
    ('(2) Bayes Model Train NDCG: ', mean_ndcg(bayeslisttrain)),
]
for label, value in bayes_train_report:
    print(label, np.round(value, decimals=3))
