In [2]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from scipy import sparse
# import jovian

In [3]:
anime_ratings_df = pd.read_csv("rating.csv")
anime_ratings_df.shape
print(anime_ratings_df.head())

   user_id  anime_id  rating
0        1        20      -1
1        1        24      -1
2        1        79      -1
3        1       226      -1
4        1       241      -1


In [4]:
anime_ratings = anime_ratings_df.loc[anime_ratings_df.rating != -1].reset_index()[['user_id','anime_id','rating']]
print(anime_ratings.shape)
anime_ratings.head()

(68011, 3)


Unnamed: 0,user_id,anime_id,rating
0,1,8074,10
1,1,11617,10
2,1,11757,10
3,1,15451,10
4,2,11771,10


In [5]:
Counter(anime_ratings.rating)

Counter({10: 9797,
         8: 17917,
         6: 6236,
         9: 14468,
         7: 14973,
         3: 492,
         5: 2592,
         4: 1044,
         1: 214,
         2: 278})

In [6]:
Counter(anime_ratings.groupby(['user_id']).count()['anime_id'])

Counter({4: 14,
         1: 43,
         92: 3,
         459: 1,
         343: 1,
         12: 12,
         3: 23,
         110: 4,
         22: 7,
         123: 2,
         8: 14,
         407: 1,
         23: 12,
         21: 12,
         19: 9,
         180: 4,
         72: 2,
         52: 5,
         38: 9,
         79: 1,
         11: 14,
         129: 4,
         35: 5,
         76: 6,
         17: 15,
         18: 11,
         5: 13,
         68: 4,
         253: 2,
         51: 3,
         54: 8,
         85: 2,
         6: 11,
         280: 1,
         112: 4,
         26: 5,
         334: 1,
         34: 7,
         90: 7,
         116: 2,
         60: 3,
         40: 6,
         32: 8,
         16: 11,
         30: 4,
         7: 10,
         42: 4,
         96: 4,
         20: 8,
         25: 6,
         57: 5,
         2: 27,
         37: 7,
         73: 1,
         173: 1,
         94: 5,
         171: 2,
         108: 2,
         164: 2,
         10: 7,
         63: 3,
 

In [7]:
#Average number of ratings per user
np.mean(anime_ratings.groupby(['user_id']).count()['anime_id'])

83.04151404151405

In [8]:
train_df, valid_df = train_test_split(anime_ratings, test_size=0.2)

#resetting indices to avoid indexing errors in the future
train_df = train_df.reset_index()[['user_id', 'anime_id', 'rating']]
valid_df = valid_df.reset_index()[['user_id', 'anime_id', 'rating']]

In [9]:
def encode_column(column):
    """ Encodes a pandas column with continous IDs"""
    keys = column.unique()
    key_to_id = {key:idx for idx,key in enumerate(keys)}
    return key_to_id, np.array([key_to_id[x] for x in column]), len(keys)

In [10]:
def encode_df(anime_df):
    """Encodes rating data with continuous user and anime ids"""

    anime_ids, anime_df['anime_id'], num_anime = encode_column(anime_df['anime_id'])
    user_ids, anime_df['user_id'], num_users = encode_column(anime_df['user_id'])
    return anime_df, num_users, num_anime, user_ids, anime_ids

In [11]:
anime_df, num_users, num_anime, user_ids, anime_ids = encode_df(train_df)
print("Number of users :", num_users)
print("Number of anime :", num_anime)
anime_df.head()

Number of users : 811
Number of anime : 4079


Unnamed: 0,user_id,anime_id,rating
0,0,0,8
1,1,1,8
2,2,2,9
3,3,3,8
4,4,4,9


In [12]:
def create_embeddings(n, K):
    """
    Creates a random numpy matrix of shape n, K with uniform values in (0, 11/K)
    n: number of items/users
    K: number of factors in the embedding
    """
    return 11*np.random.random((n, K)) / K

In [13]:
def create_sparse_matrix(df, rows, cols, column_name="rating"):
    """ Returns a sparse utility matrix"""
    return sparse.csc_matrix((df[column_name].values,(df['user_id'].values, df['anime_id'].values)),shape=(rows, cols))

In [14]:
anime_df, num_users, num_anime, user_ids, anime_ids = encode_df(train_df)
Y = create_sparse_matrix(anime_df, num_users, num_anime)

In [15]:
# to view matrix
Y.todense()

matrix([[ 8,  0, 10, ...,  0,  0,  0],
        [ 0,  8,  9, ...,  0,  0,  0],
        [ 5,  0,  9, ...,  0,  0,  0],
        ...,
        [ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0]])

In [16]:
def predict(df, emb_user, emb_anime):
    """ This function computes df["prediction"] without doing (U*V^T).

    Computes df["prediction"] by using elementwise multiplication of the corresponding embeddings and then
    sum to get the prediction u_i*v_j. This avoids creating the dense matrix U*V^T.
    """
    df['prediction'] = np.sum(np.multiply(emb_anime[df['anime_id']],emb_user[df['user_id']]), axis=1)
    return df

In [17]:
lmbda = 0.0002

In [18]:
def cost(df, emb_user, emb_anime):
    """ Computes mean square error"""
    Y = create_sparse_matrix(df, emb_user.shape[0], emb_anime.shape[0])
    predicted = create_sparse_matrix(predict(df, emb_user, emb_anime), emb_user.shape[0], emb_anime.shape[0], 'prediction')
    return np.sum((Y-predicted).power(2))/df.shape[0]

In [19]:
def gradient(df, emb_user, emb_anime):
    """ Computes the gradient for user and anime embeddings"""
    Y = create_sparse_matrix(df, emb_user.shape[0], emb_anime.shape[0])
    predicted = create_sparse_matrix(predict(df, emb_user, emb_anime), emb_user.shape[0], emb_anime.shape[0], 'prediction')
    delta =(Y-predicted)
    grad_user = (-2/df.shape[0])*(delta*emb_anime) + 2*lmbda*emb_user
    grad_anime = (-2/df.shape[0])*(delta.T*emb_user) + 2*lmbda*emb_anime
    return grad_user, grad_anime

In [20]:
def gradient_descent(df, emb_user, emb_anime, iterations=2000, learning_rate=0.01, df_val=None):
    """
    Computes gradient descent with momentum (0.9) for given number of iterations.
    emb_user: the trained user embedding
    emb_anime: the trained anime embedding
    """
    Y = create_sparse_matrix(df, emb_user.shape[0], emb_anime.shape[0])
    beta = 0.9
    grad_user, grad_anime = gradient(df, emb_user, emb_anime)
    v_user = grad_user
    v_anime = grad_anime
    for i in range(iterations):
        grad_user, grad_anime = gradient(df, emb_user, emb_anime)
        v_user = beta*v_user + (1-beta)*grad_user
        v_anime = beta*v_anime + (1-beta)*grad_anime
        emb_user = emb_user - learning_rate*v_user
        emb_anime = emb_anime - learning_rate*v_anime
        if(not (i+1)%50):
            print("\niteration", i+1, ":")
            print("train mse:",  cost(df, emb_user, emb_anime))
            if df_val is not None:
                print("validation mse:",  cost(df_val, emb_user, emb_anime))
    return emb_user, emb_anime

In [21]:
emb_user = create_embeddings(num_users, 3)
emb_anime = create_embeddings(num_anime, 3)
emb_user, emb_anime = gradient_descent(anime_df, emb_user, emb_anime, iterations=800, learning_rate=1)


iteration 50 :
train mse: 5.898050262102756

iteration 100 :
train mse: 3.9124539990842364

iteration 150 :
train mse: 3.0960872560765758

iteration 200 :
train mse: 2.645482241096588

iteration 250 :
train mse: 2.3648589619905573

iteration 300 :
train mse: 2.1765671406278746

iteration 350 :
train mse: 2.043649058830469

iteration 400 :
train mse: 1.9463552066532592

iteration 450 :
train mse: 1.873219942211898

iteration 500 :
train mse: 1.8171546517156372

iteration 550 :
train mse: 1.7735545206890628

iteration 600 :
train mse: 1.739306378309572

iteration 650 :
train mse: 1.7122350329346354

iteration 700 :
train mse: 1.6907779487913286

iteration 750 :
train mse: 1.6737856889437004

iteration 800 :
train mse: 1.66039497883991


In [22]:
def encode_new_data(valid_df, user_ids, anime_ids):
    """ Encodes valid_df with the same encoding as train_df.
    """
    df_val_chosen = valid_df['anime_id'].isin(anime_ids.keys()) & valid_df['user_id'].isin(user_ids.keys())
    valid_df = valid_df[df_val_chosen]
    valid_df['anime_id'] =  np.array([anime_ids[x] for x in valid_df['anime_id']])
    valid_df['user_id'] = np.array([user_ids[x] for x in valid_df['user_id']])
    return valid_df

In [23]:
print("before encoding:", valid_df.shape)
valid_df = encode_new_data(valid_df, user_ids, anime_ids)
print("after encoding:", valid_df.shape)

before encoding: (13603, 3)
after encoding: (3989, 3)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_df['anime_id'] =  np.array([anime_ids[x] for x in valid_df['anime_id']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_df['user_id'] = np.array([user_ids[x] for x in valid_df['user_id']])


In [24]:
train_mse = cost(train_df, emb_user, emb_anime)
val_mse = cost(valid_df, emb_user, emb_anime)
print(train_mse, val_mse)

1.66039497883991 6.117813048501479


In [25]:
#looking at the predictions
valid_df[70:80].head()

Unnamed: 0,user_id,anime_id,rating,prediction
252,627,2508,8,3.762509
256,392,1913,7,6.566288
257,500,196,7,6.529525
259,288,578,5,6.229437
261,312,20,9,7.206924
