<a href="https://colab.research.google.com/github/althafnewdelhi/CE888_DataScience/blob/master/Lab_5/Recommender_Features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy as np
from IPython.display import Image
np.set_printoptions(precision = 3)

Loading the data

In [0]:
user_ratings_df = pd.read_csv("https://raw.githubusercontent.com/albanda/CE888/master/lab5-recommender/user_ratings.csv")
user_features_df = pd.read_csv("https://raw.githubusercontent.com/albanda/CE888/master/lab5-recommender/user_features.csv")
item_features_df = pd.read_csv("https://raw.githubusercontent.com/albanda/CE888/master/lab5-recommender/item_features.csv")

In [0]:
user_features_df["key"] = 0
user_features_df["user_id"] = range(user_features_df.shape[0])
item_features_df["key"] = 0
item_features_df["item_id"] = range(item_features_df.shape[0])

merged_df = pd.merge(user_features_df, item_features_df, left_index=True, on="key")
merged_df.drop(labels='key', axis=1, inplace=True)  # drop the "key" column

In [4]:
merged_df[["item_id", "user_id"]]

Unnamed: 0,item_id,user_id
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
0,0,1
1,1,1
2,2,1
3,3,1
4,4,1


In [0]:
merged_df["rating"] = map(lambda ids: user_ratings_df.values[ids[1]][ids[2]], 
                          merged_df[["user_id", "item_id"]].itertuples())

train = merged_df.dropna()
test = merged_df[merged_df.isnull().any(axis=1)]

In [6]:
n_latent_factors = 2

user_ratings = user_ratings_df.values
latent_user_preferences = np.random.random((user_ratings.shape[0], n_latent_factors))
latent_item_features = np.random.random((user_ratings.shape[1], n_latent_factors))

user_features = user_features_df.values
item_features = item_features_df.values

user_features = np.concatenate([np.ones(shape=(user_features.shape[0],1)), user_features], axis=1)
item_features = np.concatenate([np.ones(shape=(item_features.shape[0],1)), item_features], axis=1)

user_features_weights = np.random.random((user_ratings.shape[0], user_features.shape[1]))
item_features_weights = np.random.random((user_ratings.shape[1], item_features.shape[1]))

print(user_features)

[[1. 1. 0. 0. 0.]
 [1. 0. 1. 0. 1.]
 [1. 0. 0. 0. 2.]
 [1. 1. 0. 0. 3.]
 [1. 0. 1. 0. 4.]
 [1. 0. 0. 0. 5.]
 [1. 0. 0. 0. 6.]
 [1. 1. 0. 0. 7.]
 [1. 0. 1. 0. 8.]
 [1. 1. 0. 0. 9.]]


In [0]:
def predict_rating(user_id, item_id):
    """
    Predict a rating given a user_id and an item_id.
    """
    user_preference = latent_user_preferences[user_id]
    item_preference = latent_item_features[item_id]
    
    user_score = user_features_weights[user_id].dot(user_features[user_id])
    item_score = item_features_weights[item_id].dot(item_features[item_id])
    #print(user_preference.dot(item_preference), user_score, item_score)
    return user_preference.dot(item_preference) + user_score + item_score


def train(user_id, item_id, rating, alpha=0.001, 
          latent_feature_weight_decay=0.1,
          user_weight_decay=0.01, item_weight_decay=0.0001):
    #print(item_id)
    prediction_rating = predict_rating(user_id, item_id)
    err = prediction_rating - rating
    #print(err)
    user_pref_values = latent_user_preferences[user_id][:]
    latent_user_preferences[user_id] -= alpha * err * (latent_item_features[item_id] + latent_feature_weight_decay * latent_user_preferences[user_id])
    latent_item_features[item_id] -= alpha * err * (user_pref_values + latent_feature_weight_decay * latent_item_features[item_id])
    
    user_features_weights[user_id] -=alpha * err * (user_features[user_id] + user_weight_decay * user_features_weights[user_id])
    item_features_weights[item_id] -=alpha * err * (item_features_weights[item_id] + item_weight_decay * item_features_weights[item_id])
    
    return err


def sgd(iterations=30000):
    """ 
    Iterate over all users and all items and train for 
    a certain number of iterations
    """
    for iteration in range(iterations):
        error = []
        for user_id in range(latent_user_preferences.shape[0]):
            for item_id in range(latent_item_features.shape[0]):
                rating = user_ratings[user_id][item_id]
                if not np.isnan(rating):
                    err = train(user_id, item_id, rating)
                    error.append(err)
    mse = (np.array(error) ** 2).mean()          
    print(mse)

In [8]:
for _ in range(10): 
    sgd()  # Note decreasing values with increasing iterations

0.290712447735076
0.28383143187645393
0.28133201051426593
0.28006368270836285
0.2793056716214882
0.2788104357451336
0.2784703077154947
0.27823094570842044
0.27806202338998415
0.27794538514601796


In [9]:
predictions = np.zeros(shape=(latent_user_preferences.shape[0], latent_item_features.shape[0]))
print(user_features_weights)
print(item_features_weights)
for user_id in range(latent_user_preferences.shape[0]):
    for item_id in range(latent_item_features.shape[0]):
        predictions[user_id, item_id] =  predict_rating(user_id, item_id)

[[-1.091 -1.231  0.358  0.196  0.881]
 [ 1.613  0.684  2.072  0.444  1.856]
 [ 0.785  0.138  0.748  0.274  0.448]
 [ 0.444  0.379  0.068  0.297  0.755]
 [ 0.148  0.966  0.63   0.365 -0.257]
 [ 0.839  0.726  0.931  0.09   1.379]
 [ 0.162  0.278  0.58   0.028 -0.227]
 [ 0.269  0.833  0.362  0.414  0.029]
 [ 0.861  0.481  0.049  0.628  0.534]
 [ 0.522 -0.077  0.617  0.228 -0.271]]
[[1.186e+00 1.385e+00 1.140e+00 4.188e-01 2.927e+00]
 [2.563e-04 3.109e-04 3.821e-04 3.252e-04 9.685e-04]
 [4.757e-01 2.517e+00 2.114e+00 2.040e-02 2.467e+00]
 [1.275e+00 1.826e+00 1.130e+00 3.336e+00 1.997e-01]
 [2.067e+00 1.355e+00 1.369e+00 9.316e-01 1.362e+00]]


In [0]:
values = [zip(user_ratings[i], predictions[i]) for i in range(predictions.shape[0])]
comparison_data = pd.DataFrame(values)
comparison_data.columns = user_ratings_df.columns

In [11]:
comparison_data

Unnamed: 0,The Call of Cthulhu,Frankenstein,Dracula,Neuromancer,Space Odyssey
0,"(8.0, 7.988908226747817)","(2.0, 2.0340427939297836)","(nan, -21.881921637225513)","(5.0, 4.9833651547582996)","(4.0, 3.9957344517544833)"
1,"(3.0, 2.8990757432924523)","(2.0, 2.5590489485248304)","(nan, 67.33779075311224)","(7.0, 6.619552916101618)","(7.0, 6.924911544901491)"
2,"(9.0, 9.043341542842018)","(nan, 4.367090949333944)","(7.0, 7.003073178777338)","(8.0, 7.951778254244229)","(5.0, 5.000722924795685)"
3,"(nan, 8.966589663920232)","(nan, 4.931937972130505)","(7.0, 7.00000434292866)","(8.0, 7.999627049326043)","(9.0, 8.999357894667648)"
4,"(nan, 3.8201697033120228)","(1.0, 0.5702668742791497)","(8.0, 7.988421594979183)","(3.0, 3.369305099903359)","(7.0, 7.053448647053955)"
5,"(2.0, 2.0018172850433213)","(3.0, 2.994870006618253)","(5.0, 5.0000348956172544)","(nan, 2.2752324540607862)","(nan, 32.67519304597736)"
6,"(4.0, 4.310591936154437)","(2.0, 0.4359229843971182)","(nan, -3.887293128359315)","(2.0, 2.9986694964275578)","(7.0, 7.204207209396689)"
7,"(7.0, 6.758952898398799)","(1.0, 2.9076046770490716)","(2.0, 2.086514166223968)","(7.0, 5.66351660894003)","(9.0, 8.760921889097965)"
8,"(3.0, 3.1022792763066604)","(3.0, 2.5190364790358073)","(nan, 76.05856719486854)","(7.0, 7.28968723460072)","(3.0, 3.060940620011933)"
9,"(4.0, 3.8925759081493356)","(nan, -0.14613611749631428)","(5.0, 4.98404054607164)","(3.0, 3.1088489387956093)","(3.0, 2.9968036951828303)"
