<a href="https://colab.research.google.com/github/bukhtiarhaider/CE888/blob/main/my_recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
np.set_printoptions(precision=3)

In [3]:
data = pd.read_csv('https://raw.githubusercontent.com/albanda/CE888/master/lab4-recommender/jester-data-1.csv', header=None)
data
data.replace(99, np.nan, inplace=True)
data.drop(data.columns[0], axis=1, inplace=True)
data

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,91,92,93,94,95,96,97,98,99,100
0,-7.82,8.79,-9.66,-8.16,-7.52,-8.50,-9.85,4.17,-8.98,-4.76,...,2.82,,,,,,-5.63,,,
1,4.08,-0.29,6.36,4.37,-2.38,-9.66,-0.73,-5.34,8.88,9.22,...,2.82,-4.95,-0.29,7.86,-0.19,-2.14,3.06,0.34,-4.32,1.07
2,,,,,9.03,9.27,9.03,9.27,,,...,,,,9.08,,,,,,
3,,8.35,,,1.80,8.16,-2.82,6.21,,1.84,...,,,,0.53,,,,,,
4,8.50,4.61,-4.17,-5.39,1.36,1.60,7.04,4.61,-0.44,5.73,...,5.19,5.58,4.27,5.19,5.73,1.55,3.11,6.55,1.80,1.60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24978,0.44,7.43,9.08,2.33,3.20,6.75,-8.79,-0.53,-8.74,7.23,...,8.83,-1.21,9.22,-6.70,8.45,9.03,6.55,8.69,8.79,7.43
24979,9.13,-8.16,8.59,9.08,0.87,-8.93,-3.50,5.78,-8.11,4.90,...,-1.17,-5.73,-1.46,0.24,9.22,-8.20,-7.23,-8.59,9.13,8.45
24980,,,,,-7.77,,6.70,-6.75,,,...,,,,,,,,,,
24981,,,,,-9.71,,4.56,-8.30,,,...,,,,,,,,,,


In [4]:
averages = data.mean(axis=0)
index_best = averages.idxmax()
index_worst = averages.idxmin()
print('Best rated index: ', index_best, '| Best rated value: ', averages[index_best])
print('Worst rated index: ', index_worst, '| Worst rated value: ', averages[index_worst])

Best rated index:  50 | Best rated value:  3.6650848950824937
Worst rated index:  58 | Worst rated value:  -3.8338796373689923


In [5]:
def replace(orig, percentage=0.1):
  """
  Replaces 'percentage'% of the original values in 'orig' with 99's
  :param orig: original data array
  :param percentage: percentage of values to replace (0<percentage<1)
  """
  new_data = orig.copy()
  rated = np.where(~np.isnan(orig))
  n_rated = len(rated[0])
  idx = np.random.choice(n_rated, size=int(percentage*n_rated), replace=False)
  for val in idx:
    new_data.iloc[rated[0][val]][rated[1][val]] = 99
  #new_data.iloc[rated[0][idx], rated[1][idx]] = 99
  return new_data, (rated[0][idx], rated[1][idx])

In [6]:
new_data, var = replace(data)

In [7]:
n_latent_factors = 2

# Initialise as random values
latent_user_preferences = np.random.random((new_data.shape[0], n_latent_factors))
latent_item_features = np.random.random((new_data.shape[1], n_latent_factors))

In [8]:
def predict_rating(user_id, item_id):
    """ Predict a rating given a user_id and an item_id.
    """
    user_preference = latent_user_preferences[user_id]
    item_preference = latent_item_features[item_id]
    return user_preference.dot(item_preference)


def train(user_id, item_id, rating, alpha=0.0001):
    #print(item_id)
    predicted_rating = predict_rating(user_id, item_id)
    err =  predicted_rating - rating
    #print(err)
    user_pref_values = latent_user_preferences[user_id]
    latent_user_preferences[user_id] -= alpha * err * latent_item_features[item_id]
    latent_item_features[item_id] -= alpha * err * user_pref_values
    return err
    

def sgd(iterations):
    """ Iterate over all users and all items and train for 
        a certain number of iterations
    """
    mse_history = []
    for iteration in range(iterations):
        error = []
        for user_id in range(latent_user_preferences.shape[0]):
            for item_id in range(latent_item_features.shape[0]):
                rating = new_data.iloc[user_id, item_id]
                if not np.isnan(rating):
                    err = train(user_id, item_id, rating)
                    error.append(err)
        mse = (np.array(error) ** 2).mean()   
        if (iteration % 10000) == 0:
            print('Iteration %d/%d:\tMSE=%.6f' % (iteration, iterations, mse))
            mse_history.append(mse)
    return mse_history

In [9]:
num_iter = 10
hist = sgd(num_iter)  # Note how the MSE decreases with the number of iterations
plt.figure()
plt.plot(np.arange(0, num_iter, 10000), hist)
plt.xlabel("Iterations")
plt.ylabel("MSE")
plt.show()

Iteration 0/10:	MSE=911.861005


KeyboardInterrupt: ignored

In [13]:
user_preference = [-0.5093567559,-0.05233160927,-1.316709218,-0.4381727588,-0.001224580749,-0.3063774528,0.008600096167,-0.3263340741,-0.3456246464,0.2826051983,0.294095387,0.5321319435,0.3766879957,0.274314491,-0.003782486973]
item_preference = [-0.3203154137,0.07828959951,-0.08318190639,-0.2215772631,-0.07818350868,-0.1453058228,-0.0201658816,0.1244092822,-0.06342513842,0.004725659299,0.009683469369,0.02801552954,-0.01809952776,-0.1284522696,0.0699720274]

In [17]:
a = np.array(user_preference)
b = np.array(item_preference)
a.dot(b)

0.3682088029849904

In [20]:
data.iloc[1615, 65]

7.43