In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import csv
import pickle

from tensorflow import keras
from tensorflow.keras.layers import Input, Dense, Lambda, Dot
from tensorflow.keras.models import Sequential
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.optimizers import Adam

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

from collections import defaultdict

In [2]:
def loadData():
    itemTrain = np.genfromtxt('./data/content_item_train.csv', delimiter = ',')
    userTrain = np.genfromtxt('./data/content_user_train.csv', delimiter = ',')
    yTrain = np.genfromtxt('./data/content_y_train.csv', delimiter = ',')
   
    itemList = np.genfromtxt('./data/content_item_vecs.csv', delimiter = ',')
    userList = np.unique(userTrain, axis = 0)
    
    with open('./data/content_user_to_genre.pickle', 'rb') as f:
        userToGenre = pickle.load(f)

    movieInfos = defaultdict(dict)
    count = 0
    with open('./data/content_movie_list.csv', newline = '') as csvfile:
        reader = csv.reader(csvfile, delimiter = ',', quotechar = '"')
        for line in reader:
            if count == 0: 
                count += 1  
            else:
                count += 1
                movieID = int(line[0])  
                movieInfos[movieID]["title"] = line[1]  
                movieInfos[movieID]["genres"] = line[2]  
    
    return (itemTrain, userTrain, yTrain, itemList, userList, userToGenre, movieInfos)

In [3]:
itemTrain, userTrain, yTrain, itemList, userList, userToGenre, movieInfos = loadData()

userFeatureStartIdx = 3 
itemFeatureStartIdx = 1
userFeatureCount = userTrain.shape[1] - userFeatureStartIdx 
itemFeatureCount = itemTrain.shape[1] - itemFeatureStartIdx 

print(f"itemTrain shape: {itemTrain.shape}")
print(f"itemTrain: {itemTrain[:5]}")
print(f"userTrain shape: {userTrain.shape}")
print(f"userTrain: {userTrain[:5]}")
print(f"yTrain shape: {yTrain.shape}")
print(f"itemList shape: {itemList.shape}")
print(f"userList shape: {userList.shape}")
print(f"userFeatureCount: {userFeatureCount}")
print(f"itemFeatureCount: {itemFeatureCount}")

itemTrain shape: (58187, 17)
itemTrain: [[6.87400000e+03 2.00300000e+03 3.96183206e+00 1.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00]
 [6.87400000e+03 2.00300000e+03 3.96183206e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  1.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00]
 [6.87400000e+03 2.00300000e+03 3.96183206e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  1.00000000e+00]
 [8.79800000e+03 2.00400000e+03 3.76136364e+00 1.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.0000

In [4]:
rawUserTrain = userTrain

scalerItem = StandardScaler()
scalerItem.fit(itemTrain)
itemTrain = scalerItem.transform(itemTrain)

scalerUser = StandardScaler()
scalerUser.fit(userTrain)
userTrain = scalerUser.transform(userTrain)

scalerTarget = MinMaxScaler((-1, 1))
yTrain = yTrain.reshape(-1, 1)
scalerTarget.fit(yTrain)
yTrain = scalerTarget.transform(yTrain)

itemTrain, itemValid = train_test_split(itemTrain, train_size = 0.80, shuffle = True, random_state = 1)
userTrain, userValid = train_test_split(userTrain, train_size = 0.80, shuffle = True, random_state = 1)
yTrain, yValid = train_test_split(yTrain, train_size = 0.80, shuffle = True, random_state = 1)

In [5]:
outputSize = 32
tf.random.set_seed(1)
userNN = Sequential([
    Dense(256, activation = "relu"),
    Dense(128, activation = "relu"),
    Dense(outputSize),
    Lambda(lambda x: tf.keras.backend.l2_normalize(x, axis = 1)),
])

itemNN = Sequential([
    Dense(256, activation = "relu"),
    Dense(128, activation = "relu"),
    Dense(outputSize),
    Lambda(lambda x: tf.keras.backend.l2_normalize(x, axis = 1)),
])

userNNInput = Input(shape = (userFeatureCount, ))
vUser = userNN(userNNInput)

itemNNInput = Input(shape = (itemFeatureCount, ))
vItem = itemNN(itemNNInput)

output = Dot(axes = 1)([vUser, vItem])

model = tf.keras.Model([userNNInput, itemNNInput], output)
model.compile(optimizer = Adam(0.01),
              loss = MeanSquaredError())
xTrain = [userTrain[:, userFeatureStartIdx:], itemTrain[:, itemFeatureStartIdx:]]
model.fit(xTrain, yTrain, epochs = 30)
print(f"evaluate:")
model.evaluate([userValid[:, userFeatureStartIdx:], itemValid[:, itemFeatureStartIdx:]], yValid)


Epoch 1/30
[1m1455/1455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 832us/step - loss: 0.1306
Epoch 2/30
[1m1455/1455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 862us/step - loss: 0.1173
Epoch 3/30
[1m1455/1455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 841us/step - loss: 0.1151
Epoch 4/30
[1m1455/1455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 825us/step - loss: 0.1137
Epoch 5/30
[1m1455/1455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 811us/step - loss: 0.1126
Epoch 6/30
[1m1455/1455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 890us/step - loss: 0.1116
Epoch 7/30
[1m1455/1455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 850us/step - loss: 0.1108
Epoch 8/30
[1m1455/1455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 819us/step - loss: 0.1100
Epoch 9/30
[1m1455/1455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 839us/step - loss: 0.1094
Epoch 10/30
[1m1455/1455[0m [32m━━━━━━━━━━━━━━━━━━━

0.10718576610088348

注意: 評分範圍是0.5 ~ 5，以0.5為單位。

In [6]:
newUserID = 5000
newRatingAve = 1.0
newAction = 1.0
newAdventure = 5.0 #
newAnimation = 1.0
newChildrens = 1.0
newComedy = 1.0
newCrime = 1.0
newDocumentary = 1.0
newDrama = 1.0
newFantasy = 5.0 #
newHorror = 1.0
newMystery = 1.0
newRomance = 1.0
newScifi = 1.0
newThriller = 1.0
newRatingCount = 3

newUser = np.array([[newUserID, newRatingCount, newRatingAve,
                      newAction, newAdventure, newAnimation, newChildrens,
                      newComedy, newCrime, newDocumentary,
                      newDrama, newFantasy, newHorror, newMystery,
                      newRomance, newScifi, newThriller]])

In [7]:
newUserPeritems = np.tile(newUser, (len(itemList), 1))
newUserPeritems = scalerUser.transform(newUserPeritems)

items = scalerItem.transform(itemList)

pred = model.predict([newUserPeritems[:, userFeatureStartIdx:], items[:, itemFeatureStartIdx:]])
pred = scalerTarget.inverse_transform(pred)

sortedIndex = np.argsort(-pred, axis = 0).reshape(-1).tolist()
sortedPreds = pred[sortedIndex]
sortedItems = itemList[sortedIndex]

recommendedList = np.c_[sortedPreds, sortedItems[:, 0]][:5]
print(f"recommended top {len(recommendedList)} movies:")
for i in range(len(recommendedList)):
    itemID = recommendedList[i, 1]
    print(f"rating: {recommendedList[i, 0]:0.2f}, itemID: {itemID:0.0f}, name: {movieInfos[itemID]['title']}, label: {movieInfos[itemID]['genres']}")

[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
recommended top 5 movies:
rating: 4.46, itemID: 5952, name: Lord of the Rings: The Two Towers, The (2002), label: Adventure|Fantasy
rating: 4.43, itemID: 4993, name: Lord of the Rings: The Fellowship of the Ring, The (2001), label: Adventure|Fantasy
rating: 4.41, itemID: 4993, name: Lord of the Rings: The Fellowship of the Ring, The (2001), label: Adventure|Fantasy
rating: 4.41, itemID: 6016, name: City of God (Cidade de Deus) (2002), label: Action|Adventure|Crime|Drama|Thriller
rating: 4.40, itemID: 7153, name: Lord of the Rings: The Return of the King, The (2003), label: Action|Adventure|Drama|Fantasy


In [8]:
def getRating(userID):
    itemCount = len(itemList)
    ratingList = np.zeros(itemCount)
    for i in range(itemCount): 
        itemID = itemList[i, 0]
        if itemID in userToGenre[userID]['movies']:
            rating = userToGenre[userID]['movies'][itemID]
        else:
            rating = 0
        ratingList[i] = rating
    return ratingList
    
user = userList[0]
userPeritems = np.tile(user, (len(itemList), 1))
userPeritems = scalerUser.transform(userPeritems)

items = scalerItem.transform(itemList)

ratingList = getRating(user[0])

pred = model.predict([userPeritems[:, userFeatureStartIdx:], items[:, itemFeatureStartIdx:]])
pred = scalerTarget.inverse_transform(pred)

sortedIndex = np.argsort(-pred, axis = 0).reshape(-1).tolist() 
sortedPred = pred[sortedIndex]
sortedItems = itemList[sortedIndex]
sortedUser = userPeritems[sortedIndex]
sortedRatingList = ratingList[sortedIndex]

recommendedList = np.c_[sortedRatingList, sortedPreds, sortedItems[:, 0]][:5]
print(f"recommended top {len(recommendedList)} movies:")
for i in range(len(recommendedList)):
    itemID = recommendedList[i, 2]
    print(f"rating: {recommendedList[i, 0]:0.2f}, pred rating: {recommendedList[i, 1]:0.2f}, itemID: {itemID:0.0f}, name: {movieInfos[itemID]['title']}, label: {movieInfos[itemID]['genres']}")

[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 716us/step
recommended top 5 movies:
rating: 0.00, pred rating: 4.46, itemID: 168252, name: Logan (2017), label: Action|Sci-Fi
rating: 0.00, pred rating: 4.43, itemID: 122926, name: Untitled Spider-Man Reboot (2017), label: Action|Adventure|Fantasy
rating: 0.00, pred rating: 4.41, itemID: 114935, name: Predestination (2014), label: Action|Mystery|Sci-Fi|Thriller
rating: 0.00, pred rating: 4.41, itemID: 122926, name: Untitled Spider-Man Reboot (2017), label: Action|Adventure|Fantasy
rating: 0.00, pred rating: 4.40, itemID: 122916, name: Thor: Ragnarok (2017), label: Action|Adventure|Sci-Fi


In [9]:
itemNNInput = Input(shape = (itemFeatureCount, ))  
itemNN = itemNN(itemNNInput)                                       
itemNN = Lambda(lambda x: tf.keras.backend.l2_normalize(x, axis = 1))(itemNN)
itemNN = tf.keras.Model(itemNNInput, itemNN)                                
itemNN.summary()

In [10]:
items = scalerItem.transform(itemList)
itemVecs = itemNN.predict(items[:, itemFeatureStartIdx:])
print(f"size of all predicted movie feature vectors: {itemVecs.shape}")

[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
size of all predicted movie feature vectors: (1883, 32)


In [11]:
itemCount = len(items)
disMatrix = np.sqrt(np.sum((itemVecs[:, np.newaxis, :] - itemVecs[np.newaxis, :, :]) ** 2, axis = 2))
maskedDisMatrix = np.ma.masked_array(disMatrix, mask = np.identity(itemCount)) 

for i in range(10):
    idx = np.argmin(maskedDisMatrix[i])
    movie1 = int(itemList[i, 0])
    movie2 = int(itemList[idx, 0])
    movieStr1 = f"{movie1}[{movieInfos[movie1]['genres']}]"
    movieStr2 = f"{movie2}[{movieInfos[movie2]['genres']}]"
    print(f"{movieStr1} is similar to {movieStr2}")

4054[Drama|Romance] is similar to 5108[Crime|Drama|Thriller]
4054[Drama|Romance] is similar to 5296[Comedy|Romance]
4069[Comedy|Romance] is similar to 5449[Comedy|Romance]
4069[Comedy|Romance] is similar to 5449[Comedy|Romance]
4148[Horror|Thriller] is similar to 6058[Horror|Thriller]
4148[Horror|Thriller] is similar to 5400[Drama|Thriller]
4149[Comedy|Romance] is similar to 4386[Children|Comedy]
4149[Comedy|Romance] is similar to 4054[Drama|Romance]
4153[Comedy|Fantasy|Romance] is similar to 4247[Adventure|Comedy|Mystery|Romance]
4153[Comedy|Fantasy|Romance] is similar to 6958[Children|Comedy|Fantasy|Horror]
