In [515]:
from scipy.io import loadmat
import numpy as np


# Завантажимо необхідні датасети

data = loadmat("hw7.2_data/movies.mat")
Y, R = data['Y'], data['R']
Y.shape, R.shape


def loadMovieList():
    """
    Reads the fixed movie list in movie_ids.txt and returns a list of movie names.
    Returns
    -------
    movieNames : list
        A list of strings, representing all movie names.
    """
    # Read the fixed movieulary list
    with open('hw7.2_data/movie_ids.txt',  encoding='ISO-8859-1') as fid:
        movies = fid.readlines()

    movieNames = []
    for movie in movies:
        parts = movie.split()
        movieNames.append(' '.join(parts[1:]).strip())
    return movieNames

movieList = loadMovieList()
movieList


['Toy Story (1995)',
 'GoldenEye (1995)',
 'Four Rooms (1995)',
 'Get Shorty (1995)',
 'Copycat (1995)',
 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)',
 'Twelve Monkeys (1995)',
 'Babe (1995)',
 'Dead Man Walking (1995)',
 'Richard III (1995)',
 'Seven (Se7en) (1995)',
 'Usual Suspects, The (1995)',
 'Mighty Aphrodite (1995)',
 'Postino, Il (1994)',
 "Mr. Holland's Opus (1995)",
 'French Twist (Gazon maudit) (1995)',
 'From Dusk Till Dawn (1996)',
 'White Balloon, The (1995)',
 "Antonia's Line (1995)",
 'Angels and Insects (1995)',
 'Muppet Treasure Island (1996)',
 'Braveheart (1995)',
 'Taxi Driver (1976)',
 'Rumble in the Bronx (1995)',
 'Birdcage, The (1996)',
 'Brothers McMullen, The (1995)',
 'Bad Boys (1995)',
 'Apollo 13 (1995)',
 'Batman Forever (1995)',
 'Belle de jour (1967)',
 'Crimson Tide (1995)',
 'Crumb (1994)',
 'Desperado (1995)',
 'Doom Generation, The (1995)',
 'Free Willy 2: The Adventure Home (1995)',
 'Mad Love (1995)',
 'Nadja (1994)',
 'Net, The (1995

In [516]:
# Функція втрат

def cost_f(X, W, Y, R, l):
    return (1 / 2) * np.sum(((np.dot(X, W.T) - Y) * R) ** 2) + (l / 2) * np.sum(W ** 2) + (l / 2) * np.sum(X ** 2)


In [517]:
# Функція градієнтів

def gradient(X, W, Y, R, l):
    X_grad = np.dot((np.dot(X, W.T) - Y) * R, W) + l * X
    W_grad = np.dot(((np.dot(X, W.T) - Y) * R).T, X) + l * W
    return X_grad, W_grad


In [518]:
# Додамо до датасету свій перелік рейтингів

ratings = {1: 4, 98: 2, 7: 3, 12: 5, 54: 4, 64: 5, 66: 3, 69: 5, 183: 4, 226: 5, 355: 5}

my_ratings = np.zeros(Y.shape[0], dtype=int)
my_R = np.zeros(Y.shape[0], dtype=int)
for m, r in ratings.items():
    my_ratings[m-1] = r
    my_R[m-1] = 1

    print(f"Rated {r} for {movieList[m-1]}")

Y = np.c_[my_ratings, Y]
R = np.c_[my_R, R]
print(Y.shape, R.shape)


Rated 4 for Toy Story (1995)
Rated 2 for Silence of the Lambs, The (1991)
Rated 3 for Twelve Monkeys (1995)
Rated 5 for Usual Suspects, The (1995)
Rated 4 for Outbreak (1995)
Rated 5 for Shawshank Redemption, The (1994)
Rated 3 for While You Were Sleeping (1995)
Rated 5 for Forrest Gump (1994)
Rated 4 for Alien (1979)
Rated 5 for Die Hard 2 (1990)
Rated 5 for Sphere (1998)
(1682, 944) (1682, 944)


In [519]:
# Натренуємо модель

n_factors=30
lr=0.0005
l = 10
iter = 300

X = np.random.random((Y.shape[0], n_factors))
W = np.random.random((Y.shape[1], n_factors))

for i in range(iter):
    cost = cost_f(X, W, Y, R, l)

    X_grad, W_grad = gradient(X, W, Y, R, l)

    X -= lr * X_grad
    W -= lr * W_grad

    print(f'Iteration {i+1}: Cost {cost}')


Iteration 1: Cost 1076260.1935853718
Iteration 2: Cost 394427.6908539278
Iteration 3: Cost 261806.94716021503
Iteration 4: Cost 240749.5015253939
Iteration 5: Cost 225905.2918242945
Iteration 6: Cost 210451.61790192482
Iteration 7: Cost 192937.38919859685
Iteration 8: Cost 177321.615985998
Iteration 9: Cost 164574.72089803423
Iteration 10: Cost 155494.22619704617
Iteration 11: Cost 149068.79326179242
Iteration 12: Cost 144509.2251546339
Iteration 13: Cost 141063.37160881443
Iteration 14: Cost 138296.7402847742
Iteration 15: Cost 135943.26896281147
Iteration 16: Cost 133861.19045124814
Iteration 17: Cost 131972.16997376614
Iteration 18: Cost 130232.7308448077
Iteration 19: Cost 128616.65163928887
Iteration 20: Cost 127106.55579731101
Iteration 21: Cost 125689.71017917455
Iteration 22: Cost 124356.05839463435
Iteration 23: Cost 123097.26784929927
Iteration 24: Cost 121906.2422241139
Iteration 25: Cost 120776.84223979367
Iteration 26: Cost 119703.70456038672
Iteration 27: Cost 118682.1112

In [520]:
predict = np.dot(X, W.T)
predict

array([[3.31274684, 4.5824343 , 3.83089713, ..., 4.01803098, 4.50786114,
        3.39319449],
       [2.70846668, 2.94003142, 3.02011342, ..., 3.08558764, 3.5442909 ,
        3.58293007],
       [2.44858484, 3.21845296, 2.68802195, ..., 2.92439106, 3.35862991,
        2.98318604],
       ...,
       [0.93750594, 1.20377243, 1.01258006, ..., 1.05513459, 1.1667244 ,
        0.93703554],
       [1.17510511, 1.45897102, 1.38217882, ..., 1.35261534, 1.48896152,
        1.51036206],
       [1.1851338 , 1.46816387, 1.34058523, ..., 1.41144718, 1.56775823,
        1.40518022]])

In [521]:

# Виведемо рекомендації для переліку рейтингів що були додані до датасету

my_predict = predict[:,0]
my_predict[np.where(R[:,0] != 0)] = 0

top_items = np.argsort(-my_predict)[:10]

for item in top_items:
    print(f"Predicting rating {my_predict[item]} for movie {movieList[item]}")

Predicting rating 3.796996973448148 for movie Titanic (1997)
Predicting rating 3.766666826264865 for movie Raiders of the Lost Ark (1981)
Predicting rating 3.7018698702274757 for movie Wrong Trousers, The (1993)
Predicting rating 3.685568290735309 for movie Schindler's List (1993)
Predicting rating 3.664855316196729 for movie Good Will Hunting (1997)
Predicting rating 3.650930482759334 for movie Star Wars (1977)
Predicting rating 3.6458876912200693 for movie Close Shave, A (1995)
Predicting rating 3.6132682305300317 for movie Godfather, The (1972)
Predicting rating 3.607094516496455 for movie Empire Strikes Back, The (1980)
Predicting rating 3.589805909431689 for movie Wallace & Gromit: The Best of Aardman Animation (1996)


In [522]:
# Висновок: навіть працює, і, здається, рекомендує не не повну дурню