In [36]:
import csv
import numpy as np
import matrixFactorization as mf
import biasMatrixFactorization as bmf
import matplotlib.pyplot as plt
from scipy.sparse.linalg import svds
from basicVisualization import getPopularMovies, getBestMovies, getThreeGenres, basicVisualization
from projection import projection
from surprise import NMF, Dataset, Reader
from surprise.model_selection import cross_validate

BORDER = "==================================================================="


In [37]:

def loadRatings(fileName):
    '''
    Load data from the data.txt file

    Input format: user_id\tmovie_id\trating

    user_id = int
    movie_id = int
    rating = int
    '''
    ratings = []
    f = open(fileName, 'r')

    for line in f:
        ratings.append(line.split())

    return np.asarray(ratings, dtype=int)


def loadMovies(fileName):
    '''
    Load data from the movies.txt file

    Input format: movie_id\tmovie_title\tUnknown\tAction\tAdventure\tAnimation
    \tChildrens\tComedy\tCrime\tDocumentary\tDrama\tFantasy\tFilm-Noir\tHorror
    \tMusical\tMystery\tRomance\tSci-Fi\tThriller\tWar\tWestern

    movie_id = int
    movie_title = string
    {Unknown, Action,..., Western} = bool (0 or 1)

    returns three dictionaries
    movie_ID = {movie_id:movie_title}
    movie_categoty = {movie_id:categories}
        where categories is a numpy array of 0 or 1 representing if the
        movie falls under the category
    movie_genres = {index of genre: genre name}
    '''
    movie_ID = {}
    movie_category = {}

    movie_genres = {
        0:  'Unknown',   1:  'Action',  2:  'Adventure', 3: 'Animation',
        4:  'Childrens', 5:  'Comedy',  6:  'Crime',     7: 'Documentary',
        8:  'Drama',     9:  'Fantasy', 10: 'Film-Noir', 11: 'Horror',
        12: 'Musical',   13: 'Mystery', 14: 'Romance',   15: 'Sci-Fi',
        16: 'Thriller',  17: 'War',     18: 'Western'}

    with open(fileName, encoding='ISO-8859-1') as f:
        reader = csv.reader(f, delimiter='\t')
        for movieData in reader:
            # print(movieData)
            movie_ID[int(movieData[0])] = movieData[1]
            categories = [int(x) for x in movieData[2:]]
            movie_category[int(movieData[0])] = np.asarray(categories)

    return movie_ID, movie_category, movie_genres


def Homework_5_SVD_With_Regularization(train, test):
    print(BORDER)
    print("Homework_5_SVD_With_Regularization - reg")
    M = max(max(train[:, 0]), max(test[:, 0])).astype(int)  # users
    N = max(max(train[:, 1]), max(test[:, 1])).astype(int)  # movies
    print("Factorizing with ", M, " users, ", N, " movies.")
    K = 20

    regs = [10**-4, 10**-3, 10**-2, 10**-1, 1]
    eta = 0.03  # learning rate
    E_ins = []
    E_outs = []

    for reg in regs:
        print("Training model with reg = %s" % (reg))
        U, V, _ = mf.train_model(M, N, K, eta, reg, train)
        E_ins.append(mf.get_err(U, V, train))
        E_outs.append(mf.get_err(U, V, test))

    plt.plot(regs, E_ins, label='$E_{in}$')
    plt.plot(regs, E_outs, label='$E_{out}$')
    plt.title('Error vs. reg')
    plt.xlabel('reg')
    plt.ylabel('Error')
    plt.xscale('log')
    plt.legend()
    plt.savefig('visualizations/method1_reg.png')
    plt.clf()


def Homework_5_SVD(train, test):
    print(BORDER)
    print("Homework_5_SVD")
    M = max(max(train[:, 0]), max(test[:, 0])).astype(int)  # users
    N = max(max(train[:, 1]), max(test[:, 1])).astype(int)  # movies
    print("Factorizing with ", M, " users, ", N, " movies.")
    K = 20

    reg = 10**-1
    eta = 0.03  # learning rate
    U, V, _ = mf.train_model(M, N, K, eta, reg, train)
    E_in = mf.get_err(U, V, train)
    E_out = mf.get_err(U, V, test)
    print("E_in = ", E_in)
    print("E_out = ", E_out)

    return U, V


def SVD_With_Bias(train, test):
    print(BORDER)
    print("SVD_With_Bias")
    M = max(max(train[:, 0]), max(test[:, 0])).astype(int)  # users
    N = max(max(train[:, 1]), max(test[:, 1])).astype(int)  # movies
    print("Factorizing with ", M, " users, ", N, " movies.")
    K = 20

    reg = 10**-1
    eta = 0.03  # learning rate
    U, V, err, a, b, mu = bmf.train_model(M, N, K, eta, reg, train)
    E_in = bmf.get_err(U, V, train, a, b, mu, reg)
    E_out = bmf.get_err(U, V, test, a, b, mu, reg)
    print("E_in = ", E_in)
    print("E_out = ", E_out)

    return U, V


def Get_Err_From_Pred(pred, ratings):
    n_rows = ratings.shape[0]
    error = 0

    for row in range(n_rows):
        user_ind = ratings[row, 0] - 1
        movie_ind = ratings[row, 1] - 1
        rating = ratings[row, 2]
        dev = rating - pred[user_ind, movie_ind]
        error += dev * dev
    return ((1 / 2.) * error) / n_rows


def Off_The_Shelf_SVD_Cross_Valid(dataFile):
    print(BORDER)
    print("Off_The_Shelf_SVD")
    
    reader = Reader(line_format='user item rating', sep='\t')
    dataset = Dataset.load_from_file(dataFile, reader=reader)

    #dataset = dataset.build_full_trainset()
    algo = NMF()
    cross_validate(algo, dataset, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    
    
def Off_The_Shelf_SVD(train, test, dataFile):
    print(BORDER)
    print("Off_The_Shelf_SVD")
    
    reader = Reader(line_format='user item rating', sep='\t')
    dataset = Dataset.load_from_file(dataFile, reader=reader)

    dataset = dataset.build_full_trainset()
    algo = NMF()
    algo.fit(dataset)
    U = algo.pu
    V = algo.qi
    bu = algo.bu
    bv = algo.bi
    #pred = np.matrix(np.dot(U, V.T)) + np.matrix(bu).T + np.matrix(bv)
    #E_in = Get_Err_From_Pred(pred, train)
    #E_out = Get_Err_From_Pred(pred, test)
    #print("E_in = ", E_in)
    #print("E_out = ", E_out)
    
    return U, V

def GetAverageMovieRatings(movie_ratings):
    totalRating = {}
    numRatings  = {}
    for (user_id, movie_id, rating) in movie_ratings:
        if movie_id not in totalRating:
            totalRating[movie_id] = rating
            numRatings[movie_id] = 1
        else:
            totalRating[movie_id] = totalRating[movie_id] + rating
            numRatings[movie_id] = numRatings[movie_id] + 1
        
    avgRating = {}
    for movie_id in totalRating:
        avgRating[movie_id] = totalRating[movie_id] / numRatings[movie_id]
    
    return avgRating
    
        
        

In [38]:
directory = 'visualizations/'
dataFile = "data/data.txt"
moviesFile = "data/movies.txt"
trainingFile = "data/train.txt"
testFile = "data/test.txt"

# all movie ratings as movie_ratings = [[user ID, movie ID, rating]]
movie_ratings = loadRatings(dataFile)
# movie info as three dictionaries:
#     movie_ID = {movie_id:movie_title}
#     movie_category = {movie_id:categories}
#     movie_genres = {index of genre: genre name}
movie_ID, movie_category, movie_genres = loadMovies(moviesFile)

#     avgRatings = {movie_id:average rating}
avgRatings = GetAverageMovieRatings(movie_ratings)

train = loadRatings(trainingFile)
test  = loadRatings(testFile)


In [39]:
# c = 0
# for (user_id, movie_id, rating) in movie_ratings:
#     print("{}, {}, {}".format(user_id, movie_id, rating))
#     c = c + 1
#     if (c == 10):
#         break

avgRatings = GetAverageMovieRatings(movie_ratings)
print(avgRatings)


{242: 3.9914529914529915, 302: 4.1616161616161618, 377: 2.1538461538461537, 51: 3.4567901234567899, 346: 3.6428571428571428, 474: 4.2525773195876289, 265: 3.8634361233480177, 465: 3.5647058823529414, 451: 3.3470588235294119, 86: 3.9399999999999999, 257: 3.7458745874587458, 1014: 3.0612244897959182, 222: 3.6602739726027398, 40: 2.8947368421052633, 29: 2.6666666666666665, 785: 3.1538461538461537, 387: 3.3846153846153846, 274: 3.5, 1042: 3.1428571428571428, 1184: 2.5, 392: 3.5441176470588234, 486: 3.796875, 144: 3.8724279835390947, 118: 3.2150170648464163, 1: 3.8783185840707963, 546: 3.0314960629921259, 95: 3.8127853881278537, 768: 3.0769230769230771, 277: 3.464788732394366, 234: 3.7749999999999999, 246: 3.935483870967742, 98: 4.2897435897435896, 193: 3.9171974522292992, 88: 3.539906103286385, 194: 4.0580912863070537, 1081: 2.75, 603: 4.3875598086124405, 796: 3.0833333333333335, 32: 3.7901234567901234, 16: 3.2051282051282053, 304: 3.5369127516778525, 979: 3.2000000000000002, 564: 2.037037

In [40]:
#basicVisualization(movie_ratings, movie_ID, movie_category, movie_genres,
#                   directory)

In [41]:
#Homework_5_SVD_With_Regularization(train, test)

In [42]:
#Off_The_Shelf_SVD_Cross_Valid(dataFile)

In [43]:
U, V = Homework_5_SVD(train, test)
U_proj, V_proj = projection(U, V)



Homework_5_SVD
Factorizing with  943  users,  1682  movies.
epoch: 0, error: 1.018030
epoch: 1, error: 0.865850
epoch: 2, error: 0.800623
epoch: 3, error: 0.770302
epoch: 4, error: 0.749551
epoch: 5, error: 0.730171
epoch: 6, error: 0.713072
epoch: 7, error: 0.691497


KeyboardInterrupt: 

In [None]:
def PlotMovies(V0, V1, titles, plotTitle, showTitles=True):
    plt.scatter(V0, V1)
    
    plt.title(plotTitle)
    
    fig = plt.gcf()
    fig.set_size_inches(18.5, 18.5)

    
    
    if (showTitles):
        for title, x, y in zip(titles, V0, V1):
            plt.annotate(
                title,
                xy=(x, y), xytext=(0, 20),
                textcoords='offset points', ha='center', va='bottom',
                bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
                arrowprops=dict(arrowstyle = '->', connectionstyle='arc3,rad=0'))

    plt.savefig('visualizations/' + plotTitle + '.png')
    plt.clf()

In [None]:
#===============================================================================================
# Visualize V for any ten movies of your choice from the MovieLens dataset.
# Let's choose the first 10, because I'm lazy
#===============================================================================================
movieIndices = [
    50,   # Star Wars
    69,   # Forrest Gump
    82,   # Jurassic Park
    95,   # Aladdin
    420,  # Alice in Wonderland
    474,  # Dr. Strangelove
    666,  # Blood for Dracula
    780,  # Dumb and Dumber
    1127, # The Truman Show
    1360, # Sexual Life of the Belgians
]

V0 = []
V1 = []
titles = []
for i in movieIndices:
    V0.append(V_proj[0, i - 1])
    V1.append(V_proj[1, i - 1])
    titles.append(movie_ID[i])

PlotMovies(V0, V1, titles, "")


In [None]:
#===============================================================================================
# Visualize V for the ten most popular movies (movies which have received the most ratings).
#===============================================================================================
top10 = getPopularMovies(movie_ratings, movie_ID)

V0 = []
V1 = []
titles = []
for i, _ in top10:
    V0.append(V_proj[0, i - 1])
    V1.append(V_proj[1, i - 1])
    titles.append(movie_ID[i])
    

PlotMovies(V0, V1, titles, "")

In [None]:
#===============================================================================================
# Visualize V for the ten best movies (movies with the highest average ratings).
#===============================================================================================
best10 = getBestMovies(movie_ratings)

V0 = []
V1 = []
titles = []
for i in best10:
    V0.append(V_proj[0, i - 1])
    V1.append(V_proj[1, i - 1])
    titles.append(movie_ID[i])
    

PlotMovies(V0, V1, titles, "")


In [None]:
#===============================================================================================
# Visualize V for ten movies from the Comedy, Romance, Horror genre you selected in Section 4, Basic Visualizations
#===============================================================================================
genreList = [5] # comedies
comedies = getThreeGenres(movie_category, genreList)[0]

genreList = [11] # horrors
horrors = getThreeGenres(movie_category, genreList)[0]

genreList = [15] # romance
romances = getThreeGenres(movie_category, genreList)[0]



V0 = []
V1 = []
titles = []
for i in range(10):
    idx = comedies[i]
    V0.append(V_proj[0, idx - 1])
    V1.append(V_proj[1, idx - 1])
    titles.append(movie_ID[idx])
    
for i in range(10):
    idx = horrors[i]
    V0.append(V_proj[0, idx - 1])
    V1.append(V_proj[1, idx - 1])
    titles.append(movie_ID[idx])
    
for i in range(10):
    idx = romances[i]
    V0.append(V_proj[0, idx - 1])
    V1.append(V_proj[1, idx - 1])
    titles.append(movie_ID[idx])
    

PlotMovies(V0, V1, titles, "", showTitles=True)



In [None]:
#===============================================================================================
# Separate visualizations for ten movies from the Comedy, Romance, Horror genres
#===============================================================================================
genreList = [5] # comedies
comedies = getThreeGenres(movie_category, genreList)[0]

V0 = []
V1 = []
titles = []
for i in range(10):
    idx = comedies[i]
    V0.append(V_proj[0, idx - 1])
    V1.append(V_proj[1, idx - 1])
    titles.append(movie_ID[idx])
    
PlotMovies(V0, V1, titles, "Comedy_Movies", showTitles=True)

In [None]:
genreList = [11] # horrors
horrors = getThreeGenres(movie_category, genreList)[0]

V0 = []
V1 = []
titles = []
for i in range(10):
    idx = horrors[i]
    V0.append(V_proj[0, idx - 1])
    V1.append(V_proj[1, idx - 1])
    titles.append(movie_ID[idx])
PlotMovies(V0, V1, titles, "Horror_Movies", showTitles=True)

In [None]:
genreList = [15] # romance
romances = getThreeGenres(movie_category, genreList)[0]

V0 = []
V1 = []
titles = []
for i in range(10):
    idx = romances[i]
    V0.append(V_proj[0, idx - 1])
    V1.append(V_proj[1, idx - 1])
    titles.append(movie_ID[idx])
PlotMovies(V0, V1, titles, "Romance_Movies", showTitles=True)

In [None]:
def sortMovieYear(movieList, movie_ID):
    '''
    Returns a dictionary of {year:[list of movies]}
    '''
    years = {}
    for movie in movieList:
        try:
            title = movie_ID[movie]
            year = title.rsplit('(', 1)[1][:-1]
            if year in years:
                years[year].append(movie)
            else:
                years[year] = [movie]
        except IndexError:
            print("problem processing movie:\n{} -- {}".format(movie, movie_ID[movie]))
            
    return years

In [None]:
def findAverage(movieList, V_proj):
    num_movies = len(movieList)
    V0 = 0
    V1 = 0
    for movie in movieList:
        ind = movie - 1
        V0 += V_proj[0, ind]
        V1 += V_proj[1, ind]
    return (V0 / num_movies, V1 / num_movies)

In [None]:
#===============================================================================================
# Visualize V for Horror movies across different years
#===============================================================================================

genreList = [11] # Horror
horrors = getThreeGenres(movie_category, genreList)[0]

sorted_years = sortMovieYear(horrors, movie_ID)

V0 = []
V1 = []
titles = []
for i in sorted_years.keys():
    (v0, v1) = findAverage(sorted_years[i], V_proj)
    V0.append(v0)
    V1.append(v1)
    titles.append(i)
PlotMovies(V0, V1, titles, "", showTitles=True)

In [None]:
#===============================================================================================
# Visualize V for Horror movies across different years
#===============================================================================================

genreList = [5] # Comedies
horrors = getThreeGenres(movie_category, genreList)[0]

sorted_years = sortMovieYear(horrors, movie_ID)

V0 = []
V1 = []
titles = []
for i in sorted_years.keys():
    (v0, v1) = findAverage(sorted_years[i], V_proj)
    V0.append(v0)
    V1.append(v1)
    titles.append(i)
PlotMovies(V0, V1, titles, "", showTitles=True)

In [None]:
print(movie_genres)
#===============================================================================================
# Visualize V for Horror movies across different years
#===============================================================================================

for genre_idx in range(19):
    genreList = [genre_idx]
    
    movies = getThreeGenres(movie_category, genreList)[0]

    sorted_years = sortMovieYear(movies, movie_ID)

    V0 = []
    V1 = []
    titles = []
    for i in sorted_years.keys():
        (v0, v1) = findAverage(sorted_years[i], V_proj)
        V0.append(v0)
        V1.append(v1)
        titles.append(i)
    PlotMovies(V0, V1, titles, movie_genres[genreList[0]], showTitles=True)

In [None]:
print(movie_genres)
#===============================================================================================
# Visualize V for Horror movies across different years
#===============================================================================================
def PlotByStrength(X, Y, ratingStrength, plotTitle, showTitles = False):
    maxRating = np.max(ratingStrength)
    minRating = np.min(ratingStrength)

    ratingRange = maxRating - minRating;
    
    colors = []
    for rating in ratingStrength:
        val = (rating - minRating) / ratingRange
        colors.append((1.0, 1.0 - val, 1.0 - val))
    
    plt.scatter(V0, V1, c=colors)
    
    plt.title(plotTitle)
    
    fig = plt.gcf()
    fig.set_size_inches(18.5, 18.5)

    if (showTitles):
        for title, x, y in zip(titles, V0, V1):
            plt.annotate(
                title,
                xy=(x, y), xytext=(0, 20),
                textcoords='offset points', ha='center', va='bottom',
                bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
                arrowprops=dict(arrowstyle = '->', connectionstyle='arc3,rad=0'))

    plt.show()    
        

for genre_idx in range(19):
    genreList = [genre_idx]
    
    movies = getThreeGenres(movie_category, genreList)[0]

    ratingStrength = []
    V0 = []
    V1 = []
    titles = []
    for idx in movies:
        ratingStrength.append(avgRatings[idx] / 5)
        V0.append(V_proj[0, idx - 1])
        V1.append(V_proj[1, idx - 1])
        titles.append(movie_ID[idx])
        
    PlotByStrength(V0, V1, ratingStrength, movie_genres[genreList[0]], showTitles=False)