In [1]:
import pandas as pd
import numpy as np
import re
import math
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import GridSearchCV
from surprise import KNNBaseline,SVDpp
from surprise.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
 #loads data from the file
def loadDataSet(filename, tSet=[]):
    filename = '/data/' + filename;
    with open(filename) as f:
        lines = list(f)
    for x in range(len(lines)):
        tSet.append(lines[x])

In [3]:
#converts the binary features into an int array
def convertToStringArray(t_Set):
    s_Set = [[0]] * len(t_Set)
    for index in range(len(t_Set)): 
        p = ''.join(t_Set[index])
        s_Set[index] = p
    return s_Set

In [4]:
# DATA
# train.dat: user ratings for movies
# test.dat: user-movie pairs but not rating (goal to predict these)
# movie_genres.dat: genres of movies
# movie_directors.dat: directors of movies
# movie_actors.dat: main actors/actresses of movies (ranking: order they appear on IMDb page)
# tags.dat: set of tags in dataset
# user_taggedmovies.dat: tag assignments of movie for each user
# movie_tags.dat: tags assigned to movies (including # times assigned to each movie)
# test.dat: test set of user-movie pairs
# example_entry.dat: sample submission (71299 entries, range 0-5)

In [5]:
# LOADING THE DATA FOR USER RATINGS FOR MOVIES
userRatings = []
loadDataSet('train.dat', userRatings)
userRatings.remove(userRatings[0])
users = []
movies = []
ratings = []
movie_ratings = {}
for i in range(len(userRatings)):
    l = [x for x in list(userRatings[i])]
    t_indexes = [i for i, x in enumerate(l) if x == ' ']
    user_id = float(''.join(l[0:t_indexes[0]]))
    movie_id = float(''.join(l[t_indexes[0]:t_indexes[1]]))
    movie_ranking = float(''.join(l[t_indexes[1]:]))
    users.append(int(user_id))
    movies.append(float(movie_id))
    ratings.append(float(movie_ranking))
    if movie_id in movie_ratings:
        movie_ratings[movie_id].append(movie_ranking)
    else:
        movie_ratings[movie_id] = [movie_ranking]
movies_list = list(set(movies))

In [6]:
# average ratings and number of people watched for each movie
mratings = []
uwatched = []
for i in movies_list:
    try:
        mratings.append(np.average(movie_ratings[i]))
        uwatched.append(len(movie_ratings[i]))
    except:
        mratings.append(0)
        uwatched.append(0)

In [7]:
# LOADING THE DATA FOR MOVIE GENRES
genres = []
loadDataSet('movie_genres.dat', genres)
genres.remove(genres[0])
convertToStringArray(genres)
movie_genres = {}
for i in range(len(genres)):
    l = [x for x in list(genres[i])]
    t_indexes = [i for i, x in enumerate(l) if x == '\t']
    movie_id = float(genres[i][0:t_indexes[0]])
    genre = genres[i][t_indexes[0]:]
    genre = re.sub('\s+', '', genre)
    if movie_id in movie_genres:
        movie_genres[movie_id].append(genre)
    else:
        movie_genres[movie_id] = [genre]
genres = []
for i in movies_list:
    try:
        genres.append(movie_genres[i])
    except:
        genres.append([])

In [8]:
# LOADING THE DATA FOR MOVIE DIRECTORS
directors = []
loadDataSet('movie_directors.dat', directors)
directors.remove(directors[0])
movie_directors = {}
for i in range(len(directors)):
    l = [x for x in list(directors[i])]
    t_indexes = [i for i, x in enumerate(l) if x == '\t']
    movie_id = float(''.join(l[0:t_indexes[0]]))
    director_id = ''.join(l[t_indexes[0]:t_indexes[-1]]).replace('\t','')
    director_name = ''.join(l[t_indexes[-1]:]).replace('\t','').replace('\n','')
    movie_directors[movie_id] = director_id
directors = []
for i in movies_list:
    try:
        directors.append(movie_directors[i])
    except:
        directors.append("")

In [9]:
# LOADING THE DATA FOR MOVIE ACTORS
actors = []
loadDataSet('movie_actors.dat', actors)
actors.remove(actors[0])
movie_actors = {}
movie_actor_ratings = {}
for i in range(len(actors)):
    l = [x for x in list(actors[i])]
    t_indexes = [i for i, x in enumerate(l) if x == '\t']
    movie_id = float(''.join(l[0:t_indexes[0]]))
    actor_id = ''.join(l[t_indexes[0]:t_indexes[1]]).replace('\t','')
    actor_name = ''.join(l[t_indexes[1]:t_indexes[2]]).replace('\t','').replace('\n','')
    actor_ranking = float(''.join(l[t_indexes[2]:]).replace('\t','').replace('\n',''))
    if movie_id in movie_actors:
        movie_actors[movie_id].append(actor_id)
        movie_actor_ratings[movie_id].append(actor_ranking)
    else:
        movie_actors[movie_id] = [actor_id]
        movie_actor_ratings[movie_id] = [actor_ranking]
for id in movie_actor_ratings.keys():
    arr = movie_actor_ratings[id]
    average = np.average(arr)
    movie_actor_ratings[id] = average
actors = []
actor_ratings = []
actor_size = []
for i in movies_list:
    try:
        actors.append(movie_actors[i])
        actor_ratings.append(movie_actor_ratings[i])
        actor_size.append(len(movie_actors[i]))
    except:
        actors.append([])
        actor_ratings.append(0)
        actor_size.append(0)

In [10]:
# LOADING THE DATA FOR TAGS
tags = []
loadDataSet('tags.dat', tags)
tags.remove(tags[0])
movie_tags = {}
for i in range(len(tags)):
    l = [x for x in list(tags[i])]
    t_indexes = [i for i, x in enumerate(l) if x == '\t']
    tag_id = float(''.join(l[0:t_indexes[0]]))
    tag = ''.join(l[t_indexes[0]:]).replace('\t','').replace('\n','')
    movie_tags[tag_id] = tag

In [11]:
# LOADING THE DATA FOR MOVIE TAGS
tags_movies = []
loadDataSet('movie_tags.dat', tags_movies)
tags_movies.remove(tags_movies[0])
tagged_movies = {}
for i in range(len(tags_movies)):
    l = [x for x in list(tags_movies[i])]
    t_indexes = [i for i, x in enumerate(l) if x == '\t']
    movie_id = float(''.join(l[0:t_indexes[0]]))
    tag_id = float(''.join(l[t_indexes[0]:t_indexes[1]]))
    tag_weight = float(''.join(l[t_indexes[1]:]))
    arr = []
    for p in range(int(tag_weight)):
        arr.append(movie_tags[int(tag_id)])
    if movie_id in tagged_movies:
        tagged_movies[movie_id] = tagged_movies[movie_id] + arr
    else:
        tagged_movies[movie_id] = arr
tmovies = []
for i in movies_list:
    try:
        tmovies.append(" ".join(tagged_movies[i]))
    except:
        tmovies.append("")

In [12]:
# creates dataframe with user, movie, ratings
ratings_dict = {"movie": movies, "user": users, "rating": ratings}
df = pd.DataFrame(ratings_dict)

In [13]:
# determines movie rating from the average of all users who watched said movie
def user_mean(user_id, movie_id):
    if movie_id in r_matrix:
        mean_rating = df[movie_id].mean()
    else: 
        mean_rating = 3.0
    return mean_rating

In [15]:
# builds the training set for suprise algorithms
reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(df[["user", "movie", "rating"]], reader)
trainingSet = data.build_full_trainset()

In [17]:
#fine tune parameters for SVDpp
#param_grid = {'lr_all': [0.006, 0.008], 'reg_all': [0.01, 0.03]}
#gs = GridSearchCV(SVDpp, param_grid, measures=['rmse'], cv=3,n_jobs=2)

#fine tune parameters for KNNBaseline
#param_grid = {'name': ['pearson', 'cosine'], 'min_support': [3,5,7], 'user_based': [True, False]}
#param_grid = {'name': ['pearson_baseline', 'msd'], 'min_support': [3,5,7], 'user_based': [True, False]}
#param_grid = {'name': ['pearson_baseline', 'pearson'], 'min_support': [3], 'user_based': [True]}
param_grid = {'name': ['pearson_baseline'], 'min_support': [1, 3], 'user_based': [True]}
gs = GridSearchCV(KNNBaseline, param_grid, measures=['rmse'], cv=3, n_jobs=2)

gs.fit(data)

#print best combination of parameters
print(gs.best_params['rmse'])

{'name': 'pearson_baseline', 'min_support': 1, 'user_based': True}


In [18]:
# algorithm selection

#algo = SVDpp(lr_all=0.007, reg_all=0.02)

# To use item-based cosine similarity
sim_options = {
    "name": "pearson_baseline",
    "min_support": 1,
    "user_based": True,  # Compute  similarities between items
}
algo = KNNBaseline(sim_options=sim_options)

In [19]:
algo.fit(trainingSet)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x24a34fd7848>

In [None]:
#cross validation
#cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=True)

In [20]:
# imports testing data
testInput = []
loadDataSet('test.dat', testInput)
testInput.remove(testInput[0])
testData = [] 
for i in range(len(testInput)):
    l = [x for x in list(testInput[i])]
    t_indexes = [i for i, x in enumerate(l) if x == ' ']
    user_id = float(''.join(l[0:t_indexes[0]]))
    movie_id = float(''.join(l[t_indexes[0]: (len(l) - 1)]))
    testData.append([user_id, movie_id])
print(len(testData))

71299


In [21]:
# predictions based on test data
predictions = []
for i in range(len(testData)):
    user_id = testData[i][0]
    movie_id = testData[i][1]
    p = algo.predict(user_id, movie_id)
    predictions.append(round(p.est,1))
print(len(predictions))
print(predictions[0])

71299
3.0


In [22]:
# save to text file
solution = ""
for x in range(len(predictions)):
    solution += str(float(predictions[x])) + "\n"
with open("solution.txt", "w") as text_file:
    text_file.write(solution)