# Recommender Systems

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import re

In [2]:
#loads data from the file
def loadDataSet(filename, tSet=None):
    """Append every line of ``filename`` to ``tSet`` and return that list.

    Lines are stored unchanged, so each one keeps its trailing newline.

    Args:
        filename: Path of the text file to read.
        tSet: List to extend in place. When omitted, a new list is created
            for this call.

    Returns:
        The list that received the lines (``tSet`` itself when one is given).

    Raises:
        OSError: If ``filename`` cannot be opened.
    """
    # A ``[]`` default is created once at definition time and would keep
    # accumulating lines across calls, so a fresh list is built per call.
    if tSet is None:
        tSet = []
    with open(filename) as f:
        tSet.extend(f)
    return tSet

In [4]:
#joins every entry of the input into one string
def convertToStringArray(t_Set):
    """Return a new list with ``''.join(entry)`` for each entry of ``t_Set``.

    Each entry must be an iterable of strings (a plain string qualifies and
    comes back unchanged). ``t_Set`` itself is not modified.
    """
    return [''.join(entry) for entry in t_Set]

In [6]:
# DATA
# train.dat: user ratings for movies
# test.dat: user-movie pairs without ratings (the goal is to predict these)
# movie_genres.dat: genres of movies
# movie_directors.dat: directors of movies
# movie_actors.dat: main actors/actresses of movies (ranking: order they appear on IMDb page)
# tags.dat: set of tags in dataset
# user_taggedmovies.dat: tag assignments of movie for each user
# movie_tags.dat: tags assigned to movies (including the number of times each tag was assigned to the movie)
# test.dat: test set of user-movie pairs
# example_entry.dat: sample submission (71299 entries, range 0-5)

In [13]:
# LOADING THE DATA FOR USER RATINGS FOR MOVIES
# Fills three parallel lists (user_id, movie_id, movie_ranking) with one
# float per data line of train.dat.
userRatings = []
loadDataSet('train.dat', userRatings)
del userRatings[0]  # drop the first line (presumably the column header)
user_id = []
movie_id = []
movie_ranking = []
for line in userRatings:
    # Three space-separated fields: user, movie, rating. maxsplit=2 keeps
    # everything after the second space in the last field.
    fields = line.split(' ', 2)
    # Convert all three values before appending so the lists cannot end up
    # with different lengths when a line is malformed.
    user, movie, rating = float(fields[0]), float(fields[1]), float(fields[2])
    user_id.append(user)
    movie_id.append(movie)
    movie_ranking.append(rating)

In [17]:
# Distinct values found in movie_id.
movie_set = {*movie_id}

9936
641699


In [None]:
# LOADING THE DATA FOR MOVIE RANKINGS
# userRatings holds raw text lines, so each line is split into its
# whitespace-separated fields first; indexing a line directly would only
# pick out single characters.
rating_rows = [line.split() for line in userRatings if line.strip()]
# list of users (repeats)
user = [float(row[0]) for row in rating_rows]
# list of movies
movie = [float(row[1]) for row in rating_rows]
# list of movie rankings
rank = [float(row[2]) for row in rating_rows]
# list of movies with all their rankings
movie_rankings = {}
for movie_key, rating in zip(movie, rank):
    movie_rankings.setdefault(movie_key, []).append(rating)
# A float key hashes and compares equal to the matching int, so 1127
# finds the entry stored under 1127.0.
print(movie_rankings[1127])

In [None]:
# LOADING THE DATA FOR MOVIE GENRES
# Builds movie_genres: movie id (float) -> list of genre strings, in file
# order.
genres = []
loadDataSet('movie_genres.dat', genres)
del genres[0]  # drop the first line (presumably the column header)
movie_genres = {}
for line in genres:
    # Tab-separated: the movie id comes first, the rest is the genre text.
    movie_field, genre_field = line.split('\t', 1)
    movie_g = float(movie_field)
    # Raw string so that \s reaches the regex engine as written; removing
    # all whitespace also drops the trailing newline.
    genre = re.sub(r'\s+', '', genre_field)
    movie_genres.setdefault(movie_g, []).append(genre)
print(movie_genres[99])

In [None]:
# LOADING THE DATA FOR MOVIE DIRECTORS
# Builds movie_directors: movie id (float) -> [director_id, director_name].
# A later line for the same movie replaces the earlier entry.
directors = []
loadDataSet('movie_directors.dat', directors)
del directors[0]  # drop the first line (presumably the column header)
movie_directors = {}
for line in directors:
    # Tab-separated: first field is the movie id, last field is the name,
    # and whatever sits in between is joined into the director id.
    # Unpacking fails with ValueError when a line has no tab at all.
    movie_field, *id_parts, name_field = line.split('\t')
    # NOTE(review): this rebinds the module-level name movie_id, which the
    # ratings cell uses for its full list of ids - confirm that is intended.
    movie_id = float(movie_field)
    director_id = ''.join(id_parts)
    director_name = name_field.replace('\n','')
    movie_directors[movie_id] = [director_id, director_name]
print(movie_directors[100])

In [None]:
# LOADING THE DATA FOR MOVIE ACTORS
# Builds movie_actors: movie id (float) -> list of
# [actor_id, actor_name, actor_ranking] entries, in file order.
actors = []
loadDataSet('movie_actors.dat', actors)
del actors[0]  # drop the first line (presumably the column header)
movie_actors = {}
for line in actors:
    # Tab-separated fields: movie id, actor id, actor name, ranking.
    fields = line.split('\t')
    # NOTE(review): this rebinds the module-level name movie_id, which the
    # ratings cell uses for its full list of ids - confirm that is intended.
    movie_id = float(fields[0])
    actor_id = fields[1]
    actor_name = fields[2].replace('\n','')
    # Everything after the third tab is the ranking; float() ignores the
    # trailing newline.
    actor_ranking = float(''.join(fields[3:]))
    # Record the parsed row; without this the dictionary stays empty.
    movie_actors.setdefault(movie_id, []).append(
        [actor_id, actor_name, actor_ranking]
    )

In [None]:
# LOADING THE DATA FOR TAGS
# Reads tags.dat, discards its first line, then prints the two
# tab-separated parts of the first remaining line.
tags = []
loadDataSet('tags.dat', tags)
del tags[0]
l = list(tags[0])
t_indexes = [pos for pos, ch in enumerate(l) if ch == '\t']
# NOTE(review): tags.dat is described as the set of tags, so this first
# field is presumably a tag id rather than a movie id - confirm. The
# assignment also rebinds the module-level name movie_id.
movie_id = float(tags[0][:t_indexes[0]])
print(movie_id)
# Everything from the first tab onwards, minus tabs and newlines.
tag = tags[0][t_indexes[0]:].replace('\t','').replace('\n','')
print(tag)

In [None]:
# LOADING THE DATA FOR USER TAGGED MOVIES
# Reads user_taggedmovies.dat, discards its first line, then prints the
# three space-separated numbers of the first remaining line.
tagged_movies = []
loadDataSet('user_taggedmovies.dat', tagged_movies)
del tagged_movies[0]
l = list(tagged_movies[0])
t_indexes = [pos for pos, ch in enumerate(l) if ch == ' ']
# NOTE(review): user_id and movie_id are rebound to single numbers here;
# the ratings cell uses the same names for its full lists - confirm.
user_id = float(tagged_movies[0][:t_indexes[0]])
print(user_id)
movie_id = float(tagged_movies[0][t_indexes[0]:t_indexes[1]])
print(movie_id)
# Text after the second space; float() ignores the surrounding whitespace.
tag_id = float(tagged_movies[0][t_indexes[1]:])
print(tag_id)

In [None]:
# LOADING THE DATA FOR MOVIE TAGS
# Reads movie_tags.dat, discards its first line, then prints the three
# tab-separated numbers of the first remaining line.
movie_tags = []
loadDataSet('movie_tags.dat', movie_tags)
del movie_tags[0]
l = list(movie_tags[0])
t_indexes = [pos for pos, ch in enumerate(l) if ch == '\t']
# NOTE(review): movie_id is rebound to a single number here; the ratings
# cell uses the same name for its full list of ids - confirm.
movie_id = float(movie_tags[0][:t_indexes[0]])
print(movie_id)
tag_id = float(movie_tags[0][t_indexes[0]:t_indexes[1]])
print(tag_id)
# Text after the second tab; float() ignores the surrounding whitespace.
tag_weight = float(movie_tags[0][t_indexes[1]:])
print(tag_weight)