In [1]:
# Print the R version, platform, locale and loaded packages of this session.
sessionInfo()

R version 3.3.3 (2017-03-06)
Platform: i686-pc-linux-gnu (32-bit)
Running under: Ubuntu 16.04.2 LTS

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=it_IT.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=it_IT.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=it_IT.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=it_IT.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

loaded via a namespace (and not attached):
 [1] R6_2.2.0            magrittr_1.5        IRdisplay_0.4.4    
 [4] pbdZMQ_0.2-5        tools_3.3.3         crayon_1.3.2       
 [7] uuid_0.1-2          stringi_1.1.3       IRkernel_0.8.6.9000
[10] jsonlite_1.3        stringr_1.2.0       digest_0.6.12      
[13] repr_0.10           evaluate_0.10      

## Load movie data
Let's start by loading the dataset provided by https://grouplens.org/datasets/movielens/ and looking at its content. It contains a table of movie titles and genres, and a table of the ratings that users gave to the movies they watched.

In [2]:
# Read the movies table (movieId, title, genres) from the CSV file.
movies_df <- read.csv(file = "data/movies.csv", header = TRUE)

In [3]:
# Show the structure (column names and types) of the movies table.
str(movies_df)

'data.frame':	9125 obs. of  3 variables:
 $ movieId: int  1 2 3 4 5 6 7 8 9 10 ...
 $ title  : Factor w/ 9123 levels "10,000 BC (2008)",..: 8301 4310 3412 8648 2750 3577 6856 8246 7670 3274 ...
 $ genres : Factor w/ 902 levels "Action","Action|Adventure",..: 328 393 686 645 595 241 686 376 1 123 ...


In [4]:
# Preview the first rows of the movies table.
head(movies_df)

movieId,title,genres
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy
6,Heat (1995),Action|Crime|Thriller


In [5]:
# Read the ratings table (userId, movieId, rating, timestamp) from the CSV file.
ratings_df <- read.csv(file = "data/ratings.csv", header = TRUE)

In [6]:
# Show the structure (column names and types) of the ratings table.
str(ratings_df)

'data.frame':	100004 obs. of  4 variables:
 $ userId   : int  1 1 1 1 1 1 1 1 1 1 ...
 $ movieId  : int  31 1029 1061 1129 1172 1263 1287 1293 1339 1343 ...
 $ rating   : num  2.5 3 3 2 4 2 2 2 3.5 2 ...
 $ timestamp: int  1260759144 1260759179 1260759182 1260759185 1260759205 1260759151 1260759187 1260759148 1260759125 1260759131 ...


In [7]:
# Preview the first rows of the ratings table.
head(ratings_df)

userId,movieId,rating,timestamp
1,31,2.5,1260759144
1,1029,3.0,1260759179
1,1061,3.0,1260759182
1,1129,2.0,1260759185
1,1172,4.0,1260759205
1,1263,2.0,1260759151


## Clean and transform the data
Before going on, we remove the fields we do not use and make sure we only work with movies that have been rated.

In [8]:
# The rating timestamp is not used in this analysis, so remove the column.
ratings_df[["timestamp"]] <- NULL

In [10]:
# count the distinct users that appear in the ratings table
nr_users <- sum(!duplicated(ratings_df[["userId"]]))

In [11]:
# collect the distinct movie ids found in movies_df and in ratings_df
movieIds <- unique(movies_df[["movieId"]]) #9125
ratingmovieIds <- unique(ratings_df[["movieId"]]) #9066

In [12]:
# Drop the movies that nobody rated.
# Logical indexing is used on purpose: x[-which(cond), ] would return zero
# rows whenever which() finds nothing (i.e. when every movie has a rating),
# and it would also rely on positions in the unique-id vector matching rows.
movies_df <- movies_df[movies_df$movieId %in% ratingmovieIds, ]

In [13]:
# Check the dimensions of the filtered movies table and preview its first rows.
dim(movies_df); head(movies_df)

movieId,title,genres
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy
6,Heat (1995),Action|Crime|Thriller


In [14]:
# Put the pipe-separated genre strings in a one-column data frame
# (later code accesses that column by position, as genres[,1]).
genres <- as.data.frame(movies_df$genres, stringsAsFactors = FALSE)
# Preview the first rows.
head(genres)

movies_df$genres
Adventure|Animation|Children|Comedy|Fantasy
Adventure|Children|Fantasy
Comedy|Romance
Comedy|Drama|Romance
Comedy
Action|Crime|Thriller


In [15]:
# Load data.table for tstrsplit(), then open its help page.
library(data.table)
help("tstrsplit")
# Toy example: run strsplit() and tstrsplit() on the same input to compare
# the shape of what each one returns.
x <- c("abcde", "ghij", "klmnopq")
strsplit(x, split = "", fixed = TRUE)
tstrsplit(x, "", fixed = TRUE)

In [16]:
# Split each pipe-separated genre string into one column per genre slot;
# movies with fewer genres than the longest list are padded with missing
# values in the trailing columns.
genres <- as.data.frame(tstrsplit(genres[, 1],
                                  "[|]",
                                  type.convert = TRUE),
                        stringsAsFactors = FALSE)

# Name the columns 1..k after however many slots the split produced, rather
# than assuming that there are exactly 10 of them.
colnames(genres) <- seq_len(ncol(genres))

In [17]:
# Check the dimensions of the split genre table and preview its first rows.
dim(genres)
head(genres)

1,2,3,4,5,6,7,8,9,10
Adventure,Animation,Children,Comedy,Fantasy,,,,,
Adventure,Children,Fantasy,,,,,,,
Comedy,Romance,,,,,,,,
Comedy,Drama,Romance,,,,,,,
Comedy,,,,,,,,,
Action,Crime,Thriller,,,,,,,


In [19]:
# Genres used as binary features of a movie; they become the columns of the
# genre matrix.
genre_list <- c("Action", "Adventure",
                "Animation", "Children",
                "Comedy", "Crime",
                "Documentary", "Drama",
                "Fantasy", "Film-Noir",
                "Horror", "Musical",
                "Mystery", "Romance",
                "Sci-Fi", "Thriller",
                "War", "Western")

# Zero-filled matrix with one row per rated movie and one column per genre.
# Both dimensions are derived from the data, so the matrix stays consistent
# if genre_list is edited, and the column names are attached at construction.
genre_matrix <- matrix(0,
                       nrow = nrow(genres),
                       ncol = length(genre_list),
                       dimnames = list(NULL, genre_list))
head(genre_matrix)

Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [20]:
# For each movie (row) set to 1 the columns of the genres it belongs to.
# The work is done one genre at a time on whole columns instead of visiting
# every (movie, slot) cell; this also avoids 1:nrow(), which misbehaves on
# empty input, and a loop variable named `c`, which shadows base::c.
genre_slots <- as.matrix(genres)  # character matrix of the split genre labels
for (genre in genre_list) {
  # movies having this genre in any slot; missing slots never match
  has_genre <- rowSums(genre_slots == genre, na.rm = TRUE) > 0
  # labels that are not in genre_list never get a column and are ignored
  genre_matrix[has_genre, genre] <- 1
}

In [21]:
# Preview the first three rows of the filled genre matrix.
head(genre_matrix, 3)

Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0
0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0


In [22]:
# turn the matrix into a data frame, keeping the genre names as column names
genre_matrix <- as.data.frame(genre_matrix, stringsAsFactors = FALSE)

# store every 0/1 indicator column as integer rather than double
genre_matrix[] <- lapply(genre_matrix, as.integer)

It is possible to include several attributes and to place higher weights on the attributes considered more important. This could be done with methods such as the **Term Frequency–Inverse Document Frequency (TF-IDF)** algorithm, but given the time at our disposal we are not covering them.

In [30]:
# work on a copy so that ratings_df keeps the original scores
binaryratings <- ratings_df

# recode the rating column (the third one): scores above 3 become 1,
# every other score becomes -1
binaryratings[["rating"]] <- ifelse(binaryratings[["rating"]] > 3, 1, -1)

In [31]:
# Preview the recoded ratings.
head(binaryratings)

userId,movieId,rating
1,31,-1
1,1029,-1
1,1061,-1
1,1129,-1
1,1172,1
1,1263,-1


In [32]:
# Reshape to wide form: one row per movieId and one column per userId, each
# cell holding the feedback that user gave to that movie.
# data.table's dcast() method is written for data.tables, so the data frame
# is converted explicitly instead of relying on a fallback for plain data
# frames.
binaryratings <- dcast(as.data.table(binaryratings), movieId ~ userId,
                       value.var = "rating")

# back to a plain data frame, which is how the following cells index it
binaryratings <- as.data.frame(binaryratings)

# movie/user pairs without a rating come out as NA; replace them by zero
binaryratings[is.na(binaryratings)] <- 0

# NOTE(review): later cells pair row i of this table with row i of
# genre_matrix / movies_df; presumably both are ordered by movieId - verify.
dim(binaryratings)
head(binaryratings)


movieId,1,2,3,4,5,6,7,8,9,⋯,662,663,664,665,666,667,668,669,670,671
1,0,0,0,0,0,0,-1,0,1,⋯,0,1,1,0,0,0,0,0,1,1
2,0,0,0,0,0,0,0,0,0,⋯,1,0,0,-1,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,⋯,0,0,0,-1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,⋯,0,0,0,-1,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,⋯,0,0,1,0,1,1,0,0,0,0


In [33]:
# remove movieIds col. Rows are movieIds, cols are userIds
binaryratings = binaryratings[,-1] 
head(binaryratings)

1,2,3,4,5,6,7,8,9,10,⋯,662,663,664,665,666,667,668,669,670,671
0,0,0,0,0,0,-1,0,1,0,⋯,0,1,1,0,0,0,0,0,1,1
0,0,0,0,0,0,0,0,0,0,⋯,1,0,0,-1,0,0,0,0,0,0
0,0,0,0,1,0,0,0,0,0,⋯,0,0,0,-1,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,-1,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,⋯,0,0,1,0,1,1,0,0,0,0


In [34]:
# Show the dimensions and first rows of the genre matrix and of the movies table together.
dim(genre_matrix); head(genre_matrix, 3); dim(movies_df); head(movies_df,3)

Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0
0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0


movieId,title,genres
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance


In [35]:
# Build the user profiles: for every genre and user, sum the user's feedback
# over the movies of that genre (a dot product), then binarise the sum.
# crossprod(A, B) computes t(A) %*% B, i.e. all genre-by-user dot products at
# once, so the sizes follow the data instead of being hard-coded.
genre_scores <- crossprod(as.matrix(genre_matrix), as.matrix(binaryratings))

# 0 when the summed feedback is negative, 1 otherwise.
# NOTE(review): a sum of exactly 0 also maps to 1, which includes genres the
# user gave no feedback on - confirm that this is intended.
result <- unname(ifelse(genre_scores < 0, 0, 1))

# one row per user, one column per genre
result <- t(result)

dim(result)

In [36]:
result[1,] # genre-preference vector (taste profile) of user 1

# Recommend a movie to the first user

In [37]:
# Profile of the first user as a one-row data frame of integer genre flags
user.1 <- as.data.frame(matrix(as.integer(result[1, ]), nrow = 1))
names(user.1) <- genre_list # label each flag with its genre
print(user.1)
print(t(user.1)) # same profile, one genre per row

  Action Adventure Animation Children Comedy Crime Documentary Drama Fantasy
1      0         0         0        0      0     1           1     0       0
  Film-Noir Horror Musical Mystery Romance Sci-Fi Thriller War Western
1         1      1       0       1       1      0        0   0       0
            [,1]
Action         0
Adventure      0
Animation      0
Children       0
Comedy         0
Crime          1
Documentary    1
Drama          0
Fantasy        0
Film-Noir      1
Horror         1
Musical        0
Mystery        1
Romance        1
Sci-Fi         0
Thriller       0
War            0
Western        0


In [39]:
# proxy supplies the dist() used in the next cell with two data sets.
library(proxy) # to compute distance

In [40]:
# Jaccard distance between every movie's genre vector (rows of genre_matrix)
# and the first user's profile, using the dist() that proxy provides
sim_results <- proxy::dist(x = genre_matrix, y = user.1, method = "Jaccard")

In [43]:
# Preview the first movie-to-user distances.
head(sim_results)

In [44]:
# positions of the movies whose distance to the user profile is the smallest
# (several movies can tie for the minimum)
smallest_distance <- min(sim_results)
closest.movie <- which(sim_results == smallest_distance)
head(closest.movie)

In [45]:
# Titles of the recommended movies, i.e. the ones closest to the user profile
movies_df[closest.movie, "title"]