# Collaborative Filtering - Intuition

In [1]:
import numpy as np
import pandas as pd

In [3]:
ratings = pd.read_csv("data/ratings_raw.csv")

In [4]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


## Explicit Prediction Rating

In [5]:
items = pd.read_csv("data/items.csv")
users = pd.read_csv("data/users.csv")
ratings = pd.read_csv("data/ratings.csv")

In [6]:
items.head()

Unnamed: 0,movie_id,title,genre_unknown,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,SciFi,Thriller,War,Western,year,overview,original_language,runtime,vote_average,vote_count
0,1,Toy Story (1995),0,0,0,1,1,1,0,0,...,0,0,0,0,1995.0,"Led by Woody, Andy's toys live happily in his ...",en,81.0,7.9,10878.0
1,2,GoldenEye (1995),0,1,1,0,0,0,0,0,...,0,1,0,0,1995.0,James Bond must unmask the mysterious head of ...,en,130.0,6.8,2037.0
2,3,Four Rooms (1995),0,0,0,0,0,0,0,0,...,0,1,0,0,1995.0,It's Ted the Bellhop's first night on the job....,en,98.0,6.1,1251.0
3,4,Get Shorty (1995),0,1,0,0,0,1,0,0,...,0,0,0,0,1995.0,Chili Palmer is a Miami mobster who gets sent ...,en,105.0,6.5,501.0
4,5,Copycat (1995),0,0,0,0,0,0,1,0,...,0,1,0,0,1995.0,An agoraphobic psychologist and a female detec...,en,124.0,6.5,424.0


## Small Data

| movie_id | title                 |
| -------: | :-------------------- |
|      1   | Toy Story (1995)      |
|     71   | Lion King, The (1994) |
|     95   | Aladdin (1992)        |
|     50   | Star Wars (1972)      |
|    176   | Aliens (1986)         |
|     82   | Jurassic Park (1993)  |



In [54]:
sample_movie_id = [1, 71, 95, 50, 176, 82]
sample_user_id = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]

In [55]:
sample_items = items[items.movie_id.isin(sample_movie_id)]
sample_users = users[users.user_id.isin(sample_user_id)]

In [56]:
sample_items.head()

Unnamed: 0,movie_id,title,genre_unknown,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,SciFi,Thriller,War,Western,year,overview,original_language,runtime,vote_average,vote_count
0,1,Toy Story (1995),0,0,0,1,1,1,0,0,...,0,0,0,0,1995.0,"Led by Woody, Andy's toys live happily in his ...",en,81.0,7.9,10878.0
49,50,Star Wars (1977),0,1,1,0,0,0,0,0,...,1,0,1,0,1977.0,Princess Leia is captured and held hostage by ...,en,121.0,8.2,11881.0
71,71,"Lion King, The (1994)",0,0,0,1,1,0,0,0,...,0,0,0,0,1994.0,A young lion cub named Simba can't wait to be ...,en,89.0,8.2,10814.0
83,82,Jurassic Park (1993),0,1,1,0,0,0,0,0,...,1,0,0,0,1993.0,A wealthy entrepreneur secretly creates a them...,en,127.0,7.9,9416.0
96,95,Aladdin (1992),0,0,0,1,1,1,0,0,...,0,0,0,0,1992.0,Princess Jasmine grows tired of being forced t...,en,90.0,7.6,6812.0


In [10]:
sample_users

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213
5,6,42,M,executive,98101
6,7,57,M,administrator,91344
7,8,36,M,administrator,5201
8,9,29,M,student,1002
9,10,53,M,lawyer,90703


In [11]:
rating_collect = ratings.user_id.isin(sample_user_id) & ratings.movie_id.isin(sample_movie_id)

In [49]:
sample = ratings[rating_collect]

In [50]:
sample.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
1052,2,50,5,888552084
1090,8,50,5,879362124
1333,5,1,4,875635748
3672,6,95,2,883602133
4280,1,82,5,878542589


In [17]:
import altair as alt

In [53]:
len(sample.user_id.unique())

14

In [52]:
pd.crosstab(index=sample.user_id, columns=sample.movie_id, values=sample.rating, aggfunc="mean")

movie_id,1,50,71,82,95,176
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,5.0,5.0,3.0,5.0,4.0,5.0
2,4.0,5.0,,,,
4,,5.0,,,,
5,4.0,4.0,,,4.0,3.0
6,4.0,4.0,4.0,,2.0,
7,,5.0,5.0,3.0,,3.0
8,,5.0,,5.0,,5.0
9,,5.0,,,,
10,4.0,5.0,,4.0,,4.0
11,,,,,,3.0


In [43]:
from keras.models import Model
from keras.layers import Input, Embedding, Flatten, dot

In [57]:
n_items = 6
n_users = 14
n_latent_factors = 2

In [60]:
item_input = Input(shape=[1], name='Item')
item_embedding = Embedding(n_items + 1, n_latent_factors, name='Item-Embedding')(item_input)
item_vec = Flatten(name='FlattenItems')(item_embedding)

user_input = Input(shape=[1], name='User')
user_embedding = Embedding(n_users + 1, n_latent_factors, name='User-Embedding')(user_input)
user_vec = Flatten(name='FlattenUsers')(user_embedding)

prod = dot([item_vec, user_vec], axes=1, name='DotProduct')
model = Model([user_input, item_input], prod)
model.compile('adam', 'mean_squared_error')

In [62]:
sample.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
1052,2,50,5,888552084
1090,8,50,5,879362124
1333,5,1,4,875635748
3672,6,95,2,883602133
4280,1,82,5,878542589


In [61]:
history = model.fit([sample.movie_id, sample.user_id], sample.rating, epochs=10, verbose=0)

InvalidArgumentError: indices[0,0] = 176 is not in [0, 15)
	 [[{{node User-Embedding_9/embedding_lookup}}]]