In [1]:
# Install/upgrade the packages this notebook relies on.
# NOTE(review): versions are not pinned, so a re-run may pick up different releases.
!pip install fastai --upgrade
!pip install nbdev
!pip install azure-cognitiveservices-search-imagesearch
# Wildcard import from a `utils` module whose contents are not shown in this notebook.
from utils import *

Collecting fastai
[?25l  Downloading https://files.pythonhosted.org/packages/d4/cf/9140964d3544d904cc718e519e1632a8a42e1eaaf2fafdaaa14716c42336/fastai-2.0.10-py3-none-any.whl (354kB)
[K     |█                               | 10kB 25.9MB/s eta 0:00:01[K     |█▉                              | 20kB 3.0MB/s eta 0:00:01[K     |██▊                             | 30kB 3.6MB/s eta 0:00:01[K     |███▊                            | 40kB 4.0MB/s eta 0:00:01[K     |████▋                           | 51kB 3.5MB/s eta 0:00:01[K     |█████▌                          | 61kB 3.8MB/s eta 0:00:01[K     |██████▌                         | 71kB 4.2MB/s eta 0:00:01[K     |███████▍                        | 81kB 4.4MB/s eta 0:00:01[K     |████████▎                       | 92kB 4.7MB/s eta 0:00:01[K     |█████████▎                      | 102kB 4.7MB/s eta 0:00:01[K     |██████████▏                     | 112kB 4.7MB/s eta 0:00:01[K     |███████████                     | 122kB 4.7MB/s eta 0:0

In [2]:
# Wildcard imports: later cells use names such as `pd`, `np`, `torch`, `nn`, `Module`,
# `Embedding` and `Learner` without importing them explicitly, so they presumably come from here.
from fastai.collab import *
from fastai.tabular.all import *
# Locate (presumably download and extract) the dataset referenced by `URLs.ML_100k`;
# `path` is used below as the folder holding `u.data` and `u.item`.
path = untar_data(URLs.ML_100k)

In [3]:
# Read the tab-separated ratings file. It is loaded with header=None,
# so the four column names are supplied here.
ratings = pd.read_csv(
    path/'u.data',
    sep='\t',
    names=['user', 'movie', 'rating', 'timestamp'],
    header=None,
)

In [4]:
# Hand-made latent factors for one movie; the three components score, in order:
# science fiction, action, old movies
last_skywalker = np.array([0.98, 0.9, -0.9])

In [5]:
# Hand-made preferences for one user, on the same three axes
# (science fiction, action, old movies).
user1 = np.array([0.9, 0.8, -0.6])

In [6]:
# Score the match between the movie and the user as the dot product of their factor vectors.
np.dot(last_skywalker, user1)

2.1420000000000003

In [7]:
# Dot product written out by hand: element-wise multiply, then sum.
# Gives the same value as np.dot on the same two vectors.
(user1*last_skywalker).sum()

2.1420000000000003

In [8]:
# Hand-made factors for a second movie, on the same three axes
# (science fiction, action, old movies).
casablanca = np.array([-0.99, -0.3, 0.8])

In [9]:
# Match between user1 and casablanca (the result is negative, i.e. a poor match)
user1.dot(casablanca)

-1.611

We don't know the latent factors.

## Creating the DataLoaders

In [10]:
# Read the pipe-separated item file, keeping only its first two columns
# and naming them `movie` and `title`.
movies = pd.read_csv(
    path/'u.item',
    sep='|',
    encoding='latin-1',
    header=None,
    usecols=[0, 1],
    names=['movie', 'title'],
)
# movies.head()  # uncomment to preview the frame

In [11]:
# Attach movie titles to the ratings; with no explicit key the join uses
# the column names the two frames have in common.
ratings = pd.merge(ratings, movies)
# ratings.head()  # uncomment to preview the merged frame

In [12]:
# Build DataLoaders from the ratings frame with batch size 64, using `title` as the item column,
# then display a sample batch.
dls = CollabDataLoaders.from_df(ratings, item_name='title', bs=64)
dls.show_batch()

Unnamed: 0,user,title,rating
0,542,My Left Foot (1989),4
1,422,Event Horizon (1997),3
2,311,"African Queen, The (1951)",4
3,595,Face/Off (1997),4
4,617,Evil Dead II (1987),1
5,158,Jurassic Park (1993),5
6,836,Chasing Amy (1997),3
7,474,Emma (1996),3
8,466,Jackie Chan's First Strike (1996),3
9,554,Scream (1996),3


In [13]:
# Category vocabularies per column; in the output both start with an '#na#' entry.
dls.classes

{'title': (#1665) ['#na#',"'Til There Was You (1997)",'1-900 (1994)','101 Dalmatians (1996)','12 Angry Men (1957)','187 (1997)','2 Days in the Valley (1996)','20,000 Leagues Under the Sea (1954)','2001: A Space Odyssey (1968)','3 Ninjas: High Noon At Mega Mountain (1998)'...],
 'user': (#944) ['#na#',1,2,3,4,5,6,7,8,9...]}

In [14]:
n_users = len(dls.classes['user']) # num user classes
n_movies = len(dls.classes['title']) # num title classes
n_factors = 5 # length of each latent-factor vector in this demo

# randomly generated latent factors
user_factors = torch.randn(n_users, n_factors)
movie_factors = torch.randn(n_movies, n_factors)

# We can represent look up in an index as a matrix product!
# Multiplying the transposed factor matrix by a one-hot vector for index 3
# yields the same values as indexing `user_factors[3]` directly.
one_hot_3 = one_hot(3, n_users).float()
user_factors.t() @ one_hot_3

tensor([-0.4586, -0.9915, -0.4052, -0.3621, -0.5908])

In [15]:
# Direct indexing returns the same row as the one-hot matrix product.
user_factors[3]

tensor([-0.4586, -0.9915, -0.4052, -0.3621, -0.5908])

In [16]:
# The one-hot vector itself: in the output shown, a 1. at index 3 and 0. elsewhere.
one_hot_3

tensor([0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,


In [17]:
# The one-hot vector was built with length n_users.
len(one_hot_3)

944

In [18]:
# Same lookup as before, shown again for reference.
user_factors[3]

tensor([-0.4586, -0.9915, -0.4052, -0.3621, -0.5908])

## Questions and Answers
* We are not working with sparse data in collaborative filtering.
* In practice, we tune the number of latent factors.

## Collaborative Filtering from Scratch

In [19]:
class Example:
  """Tiny class used to illustrate constructors, attributes and methods."""

  def __init__(self, a):
    """Remember `a` on the instance so `say` can use it later."""
    self.a = a

  def say(self, x):
    """Return a greeting that combines the stored value with `x`."""
    greeting = f'Hello {self.a}, {x}'
    return greeting

In [20]:
# Instantiate the class, then call its method; the cell displays the returned string.
ex = Example('Sylvain')
ex.say('nice to meet you')

'Hello Sylvain, nice to meet you'

In [21]:
class DotProduct(Module):
  """Scores a (user, movie) pair by the dot product of their embedding vectors."""

  def __init__(self, n_users, n_movies, n_factors, y_range=(0, 5.5)):
    # One vector of length `n_factors` per user index and per movie index.
    self.user_factors = Embedding(n_users, n_factors)
    self.movie_factors = Embedding(n_movies, n_factors)
    # Bounds handed to `sigmoid_range` in `forward`.
    self.y_range = y_range

  def forward(self, x):
    """Column 0 of `x` indexes the user embedding, column 1 the movie embedding."""
    user_vecs = self.user_factors(x[:,0])
    movie_vecs = self.movie_factors(x[:,1])
    # Element-wise product summed over the factor axis, i.e. a per-row dot product.
    scores = (user_vecs * movie_vecs).sum(dim=1)
    return sigmoid_range(scores, *self.y_range)

In [22]:
# Train the plain dot-product model with 50 latent factors and MSE loss;
# fit_one_cycle is called with 5 epochs and a learning-rate argument of 5e-3.
model = DotProduct(n_users, n_movies, 50)
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(5, 5e-3)

epoch,train_loss,valid_loss,time
0,1.005721,0.999734,00:09
1,0.885945,0.905953,00:09
2,0.693833,0.876002,00:09
3,0.484503,0.874067,00:09
4,0.369077,0.877741,00:09


In [23]:
class DotProductBias(Module):
  """Dot-product model with an extra bias term per user and per movie."""

  def __init__(self, n_users, n_movies, n_factors, y_range=(0, 5.5)):
    # Factor vectors of length `n_factors`, plus a width-1 embedding acting as a bias.
    self.user_factors = Embedding(n_users, n_factors)
    self.user_bias = Embedding(n_users, 1)
    self.movie_factors = Embedding(n_movies, n_factors)
    self.movie_bias = Embedding(n_movies, 1)
    # Bounds handed to `sigmoid_range` in `forward`.
    self.y_range = y_range

  def forward(self, x):
    """Column 0 of `x` indexes the user tables, column 1 the movie tables."""
    user_ids, movie_ids = x[:,0], x[:,1]
    user_vecs = self.user_factors(user_ids)
    movie_vecs = self.movie_factors(movie_ids)
    # keepdim=True keeps a trailing axis of size 1 so the width-1 biases add directly.
    interaction = (user_vecs * movie_vecs).sum(dim=1, keepdim=True)
    bias = self.user_bias(user_ids) + self.movie_bias(movie_ids)
    return sigmoid_range(interaction + bias, *self.y_range)

In [24]:
# Same training setup as the plain dot-product model, now with the bias terms included.
model = DotProductBias(n_users, n_movies, 50)
learn = Learner(dls, model, loss_func=MSELossFlat()) # generic learner
learn.fit_one_cycle(5, 5e-3)

epoch,train_loss,valid_loss,time
0,0.928226,0.941926,00:09
1,0.821354,0.864699,00:09
2,0.61655,0.869891,00:09
3,0.410764,0.890642,00:09
4,0.292861,0.897089,00:09


In [25]:
# Grab one batch to inspect tensor shapes: per the output, x has 2 columns per row and y has 1.
x, y = dls.one_batch()
x.shape, y.shape

(torch.Size([64, 2]), torch.Size([64, 1]))

## Weight Decay

In [26]:
# Fresh model, same schedule as before, now passing weight decay wd=0.1.
model = DotProductBias(n_users, n_movies, 50)
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(5, 5e-3, wd=0.1)

epoch,train_loss,valid_loss,time
0,0.935395,0.949589,00:10
1,0.877538,0.875935,00:10
2,0.716325,0.835029,00:10
3,0.603237,0.822503,00:09
4,0.476814,0.823543,00:09


In [27]:
class T(Module):
  def __init__(self):
    # A plain tensor attribute: the next cell shows it is not reported by parameters().
    self.a = torch.ones(3)

In [28]:
# List the parameters of a fresh T; the result is empty.
L(T().parameters())

(#0) []

In [29]:
class T(Module):
  def __init__(self):
    # Wrapping the tensor in nn.Parameter makes it show up in parameters().
    self.a = nn.Parameter(torch.ones(3))

L(T().parameters())

(#1) [Parameter containing:
tensor([1., 1., 1.], requires_grad=True)]

In [30]:
class T(Module):
  def __init__(self):
    # A layer attribute also contributes parameters: the output lists this Linear's weight.
    self.a = nn.Linear(1, 3, bias=False)

t = T()
L(t.parameters())

(#1) [Parameter containing:
tensor([[-0.1875],
        [ 0.0791],
        [-0.3082]], requires_grad=True)]

In [31]:
# The layer's weight is itself an nn.Parameter.
type(t.a.weight)

torch.nn.parameter.Parameter

We can create a tensor as a parameter, with random initialization, like so:

In [32]:
def create_params(size):
  """Return an `nn.Parameter` of shape `size`, filled in place by `normal_(0, 0.01)`."""
  weights = torch.zeros(*size)
  # Overwrite the zeros with draws from a normal distribution (mean 0, std 0.01).
  weights.normal_(0, 0.01)
  return nn.Parameter(weights)

Let's use this to create `DotProductBias` again, but without `Embedding`

In [33]:
class DotProductBias(Module):
  """Dot-product-plus-bias model built from raw parameters instead of `Embedding` layers."""

  def __init__(self, n_users, n_movies, n_factors, y_range=(0,5.5)):
    # Factor tables are 2-D (one row per index); bias tables are 1-D (one scalar per index).
    self.user_factors = create_params([n_users, n_factors])
    self.user_bias = create_params([n_users])
    self.movie_factors = create_params([n_movies, n_factors])
    self.movie_bias = create_params([n_movies])
    # Bounds handed to `sigmoid_range` in `forward`.
    self.y_range = y_range

  def forward(self, x):
    """Column 0 of `x` indexes the user tables, column 1 the movie tables."""
    user_ids, movie_ids = x[:,0], x[:,1]
    # Plain tensor indexing plays the role of the embedding lookup.
    user_vecs = self.user_factors[user_ids]
    movie_vecs = self.movie_factors[movie_ids]
    interaction = (user_vecs*movie_vecs).sum(dim=1)
    bias = self.user_bias[user_ids] + self.movie_bias[movie_ids]
    return sigmoid_range(interaction + bias, *self.y_range)

In [34]:
# Train again, this time with the parameter-based DotProductBias defined in the previous cell
model = DotProductBias(n_users, n_movies, 50)
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(5, 5e-3, wd=0.1)

epoch,train_loss,valid_loss,time
0,0.916084,0.944496,00:10
1,0.874252,0.873762,00:10
2,0.75207,0.829019,00:10
3,0.587466,0.820674,00:10
4,0.489055,0.821463,00:10


## Interpreting embeddings and biases

In [35]:
# Read the learned per-movie bias from the trained model.
movie_bias = learn.model.movie_bias.squeeze()
# argsort is ascending by default, so the first five indices have the smallest bias.
idxs = movie_bias.argsort()[:5]
[dls.classes['title'][i] for i in idxs]

# We print out the 5 movies with the smallest bias
# People liked these movies the least

['Children of the Corn: The Gathering (1996)',
 'Lawnmower Man 2: Beyond Cyberspace (1996)',
 'Crow: City of Angels, The (1996)',
 'Robocop 3 (1993)',
 'Beautician and the Beast, The (1997)']

In [36]:
# movies with highest bias (descending sort, first five)
idxs = movie_bias.argsort(descending=True)[:5]
[dls.classes['title'][i] for i in idxs]

['L.A. Confidential (1997)',
 "Schindler's List (1993)",
 'Titanic (1997)',
 'Silence of the Lambs, The (1991)',
 'Star Wars (1977)']

## Using fastai.collab

In [37]:
# Use the library's collab_learner, configured like the hand-written model:
# 50 factors and the same y_range.
learn = collab_learner(dls, n_factors=50, y_range=(0,5.5))

In [38]:
# Same schedule and weight decay as the hand-written model, for a like-for-like comparison.
learn.fit_one_cycle(5, 5e-3, wd=0.1)

epoch,train_loss,valid_loss,time
0,0.930385,0.954203,00:10
1,0.870293,0.882454,00:09
2,0.730028,0.837537,00:09
3,0.593131,0.824676,00:09
4,0.488595,0.824378,00:09


In [39]:
# Names of layers: displaying the model prints each layer with its name and sizes
learn.model

EmbeddingDotBias(
  (u_weight): Embedding(944, 50)
  (i_weight): Embedding(1665, 50)
  (u_bias): Embedding(944, 1)
  (i_bias): Embedding(1665, 1)
)

In [40]:
# Same ranking as before; in this model the item bias is the `i_bias` embedding
# (see the model printout), so its values are read from `.weight`.
movie_bias = learn.model.i_bias.weight.squeeze()
idxs = movie_bias.argsort(descending=True)[:5]
[dls.classes['title'][i] for i in idxs]

['Titanic (1997)',
 'Shawshank Redemption, The (1994)',
 'Silence of the Lambs, The (1991)',
 'L.A. Confidential (1997)',
 "Schindler's List (1993)"]

## Embedding distance (or distance between 2 movies)



In [41]:
movie_factors = learn.model.i_weight.weight
# Look up the vocabulary index of the reference title.
idx = dls.classes['title'].o2i['Silence of the Lambs, The (1991)']
# Cosine similarity between every movie's factor vector and the reference movie's vector
# (`[None]` adds a leading axis so the single vector lines up against the whole matrix).
distances = nn.CosineSimilarity(dim=1)(movie_factors, movie_factors[idx][None])
# Despite the variable name, larger values mean more similar, hence the descending sort.
# Position 0 is presumably the reference movie itself, so position 1 is taken.
idx = distances.argsort(descending=True)[1]
dls.classes['title'][idx]

'Farewell to Arms, A (1932)'

## Deep Learning for Collaborative Filtering

In [42]:
# Embedding sizes returned by get_emb_sz for these DataLoaders; the output shows one
# (rows, width) pair for users and one for titles.
embs = get_emb_sz(dls)
embs

[(944, 74), (1665, 102)]

In [46]:
class CollabNN(Module):
  """Feeds concatenated user and item embeddings through a small fully connected network."""

  def __init__(self, user_sz, item_sz, y_range=(0, 5.5), n_act=100):
    # `user_sz` and `item_sz` are unpacked straight into `Embedding`.
    self.user_factors = Embedding(*user_sz)
    self.item_factors = Embedding(*item_sz)
    # The first linear layer takes both embeddings side by side, so its input
    # width is the sum of the two embedding widths.
    n_in = user_sz[1] + item_sz[1]
    self.layers = nn.Sequential(
        nn.Linear(n_in, n_act),
        nn.ReLU(),
        nn.Linear(n_act, 1))
    # Bounds handed to `sigmoid_range` in `forward`.
    self.y_range = y_range

  def forward(self, x):
    """Column 0 of `x` indexes the user embedding, column 1 the item embedding."""
    user_emb = self.user_factors(x[:,0])
    item_emb = self.item_factors(x[:,1])
    joined = torch.cat((user_emb, item_emb), dim=1)
    out = self.layers(joined)
    return sigmoid_range(out, *self.y_range)

In [47]:
# Unpack the two size pairs from `embs` as `user_sz` and `item_sz`.
model = CollabNN(*embs)

In [48]:
# Train the neural-network model; note the weight decay here is 0.01, not the 0.1 used earlier.
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(5, 5e-3, wd=0.01)

epoch,train_loss,valid_loss,time
0,0.920601,0.956213,00:10
1,0.887448,0.903132,00:10
2,0.858648,0.88577,00:10
3,0.789517,0.870664,00:10
4,0.77152,0.874795,00:10


In [49]:
# Create more layers
# Let the library build the network (use_nn=True); `layers=[100,50]` presumably gives
# the hidden-layer sizes.
learn = collab_learner(dls, use_nn=True, y_range=(0, 5.5), layers=[100,50])
learn.fit_one_cycle(5, 5e-3, wd=0.1)

epoch,train_loss,valid_loss,time
0,1.001383,0.969772,00:11
1,0.913566,0.918928,00:12
2,0.880069,0.900082,00:11
3,0.815012,0.865993,00:12
4,0.779292,0.865258,00:11


## Questionnaire

What problem does collaborative filtering solve? It solves the problem of recommending products/items to users.

How does it solve it? It looks at other items the user liked/used, finds other users with similar product preferences, and recommends other items those users have used or liked.

Why might a collaborative filtering predictive model fail to be a very useful recommendation system?

What does a crosstab representation of collaborative filtering data look like? Users on the left axis, items on the top axis, and ratings in the 2-d table.

Write the code to create a crosstab representation of the MovieLens data (you might need to do some web searching!). **[skipped]**

What is a latent factor? Why is it “latent”? These are the underlying concepts of movies that describe user tastes and movie characteristics. They are latent because they are not explicitly defined.

What is a dot product? Calculate a dot product manually using pure Python with lists. It is multiplying elements of two vectors together and summing the result. See code below.

What does pandas.DataFrame.merge do? It joins two DataFrames based on a common column.

What is an embedding matrix? It is the thing we multiply the one-hot-encoded matrix by.

What is the relationship between an embedding and a matrix of one-hot-encoded vectors? Embedding = one-hot-encoded vector * embedding matrix.

Why do we need Embedding if we could use one-hot-encoded vectors for the same thing?

What does an embedding contain before we start training (assuming we’re not using a pretrained model)?

Create a class (without peeking, if possible!) and use it. **[skipped]**

What does x[:,0] return? It returns the first column (all rows)

Rewrite the DotProduct class (without peeking, if possible!) and train a model with it. **[skipped]**

What is a good loss function to use for MovieLens? Why? MSE Loss, as it is a good loss function for regression problems.

What would happen if we used cross-entropy loss with MovieLens? How would we need to change the model? If we used cross-entropy loss, we would need to convert our labels to categories (e.g., Good, Okay, Bad).

What is the use of bias in a dot product model? It helps us show which users are more or less negative in their recommendations than others.

What is another name for weight decay? L2 regularization

Write the equation for weight decay (without peeking!).
```
loss_with_wd = loss + wd*(parameters**2).sum()
```

Write the equation for the gradient of weight decay. Why does it help reduce weights?
```
parameters.grad += wd * 2 * parameters
```

Why does reducing weights lead to better generalization? It reduces chances we will memorize the trained data.

What does argsort do in PyTorch? Return indices that sort a tensor along a given dimension in ascending order by value.

Does sorting the movie biases give the same result as averaging overall movie ratings by movie? Why/why not? No. Biases give us which movies were highest or lowest ranked, irrespective of how well it matched the users. Overall movie ratings take all bias and user factors into account.

How do you print the names and details of the layers in a model? `learn.model`

What is the “bootstrapping problem” in collaborative filtering? How to generate useful recommendations for a new user.

How could you deal with the bootstrapping problem for new users? For new movies? You can ask them to fill out info on their preferences.

How can feedback loops impact collaborative filtering systems? A small number of zealous users can influence overall rankings.

When using a neural network in collaborative filtering, why can we have different numbers of factors for movies and users? This is because we are concatenating embedding matrices instead of taking the dot product.

Why is there an nn.Sequential in the CollabNN model? It allows us to chain NN layers together.

What kind of model should we use if we want to add metadata about users and items, or information such as date and time, to a collaborative filtering model? We should use a tabular model.

In [55]:
# Dot product with plain Python lists: multiply matching elements, then add them up.

list1 = [1,2,3]
list2 = [4,5,6]
sum(a*b for a, b in zip(list1, list2))

32