## Imports

In [1]:
from IPython.core.interactiveshell import InteractiveShell

# Set shell to show all lines of output
InteractiveShell.ast_node_interactivity = 'all'

In [11]:
from time import time
import math
import heapq  # for retrieval topK
import pickle

import pandas as pd
import numpy as np
import scipy.sparse as sp
np.random.seed(42)

from sklearn.manifold import TSNE
from umap import UMAP
import matplotlib.pyplot as plt

# PyTorch imports
import torch
import torch.nn.functional as F
from torch.autograd import Variable
from torch import nn
from torch.utils.data import Dataset, DataLoader
torch.manual_seed(0)

import warnings
warnings.filterwarnings('ignore')

## Config

In [12]:
PATH = '../input/movies-dataset'
INPUT_PATH = '../input/movies-dataset/user_ratedmovies-timestamps.dat'
OUTPUT_PATH_TRAIN = 'movielens_train_rating'
OUTPUT_PATH_TEST = 'movielens_test_rating'
USER_EMBED_PATH = '../input/recommendation-assignment/user_embedding.npy'
MOVIE_EMBED_PATH = '../input/recommendation-assignment/movie_embedding.npy'
USER_LABEL_MAPPING = '../input/recommendation-assignment/user_label_mapping.p'
MOVIE_LABEL_MAPPING = '../input/recommendation-assignment/movie_label_mapping.p'
LABEL_MOVIE_MAPPING = '../input/recommendation-assignment/label_movie_mapping.p'
USER_MOVIE_MAPPING = '../input/recommendation-assignment/user_movie_mapping.p'
ID_TITLE_MAPPING = '../input/recommendation-assignment/id_title_mapping.p'
kernel_type = "movie_recommend_model"

# CUDA for PyTorch
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")

epochs = 30
batch_size = 256
# first layer is the concatenation of user and item embeddings
layers = [64, 32, 16]
weight_decay = 0.00001
# Number of negative instances to pair with a positive instance while training
num_neg_train = 4
# Number of negative instances to pair with a positive instance while testing
num_neg_test = 100
lr = 0.001
dropout = 0
learner = 'adam'
out = 1

## Data Preprocessing

In [13]:
genre = pd.read_csv('../input/movies-dataset/movie_genres.dat', sep="\t", engine='python')
genre.head()

Unnamed: 0,movieID,genre
0,1,Adventure
1,1,Animation
2,1,Children
3,1,Comedy
4,1,Fantasy


In [35]:
genre_dict = {}
for grp in genre.groupby('movieID'):
    grp = grp[1]
    mid = grp['movieID'].iloc[0]
    genres = list(grp['genre'].unique())
    genre_dict[mid] = genres

In [None]:
movies_df = pd.read_csv('../input/movies-dataset/movies.dat', sep="\t", engine='python')
id_title_mapping = pd.Series(movies_df.title.values, index=movies_df.id).to_dict()

In [21]:
%%time
df = pd.read_csv(INPUT_PATH, sep="\t", engine='python')
df.head()

CPU times: user 6.75 s, sys: 186 ms, total: 6.93 s
Wall time: 6.99 s


Unnamed: 0,userID,movieID,rating,timestamp
0,75,3,1.0,1162160236000
1,75,32,4.5,1162160624000
2,75,110,4.0,1162161008000
3,75,160,2.0,1162160212000
4,75,163,4.0,1162160970000


In [None]:
uid_lbl_mapping = {uid:idx+1 for idx, uid in enumerate(df.userID.unique())}
mid_lbl_mapping = {mid:idx+1 for idx, mid in enumerate(df.movieID.unique())}
lbl_mid_mapping = {idx+1:mid for idx, mid in enumerate(df.movieID.unique())}

In [None]:
user_movie_mapping = {}

for grp in df.groupby('userID'):
    user_df = grp[1]
    user_df = user_df.sort_values('timestamp')
    uid = user_df['userID'].iloc[0]
    mids = list(user_df['movieID'].values)
    user_movie_mapping[uid] = mids

In [None]:
# save user and movie mappings
pickle.dump(uid_lbl_mapping, open(USER_LABEL_MAPPING, "wb"))
pickle.dump(mid_lbl_mapping, open(MOVIE_LABEL_MAPPING, "wb"))
pickle.dump(lbl_mid_mapping, open(LABEL_MOVIE_MAPPING, "wb"))
pickle.dump(user_movie_mapping, open(USER_MOVIE_MAPPING, "wb"))
pickle.dump(id_title_mapping, open(ID_TITLE_MAPPING, "wb"))

In [None]:
def get_train_test_df(df):
    df.sort_values(by=['timestamp'], inplace=True)
    
    # mark last user rating as False, all others to True
    last_user_rating_mask = df.duplicated(subset='userID', keep='last')
    
    # All transactions in train are sorted by timestamp,  
    # We want items marked with a False in test dataset(last user rating)
    train_df = df[last_user_rating_mask]
    test_df = df[~last_user_rating_mask]
    
    train_df.sort_values(by=['userID', 'timestamp'], inplace=True)
    test_df.sort_values(by=['userID', 'timestamp'], inplace=True)
    
    return train_df, test_df

In [None]:
def save_train_test_df():
    # make the dataset
    df = pd.read_csv(INPUT_PATH, sep="\t", engine='python')
    df['userID'] = df['userID'].map(uid_lbl_mapping)
    df['movieID'] = df['movieID'].map(mid_lbl_mapping)
    df['rating'] = 1
    
    # make the dataset
    train_df, test_df = get_train_test_df(df)
    
    # save train and test datasets
    train_df.to_csv(OUTPUT_PATH_TRAIN, header=False, index=False, sep='\t')
    test_df.to_csv(OUTPUT_PATH_TEST, header=False, index=False, sep='\t')
    
    print("Dataset shape = {}".format(df.shape))
    print("Train size = {}, Test size = {}".format(train_df.shape[0], test_df.shape[0]))

In [None]:
%%time
save_train_test_df()

## Dataset

In [None]:
class MovieLensDataset(Dataset):
    def __init__(self, num_negatives_train=5, num_negatives_test=100):
        self.train_matrix = self.load_rating_train_file_as_Matrix(OUTPUT_PATH_TRAIN)
        self.num_users, self.num_items = self.train_matrix.shape
         # make training set with negative sampling
        self.user_input, self.item_input, self.ratings = self.get_train_instances(self.train_matrix, num_negatives_train)
        # make testing set with negative sampling
        self.testRatings = self.load_rating_test_file_as_list(OUTPUT_PATH_TEST)
        self.testNegatives = self.create_negative_file(num_samples=num_negatives_test)
        
        assert len(self.testRatings) == len(self.testNegatives)
        
    def __len__(self):
        return len(self.user_input)
    
    def __getitem__(self, index):
        # get the train data
        user_id = self.user_input[index]
        item_id = self.item_input[index]
        rating = self.ratings[index]
        
        return {
            'user_id': user_id,
            'item_id': item_id,
            'rating': rating
        }
        
    def load_rating_train_file_as_Matrix(self, filename):
        num_users, num_items = 0, 0
        with open(filename, "r") as f:
            line = f.readline()
            while line != None and line != "":
                arr = line.split("\t")
                u, i = int(arr[0]), int(arr[1])
                num_users = max(num_users, u)
                num_items = max(num_items, i)
                line = f.readline()

        # Construct matrix
        mat = sp.dok_matrix((num_users+1, num_items+1), dtype=np.float32)
        with open(filename, "r") as f:
            line = f.readline()
            while line != None and line != "":
                arr = line.split("\t")
                user, item, rating = int(arr[0]), int(arr[1]), float(arr[2])
                if (rating > 0):
                    mat[user, item] = 1.0
                line = f.readline()
                
        return mat
    
    def get_train_instances(self, train, num_negatives):
        user_input, item_input, ratings = [], [], []
        num_users, num_items = train.shape
        for (u, i) in train.keys():
            # positive instance
            user_input.append(u)
            item_input.append(i)
            ratings.append(1)
            # negative instances
            for _ in range(num_negatives):
                j = np.random.randint(1, num_items)
                while (u, j) in train:
                    j = np.random.randint(1, num_items)
                user_input.append(u)
                item_input.append(j)
                ratings.append(0)
                
        return user_input, item_input, ratings
    
    def load_rating_test_file_as_list(self, filename):
        ratingList = []
        with open(filename, "r") as f:
            line = f.readline()
            while line != None and line != "":
                arr = line.split("\t")
                user, item = int(arr[0]), int(arr[1])
                ratingList.append([user, item])
                line = f.readline()
                
        return ratingList
    
    def create_negative_file(self, num_samples=100):
        negativeList = []
        for user_item_pair in self.testRatings:
            user = user_item_pair[0]
            item = user_item_pair[1]
            negatives = []
            for t in range(num_samples):
                j = np.random.randint(1, self.num_items)
                while (user, j) in self.train_matrix or j == item:
                    j = np.random.randint(1, self.num_items)
                negatives.append(j)
            negativeList.append(negatives)
        return negativeList

## Evaluation

In [None]:
def evaluate_model(model, dataset, topK):
    testRatings = dataset.testRatings
    testNegatives = dataset.testNegatives

    hits, ndcgs = [], []
    for idx in range(len(testRatings)):
        (hr, ndcg) = eval_one_rating(idx, topK, model, dataset, testRatings, testNegatives)
        hits.append(hr)
        ndcgs.append(ndcg)
    return (hits, ndcgs)

def eval_one_rating(idx, topK, model, dataset, testRatings, testNegatives):
    rating = testRatings[idx]
    items = testNegatives[idx]
    u = rating[0]

    gtItem = rating[1]
    items.append(gtItem)
    # Get prediction scores
    map_item_score = {}
    users = np.full(len(items), u, dtype='int32')

    feed_dict = {
        'user_id': users,
        'item_id': np.array(items),
    }
    
    predictions = model.predict(feed_dict)
    
    for i in range(len(items)):
        item = items[i]
        map_item_score[item] = predictions[i]

    # Evaluate top rank list
    ranklist = heapq.nlargest(topK, map_item_score, key=map_item_score.get)
    hr = getHitRatio(ranklist, gtItem)
    ndcg = getNDCG(ranklist, gtItem)
    return (hr, ndcg)


def getHitRatio(ranklist, gtItem):
    for item in ranklist:
        if item == gtItem:
            return 1
    return 0


def getNDCG(ranklist, gtItem):
    for i in range(len(ranklist)):
        item = ranklist[i]
        if item == gtItem:
            return math.log(2) / math.log(i+2)
    return 0

def test_model(model, dataset, topK):
    # put the model in eval mode before testing
    model.eval()
    t1 = time()
    (hits, ndcgs) = evaluate_model(model, dataset, topK)
    hr, ndcg = np.array(hits).mean(), np.array(ndcgs).mean()
    print('Eval: HR = %.4f, NDCG = %.4f [%.1f s]' % (hr, ndcg, time()-t1))
    
    return hr, ndcg

## Model

In [None]:
class MovieRecommendationModel(nn.Module):
    def __init__(self, num_users, num_items, layers=[16, 8], dropout=0.0):
        super().__init__()
        assert (layers[0] % 2 == 0), "layers[0] must be an even number"
        self.dropout = dropout
        
        # user and item embedding layers
        embedding_dim = int(layers[0]/2)
        self.user_embedding = torch.nn.Embedding(num_users, embedding_dim)
        self.item_embedding = torch.nn.Embedding(num_items, embedding_dim)
        
        self.fc_layers = torch.nn.ModuleList()
        
        # hidden dense layers
        for _, (in_size, out_size) in enumerate(zip(layers[:-1], layers[1:])):
            self.fc_layers.append(torch.nn.Linear(in_size, out_size))
            
        # final layer
        self.output_layer = torch.nn.Linear(layers[-1], 1)
        
    def forward(self, data):
        users = data['user_id']
        items = data['item_id']
        user_embedding = self.user_embedding(users)
        item_embedding = self.item_embedding(items)
        
        # concatenate user and item embeddings to form input
        x = torch.cat([user_embedding, item_embedding], 1)
        
        for idx, _ in enumerate(range(len(self.fc_layers))):
            x = self.fc_layers[idx](x)
            x = F.relu(x)
            x = F.dropout(x, p=self.dropout)
            
        logit = self.output_layer(x)
        rating = torch.sigmoid(logit)
        return rating
    
    def predict(self, feed_dict):
        # return the score, inputs and outputs are numpy arrays
        for key in feed_dict:
            if type(feed_dict[key]) != type(None):
                feed_dict[key] = torch.from_numpy(
                    feed_dict[key]).to(dtype=torch.long, device=device)
        output_scores = self.forward(feed_dict)
        
        return output_scores.cpu().detach().numpy()

## Training

In [None]:
def train_epoch(model, data_loader, loss_fn, optimizer, epoch_no, device, verbose=True):
    print("=========================================")
    print("Epoch = {}".format(epoch_no))
    t1 = time()
    epoch_loss = []
    
    model.train()
    
    for feed_dict in data_loader:
        for key in feed_dict:
            if type(feed_dict[key]) != type(None):
                feed_dict[key] = feed_dict[key].to(dtype = torch.long, device = device)
        
        prediction = model(feed_dict)
        rating = feed_dict['rating']
        
        rating = rating.float().view(prediction.size())  
        loss = loss_fn(prediction, rating)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        epoch_loss.append(loss.item())
        
    epoch_loss = np.mean(epoch_loss)
    
    if verbose:
        print("Epoch completed {:.1f} s".format(time() - t1))
        print("Train Loss: {}".format(epoch_loss))
        
    return epoch_loss

In [None]:
def main():
    early_stopping = False
    early_stopping_epochs = 5
    early_stopping_counter = 0
    
    best_hr, best_ndcg, best_iter = 0.0, 0.0, 0
    topK = 10
    model_file = f'{kernel_type}.pth'
    t1 = time()
    
    movie_dataset = MovieLensDataset(num_negatives_train=num_neg_train, num_negatives_test=num_neg_test)
    train, testRatings, testNegatives = movie_dataset.train_matrix, movie_dataset.testRatings , movie_dataset.testNegatives
    num_users, num_items = train.shape
    
    print("Load data done [%.1f s]. #user=%d, #item=%d, #train=%d, #test=%d"
          % (time()-t1, num_users, num_items, train.nnz, len(testRatings)))
    
    train_loader = DataLoader(movie_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
    
    model = MovieRecommendationModel(num_users, num_items, layers=layers, dropout=dropout)
    model.to(device)
    
    loss_fn = torch.nn.BCELoss()
    
    optimizer = torch.optim.Adam(model.parameters(), weight_decay=weight_decay)
    
    # Record performance
    hr_list = []
    ndcg_list = []
    BCE_loss_list = []
    
    # Check Init performance
    hr, ndcg = test_model(model, movie_dataset, topK)
    hr_list.append(hr)
    ndcg_list.append(ndcg)
    BCE_loss_list.append(1)
    
    for epoch in range(epochs):
        train_loss = train_epoch(model, train_loader, loss_fn, optimizer, epoch, device)
        hr, ndcg = test_model(model, movie_dataset, topK)
        
        hr_list.append(hr)
        ndcg_list.append(ndcg)
        BCE_loss_list.append(train_loss)
        
        if hr > best_hr:
            best_hr, best_ndcg, best_iter = hr, ndcg, epoch
            torch.save(model.state_dict(), model_file)
            early_stopping_counter = 0
        else:
            early_stopping_counter = early_stopping_counter + 1
            
        if (early_stopping) and (early_stopping_counter == early_stopping_epochs):
            print("Early stopping on epoch:", epoch)
            break
            
            
    best_iter = np.argmax(np.array(hr_list))
    best_hr = hr_list[best_iter]
    best_ndcg = ndcg_list[best_iter]
    print("End. Best Iteration %d:  HR = %.4f, NDCG = %.4f. " %
          (best_iter, best_hr, best_ndcg))

In [None]:
main()

## Save Embeddings

In [None]:
model1 = MovieRecommendationModel(num_users=2114, num_items=10110, layers=layers, dropout=dropout)
model1.load_state_dict(torch.load("./movie_recommend_model.pth"))

user_embeddings = model1.user_embedding.weight.detach().cpu().numpy()
movie_embeddings = model1.item_embedding.weight.detach().cpu().numpy()

np.save(USER_EMBED_PATH, user_embeddings)
np.save(MOVIE_EMBED_PATH, movie_embeddings)

In [None]:
model1

## Embeddings Visualization

In [None]:
def reduce_dim(weights, components = 3, method = 'tsne'):
    """Reduce dimensions of embeddings"""
    if method == 'tsne':
        return TSNE(components, metric = 'cosine').fit_transform(weights)
    elif method == 'umap':
        # Might want to try different parameters for UMAP
        return UMAP(n_components=components, metric = 'cosine', 
                    init = 'random', n_neighbors = 5).fit_transform(weights)

In [None]:
%%time
movie_r = reduce_dim(movie_embeddings, components = 2, method = 'tsne')
movie_r.shape

In [None]:
InteractiveShell.ast_node_interactivity = 'last'

plt.figure(figsize = (10, 8))
plt.plot(movie_r[:, 0], movie_r[:, 1], 'r.')
plt.xlabel('TSNE 1'); 
plt.ylabel('TSNE 2'); 
plt.title('Movie Embeddings Visualized with TSNE');

## Recommendation

In [5]:
#create the KNN model
from sklearn.neighbors import KNeighborsClassifier

In [23]:
def recommend_by_movieid(movie_id):
    movie_embeddings = np.load(MOVIE_EMBED_PATH)
    mid_lbl_mapping = pickle.load(open(MOVIE_LABEL_MAPPING, "rb"))
    lbl_mid_mapping = pickle.load(open(LABEL_MOVIE_MAPPING, "rb"))
    id_title_mapping = pickle.load(open(ID_TITLE_MAPPING, "rb"))
    
    movie_label = mid_lbl_mapping.get(movie_id)
    movie_embedding = movie_embeddings[movie_label]
    
    clf = KNeighborsClassifier(n_neighbors=11)
    clf.fit(movie_embeddings, np.arange(len(movie_embeddings)))
    
    distances, indices = clf.kneighbors(movie_embedding.reshape(1, -1), n_neighbors=10)
    distances, indices = zip(*sorted(zip(distances[0], indices[0])))
    distances, indices = list(distances), list(indices)
    
    sorted_movie_ids = [lbl_mid_mapping[m_idx] for m_idx in indices if m_idx != 0]
    recommend_movies = [id_title_mapping[mid] for mid in sorted_movie_ids]
    
    print("Given movie:", id_title_mapping[movie_id])
    print("Recommended movies:", recommend_movies)

In [25]:
recommend_by_movieid(60)

Given movie: The Indian in the Cupboard
Recommended movies: ['The Indian in the Cupboard', 'The Borrowers', '102 Dalmatians', 'Race to Witch Mountain', 'The Little Rascals', 'The Fox and the Hound', 'The AbsentMinded Professor', 'Operation Dumbo Drop', 'Holes', 'The Adventures of Rocky & Bullwinkle']


In [26]:
def recommend_by_last_viewed(user_id):
    # load user and movie embeddings
    movie_embeddings = np.load(MOVIE_EMBED_PATH)
    
    # load user, movie and user_movie mappings
    uid_lbl_mapping = pickle.load(open(USER_LABEL_MAPPING, "rb"))
    mid_lbl_mapping = pickle.load(open(MOVIE_LABEL_MAPPING, "rb"))
    lbl_mid_mapping = pickle.load(open(LABEL_MOVIE_MAPPING, "rb"))
    user_movie_mapping = pickle.load(open(USER_MOVIE_MAPPING, "rb"))
    id_title_mapping = pickle.load(open(ID_TITLE_MAPPING, "rb"))
    
    # last 5 watched movies by user
    user_last_watched_movies = user_movie_mapping[user_id][-5:]
    user_watched_movies = user_movie_mapping[user_id]
    
    movies = list(mid_lbl_mapping.keys())
    user_unwatched_movies = list(set(movies) - set(user_watched_movies))
    user_unwatched_movies_idxs = [mid_lbl_mapping[mid] for mid in user_unwatched_movies]
    
    clf = KNeighborsClassifier(n_neighbors=11)
    unwatched_movie_embeddings = movie_embeddings[user_unwatched_movies_idxs]
    clf.fit(unwatched_movie_embeddings, user_unwatched_movies_idxs)
    
    m_dist, m_idx = [], []
    for movie_id in user_last_watched_movies:
        top_2 = 0
        movie_label = mid_lbl_mapping.get(movie_id)
        movie_embedding = movie_embeddings[movie_label]
        distances, indices = clf.kneighbors(movie_embedding.reshape(1, -1), n_neighbors=10)
        distances, indices = zip(*sorted(zip(distances[0], indices[0])))
        distances, indices = list(distances), list(indices)
        
        for i, indx in enumerate(indices):
            if indx not in m_idx and indx != 0 and top_2 < 2:
                top_2 += 1
                m_idx.append(indx)
                m_dist.append(distances[i])
        
    m_dist, sorted_movie_indexes = zip(*sorted(zip(m_dist, m_idx)))
    m_dist, sorted_movie_indexes = list(m_dist), list(sorted_movie_indexes)
    sorted_movie_ids = [lbl_mid_mapping[m_idx] for m_idx in sorted_movie_indexes]
    
    # recommend top 10 movies
    recommend_movies = [id_title_mapping[mid] for mid in sorted_movie_ids[:10]]
    print("Recommended Movies:", recommend_movies)

In [28]:
recommend_by_last_viewed(75)

Recommended Movies: ['37�2 le matin', 'Tears of the Sun', 'Austin Powers: The Spy Who Shagged Me', 'The Sure Thing', 'Giulietta degli spiriti', 'Riding the Bullet', "Porky's Revenge", 'Duplex', 'Down from the Mountain', 'Apocalypse Now']


In [6]:
def recommend_by_userid(user_id):
    # load user and movie embeddings
    user_embeddings = np.load(USER_EMBED_PATH)
    movie_embeddings = np.load(MOVIE_EMBED_PATH)
    
    # load user, movie and user_movie mappings
    uid_lbl_mapping = pickle.load(open(USER_LABEL_MAPPING, "rb"))
    mid_lbl_mapping = pickle.load(open(MOVIE_LABEL_MAPPING, "rb"))
    lbl_mid_mapping = pickle.load(open(LABEL_MOVIE_MAPPING, "rb"))
    user_movie_mapping = pickle.load(open(USER_MOVIE_MAPPING, "rb"))
    id_title_mapping = pickle.load(open(ID_TITLE_MAPPING, "rb"))
    
    user_label = uid_lbl_mapping.get(user_id)
    user_embedding = user_embeddings[user_label]
    
    user_watched_movies = user_movie_mapping[user_id]
    movies = list(mid_lbl_mapping.keys())
    user_unwatched_movies = list(set(movies) - set(user_watched_movies))
    user_unwatched_movies_labels = [mid_lbl_mapping[mid] for mid in user_unwatched_movies]
    
    clf = KNeighborsClassifier(n_neighbors=11)
    unwatched_movie_embeddings = movie_embeddings[user_unwatched_movies_labels]
    clf.fit(unwatched_movie_embeddings, user_unwatched_movies_labels)
    
    distances, indices = clf.kneighbors(user_embedding.reshape(1, -1), n_neighbors=10)
    distances, indices = zip(*sorted(zip(distances[0], indices[0])))
    distances, indices = list(distances), list(indices)
    
    sorted_movie_ids = [lbl_mid_mapping[m_idx] for m_idx in indices if m_idx != 0]
    
    recommend_movies = [id_title_mapping[mid] for mid in sorted_movie_ids]
    print("Recommended movies:", recommend_movies)

In [7]:
recommend_by_userid(75)

Recommended movies: ['Manito', 'High Art', 'Copying Beethoven', 'American Psycho II: All American Girl', "The River's Edge", 'La mentale', 'Calamari Union', 'Festen', 'Love the Hard Way', 'Cool Hand Luke']
