In [1]:
import logging
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW
from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation, LoggingHandler

import utils.rec_func as func
import utils.utils_SBert as SBert

In [2]:
# CSV locations, relative to the notebook's working directory.
movies_all_path = 'dataset/movies_with_keywords.csv'
movies_small_path = 'dataset/movies_5000.csv'
ratings_for_history_path = 'dataset/ratings_for_history.csv'
ratings_for_history_small_path = 'dataset/ratings_for_history_small.csv'

# read_in_csv takes the four paths in this order and returns four objects,
# unpacked here in the same order (full movies, small movies, full ratings,
# small ratings).
(
    movies,
    movies_5000,
    ratings_for_history,
    ratings_for_history_small,
) = SBert.read_in_csv(
    movies_all_path,
    movies_small_path,
    ratings_for_history_path,
    ratings_for_history_small_path,
)

# Preview the first rows of the full movie table.
movies.head()

Unnamed: 0,id,title,AllGenres,AllKeywords
0,862,Toy Story,"Animation,Comedy,Family","boy next door,toy,friendship,friends"
1,8844,Jumanji,"Adventure,Fantasy,Family",
2,15602,Grumpier Old Men,"Romance,Comedy","fishing,best friend,duringcreditsstinger,old men"
3,31357,Waiting to Exhale,"Comedy,Drama,Romance","interracial relationship,chick flick,single mo..."
4,11862,Father of the Bride Part II,Comedy,"aging,confidence,gynecologist,midlife crisis"


In [3]:
# Use the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Give the device to the constructor instead of moving the model afterwards with
# .to(): the library then places the model on that device itself, so any internal
# target-device bookkeeping it keeps for encode()/fit() agrees with `device`
# (NOTE(review): how that bookkeeping works is version-dependent).
# str() is used because the constructor documents `device` as a string such as
# 'cuda:0' or 'cpu'.
model = SentenceTransformer('distilbert-base-nli-mean-tokens', device=str(device))
print(device)

cuda:0


In [4]:
# Embed the `movies_5000` frame with the (not yet fine-tuned) model loaded above.
# The result is reused by every recommendation call below.
embeddings = SBert.embed_sentences(
    model_=model,
    movies_=movies_5000,
)

Batches:   0%|          | 0/157 [00:00<?, ?it/s]

# Test my functions

In [5]:
# Recommend by position in the movie table. Arguments stay positional because the
# helper's parameter names are not visible from this notebook.
func.recommend_movies_with_index(
    movies_5000,
    embeddings,
    888,  # query movie; the output echoes it as "The movie 888: ..."
    5,    # how many recommendations to print ("Top 5 recommendations")
)

The movie 888: Team America: World Police 	 Genres: Music,Adventure,Animation,Action,Comedy

Top 5 recommendations:
1214.  United 93                           Genres: Drama,History,Crime,Thriller,Action
4932.  American Wrestler: The Wizard       Genres: Drama
499.   The Big Brawl                       Genres: Action,Comedy
54.    RoboCop 3                           Genres: Action,Adventure,Crime,Science Fiction,Thriller
2593.  Hot Rods To Hell                    Genres: Action,Thriller


In [6]:
# Recommend from a list of movie titles.
inputs = [
    'Calling Dr. Death',
    'Rambo III',
    'Morituri',
    'Farscape: The Peacekeeper Wars',
]
func.recommend_movies_with_titles(
    inputs,
    movies_5000,
    embeddings,
    k=5,
    model_=model,
)

Top 5 recommendations:
2521.  The Art of War III: Retribution     Genres: Adventure,Action,Thriller
1214.  United 93                           Genres: Drama,History,Crime,Thriller,Action
1715.  Inception                           Genres: Action,Thriller,Science Fiction,Mystery,Adventure
348.   Railroaded!                         Genres: Drama,Thriller
3281.  Agente Logan - missione Ypotron     Genres: Action,Adventure,Crime,Science Fiction,Thriller


In [7]:
# Made-up titles: exercises the path where none of the inputs are in the table
# (the helper prints a notice and still returns recommendations).
inputs_2 = [
    'A movie I just made up',
    'Only for the test',
]
func.recommend_movies_with_titles(
    inputs_2,
    movies_5000,
    embeddings,
    k=5,
    model_=model,
)

Those movies are not in our Database currently, but we can recommend you the following: 

Top 5 recommendations:
4017.  Ellen ten Damme: As I Was Wondering Where This Mixed-up Little Life of Mine Was Leading To Genres: Documentary
819.   After the Rehearsal                 Genres: Drama
3249.  Doug Stanhope: Before Turning the Gun on Himself Genres: 
4554.  Finishing School                    Genres: Drama
4655.  Mumford & Sons: We Wrote This Yesterday Genres: Documentary


In [8]:
# Recommend from a user's rating history. The user id is passed as a string.
UserId = '17'
func.recommend_movies_with_history(
    UserId,
    ratings_for_history_small,
    movies_5000,
    embeddings,
    k=5,
    m=10,  # NOTE(review): meaning of `m` is defined in utils.rec_func — confirm there
    model_=model,
)

Top 5 recommendations:
102.   Beat the Devil                      Genres: Action,Adventure,Comedy,Crime,Drama,Romance
1387.  St. Ives                            Genres: Crime,Action,Thriller,Mystery
907.   The Ghoul                           Genres: Romance,Drama,Horror,Action,Mystery
1049.  Gozu                                Genres: Action,Crime,Thriller,Horror
144.   City of Industry                    Genres: Crime,Thriller,Drama


In [9]:
# Recommend from a list of genre names. The helper prints the ranked table and
# returns a value that is kept in `rec_movies`.
genres = ['Thriller', 'Crime', 'Comedy', 'Romance']
rec_movies = func.genres_to_movies(
    genres,
    embeddings=embeddings,
    movies_=movies_5000,
    model_=model,
    k=5,
)

# Display the returned value (the output shows a list of the recommended titles).
rec_movies

Top 5 recommendations:
97.    Charade                             Genres: Comedy,Mystery,Romance,Thriller
2518.  Omar                                Genres: Thriller,Drama,Romance
1464.  Loft                                Genres: Drama,Mystery,Romance,Thriller
4194.  Rajathandhiram                      Genres: Romance,Crime,Drama,Thriller,Comedy
450.   The Body                            Genres: Drama,Mystery,Romance,Thriller


['Charade', 'Omar', 'Loft', 'Rajathandhiram', 'The Body']

# Prepare for training

In [10]:
# Build training and validation examples from DISJOINT sets of movies.
# Calling generate_data twice on the same frame gives no guarantee that the
# validation movies were not also used for training; any overlap would inflate
# the evaluator's score during fine-tuning.
# random_state pins the split so Restart & Run All reproduces the same pools.
movies_val_pool = movies.sample(frac=0.2, random_state=42)
movies_train_pool = movies.drop(index=movies_val_pool.index)

# reset_index(drop=True) gives each pool a fresh 0..n-1 index in case
# generate_data addresses rows by label or position.
examples_train = SBert.generate_data(movies_=movies_train_pool.reset_index(drop=True), num=100)
examples_val = SBert.generate_data(movies_=movies_val_pool.reset_index(drop=True), num=30)

# Report how many training data points were generated.
SBert.get_datapoints_num(examples_train)

There are 4950 data points in total


In [11]:
# Wrap the training examples in a loader.
train_loader = SBert.load_dataset(
    examples_train,
    16,  # presumably the batch size (4950 examples -> 310 batches) — TODO confirm
)

# Number of batches the loader yields per pass.
print(len(train_loader))

310


In [12]:
# Build the evaluator from the validation examples; it is passed to model.fit below.
# The second argument is a string label ('evaluator1').
evaluator = SBert.get_evaluator(examples_val, "evaluator1")

# Show the object's repr to confirm which evaluator class was created.
evaluator

<sentence_transformers.evaluation.EmbeddingSimilarityEvaluator.EmbeddingSimilarityEvaluator at 0x25912718dd0>

# Train now

In [13]:
# Route INFO-level log records (training progress, evaluation scores) through
# sentence-transformers' LoggingHandler with a timestamped format.
# force=True (Python 3.8+) removes any handlers already attached to the root
# logger first. Without it, basicConfig silently does nothing when the root
# logger is already configured, which is common in a notebook kernel after
# library imports or when this cell is re-run, and the training logs would
# never appear.
logging.basicConfig(
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.INFO,
    handlers=[LoggingHandler()],
    force=True,
)

In [14]:
# Training objective: cosine-similarity loss bound to the model being fine-tuned.
train_loss = losses.CosineSimilarityLoss(model)

In [None]:
# Fine-tune the model on the single (loader, loss) objective defined above.
model.fit(
    train_objectives=[(train_loader, train_loss)],
    # Schedule: 5 passes over the loader, with the first 100 steps used for warm-up.
    epochs=5,
    warmup_steps=100,
    # Optimizer: PyTorch's AdamW with a small learning rate.
    optimizer_class=torch.optim.AdamW,
    optimizer_params=dict(lr=1e-6),
    # Run the validation evaluator every 100 training steps.
    evaluator=evaluator,
    evaluation_steps=100,
    # Directory where fit() writes the model.
    output_path='logs/Sentence_Bert/training_nli_distilbert-model',
)