In [1]:
import logging
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW
from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation, LoggingHandler

#### 1. Input a movie index, return recommendations:

In [6]:
def recommend_movies_with_index(movies_, embeddings, movie_index, k=5):
    if movie_index >= movies_.shape[0]:
        print(f'Invalid movie index {movie_index}!')
        return
    
    k = 5 if k <= 0 else k
    
    movie_emb = embeddings[movie_index]  # find the corresponding embedding vector first 
    cos_simi = cosine_similarity([movie_emb], embeddings)[0]  # compute the similarity to all movies
    
    top_indices = np.argsort(-cos_simi)[1:k+1]  # because the largest cosine similarity must be with itself
    rec_movies = movies_.iloc[top_indices]  # get the recommendation movies -> DataFrame

    print(f'The movie {movie_index}: {movies_.title.iloc[movie_index]} \t Genres: {movies_.AllGenres.iloc[movie_index]}\n')
    print(f'Top {k} recommendations:')
    
    for index, movie in rec_movies.iterrows():
        print('{:<6} {:<35} {}'.format(f'{index}.', f'{movie["title"]}', f'Genres: {movie["AllGenres"]}'))

#### 2. Input a list of movie(s), return recommendations: 

In [7]:
def recommend_movies_with_titles(titles, movies_, embeddings, model_, k=5):
    if not titles:
        print('Enter at least ONE movie title!')
        return 
    
    k = 5 if k <= 0 else k
    
    movies_input = movies_[movies_['title'].isin(titles)]  # All input movies
    
    # If ANY movie title that the user enters is NOT in our dataset
    # The only useful information we have is those titles -> encode those titles
    if movies_input.empty:
        print('Those movies are not in our dataset currently, but we can recommend you the following: \n')
        
        movies_remaining = movies_  # Every movie in the dataset is the "remaining movie"
        
        user_inputs = ','.join(titles)
        movies_input_emb = model_.encode(user_inputs)
        combined_emb = np.array(movies_input_emb)
        
    # If the user enters valid titles, then just get the embeddings for the movies
    else:
        movies_remaining = movies_[~movies_['title'].isin(titles)]  # All remaining movies
    
        movies_input_emb = embeddings[movies_input.index]  # embeddings is np.array
        combined_emb = np.mean(movies_input_emb, axis=0)
    
    movies_remaining_emb = embeddings[movies_remaining.index]
    
    cos_simi = cosine_similarity([combined_emb], movies_remaining_emb)[0]
    
    top_indices = np.argsort(-cos_simi)[:k]
    recommend_movies = movies_remaining.iloc[top_indices]
    
    print(f'Top {k} recommendations:')
    for index, movie in recommend_movies.iterrows():
        print('{:<6} {:<35} {}'.format(f'{index}.', f'{movie["title"]}', f'Genres: {movie["AllGenres"]}'))

#### 3. Input User ID, return recommendations:

In [8]:
def recommend_movies_with_history(user_id, ratings_, movies_, embeddings, k=5, m=3):
    user_list = ratings_['userId'].unique().tolist()
    if user_id not in user_list:
        print(f'Cannot find the user with id: {user_id}')
        return 
    
    user_history = ratings_[ratings_['userId'] == user_id]  # get the user's history (all)
    
    m = 3 if m <= 0 else m 
    m = min(m, len(user_history))
    
    # Sort movies with the highest user ratings and recently watched
    sorted_history = user_history.sort_values(by=['rating', 'timestamp'])
    sorted_m_history = sorted_history.iloc[:m]  # select the first m movies
    
    user_history_id = sorted_m_history['movieId'].tolist()
    
    history_titles = movies_[movies_['id'].isin(user_history_id)]
    history_titles = history_titles['title'].tolist()
    
    recommend_movies_with_titles(history_titles, movies_, embeddings, k)

### Additional: 
#### 4. From genres to movies

In [9]:
def genres_to_movies(genres_, model_, movies_, embeddings, k=5):
    if not genres_:
        print('Enter at least ONE genre!')
        return
    
    k = 5 if k <= 0 else k
    
    all_genres = ','.join(genres_)
    all_genres_emb = model_.encode(all_genres)
    all_genres_emb = np.array(all_genres_emb)
    
    cos_simi = cosine_similarity([all_genres_emb], embeddings)[0]
    
    top_indices = np.argsort(-cos_simi)[:k]
    recommend_movies = movies_.iloc[top_indices]
    
    print(f'Top {k} recommendations:')
    for index, movie in recommend_movies.iterrows():
        print('{:<6} {:<35} {}'.format(f'{index}.', f'{movie["title"]}', f'Genres: {movie["AllGenres"]}'))
        
    return recommend_movies['title'].tolist()

#### Load the datasets:

In [10]:
movies_all_path = 'dataset/movies_with_keywords.csv'
movies_small_path = 'dataset/movies_5000.csv'
ratings_for_history_path = 'dataset/ratings_for_history.csv'
ratings_for_history_small_path = 'dataset/ratings_for_history_small.csv'

movies = pd.read_csv(movies_all_path, keep_default_na=False, dtype=str)
movies_5000 = pd.read_csv(movies_small_path, keep_default_na=False, dtype=str)
ratings_for_history = pd.read_csv(ratings_for_history_path, dtype=str)
ratings_for_history_small = pd.read_csv(ratings_for_history_small_path, dtype=str)

movies.head()

Unnamed: 0,id,title,AllGenres,AllKeywords
0,862,Toy Story,"Animation,Comedy,Family","rivalry,toy comes to life,boy next door,toy"
1,8844,Jumanji,"Adventure,Fantasy,Family",
2,15602,Grumpier Old Men,"Romance,Comedy","fishing,best friend,duringcreditsstinger,old men"
3,31357,Waiting to Exhale,"Comedy,Drama,Romance","divorce,chick flick,interracial relationship,s..."
4,11862,Father of the Bride Part II,Comedy,"mother daughter relationship,baby,aging,daughter"


In [11]:
ratings_for_history_small.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4250 entries, 0 to 4249
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   userId     4250 non-null   object
 1   movieId    4250 non-null   object
 2   rating     4250 non-null   object
 3   timestamp  4250 non-null   object
dtypes: object(4)
memory usage: 132.9+ KB


In [12]:
ratings_for_history['rating'] = ratings_for_history['rating'].astype(float)
ratings_for_history['timestamp'] = ratings_for_history['timestamp'].astype(int)
ratings_for_history_small['rating'] = ratings_for_history_small['rating'].astype(float)
ratings_for_history_small['timestamp'] = ratings_for_history_small['timestamp'].astype(int)

ratings_for_history_small.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4250 entries, 0 to 4249
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   userId     4250 non-null   object 
 1   movieId    4250 non-null   object 
 2   rating     4250 non-null   float64
 3   timestamp  4250 non-null   int32  
dtypes: float64(1), int32(1), object(2)
memory usage: 116.3+ KB


#### Load the model, and encode the words (descriptions -> titles + genres + keywords):

In [13]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = SentenceTransformer('distilbert-base-nli-mean-tokens').to(device)
print(device)

cuda:0


In [14]:
def print_model_summary(model):
    print("Model summary:\n")
    total_params = 0
    for name, parameter in model.named_parameters():
        if not parameter.requires_grad:
            continue
        param = parameter.numel()
        total_params += param
        print(f"{name}: {param} total parameters")
    print(f"\nTotal trainable parameters: {total_params}")

print_model_summary(model)

Model summary:

0.auto_model.embeddings.word_embeddings.weight: 23440896 total parameters
0.auto_model.embeddings.position_embeddings.weight: 393216 total parameters
0.auto_model.embeddings.LayerNorm.weight: 768 total parameters
0.auto_model.embeddings.LayerNorm.bias: 768 total parameters
0.auto_model.transformer.layer.0.attention.q_lin.weight: 589824 total parameters
0.auto_model.transformer.layer.0.attention.q_lin.bias: 768 total parameters
0.auto_model.transformer.layer.0.attention.k_lin.weight: 589824 total parameters
0.auto_model.transformer.layer.0.attention.k_lin.bias: 768 total parameters
0.auto_model.transformer.layer.0.attention.v_lin.weight: 589824 total parameters
0.auto_model.transformer.layer.0.attention.v_lin.bias: 768 total parameters
0.auto_model.transformer.layer.0.attention.out_lin.weight: 589824 total parameters
0.auto_model.transformer.layer.0.attention.out_lin.bias: 768 total parameters
0.auto_model.transformer.layer.0.sa_layer_norm.weight: 768 total parameters
0.

In [15]:
names_genres_keywords = movies_5000['title'] + ',' + movies_5000['AllGenres'] + ',' + movies_5000['AllKeywords']
names_genres_keywords = names_genres_keywords.tolist()
names_genres_keywords[:5]

['Grumpier Old Men,Romance,Comedy,fishing,best friend,duringcreditsstinger,old men',
 'Waiting to Exhale,Comedy,Drama,Romance,divorce,chick flick,interracial relationship,single mother',
 'GoldenEye,Adventure,Action,Thriller,red army,electromagnetic pulse,special car,computer virus',
 'Money Train,Action,Comedy,Crime,subway,brother brother relationship,new york subway,train robbery',
 'Twelve Monkeys,Science Fiction,Thriller,Mystery,monkey,dystopia,stockholm syndrome,subterranean']

In [16]:
embedding = model.encode(names_genres_keywords, show_progress_bar=True)
embedding = np.array(embedding)

Batches:   0%|          | 0/157 [00:00<?, ?it/s]

#### Test my functions:

In [17]:
recommend_movies_with_index(movies_5000, embedding, 888, 5)

The movie 888: The Manchurian Candidate 	 Genres: Drama,Thriller,Mystery

Top 5 recommendations:
2405.  Calling Dr. Death                   Genres: Thriller,Mystery
4413.  Psycho-Pass: The Movie              Genres: Science Fiction,Animation,Action
261.   Rambo III                           Genres: Action,Adventure,Thriller,War
875.   Morituri                            Genres: Action,Drama,Thriller,War
1451.  Farscape: The Peacekeeper Wars      Genres: Action,Comedy,Romance,Science Fiction,Thriller


In [18]:
inputs = ['Calling Dr. Death', 'Rambo III', 'Morituri', 'Farscape: The Peacekeeper Wars']
recommend_movies_with_titles(inputs, movies_5000, embedding, model_=model, k=5)

Top 5 recommendations:
4858.  Riding with Death                   Genres: Science Fiction,TV Movie,Action,Thriller
1860.  The Kings of Mykonos                Genres: Action,Comedy,Thriller,Science Fiction
888.   The Manchurian Candidate            Genres: Drama,Thriller,Mystery
4675.  The Tribe                           Genres: Thriller,Drama,Science Fiction
3485.  Spectral                            Genres: Thriller,Action,Science Fiction


In [19]:
inputs_2 = ['A movie I just made up', 'Only for the test']
recommend_movies_with_titles(inputs_2, movies_5000, embedding, model_=model, k=5)

Those movies are not in our dataset currently, but we can recommend you the following: 

Top 5 recommendations:
4663.  Has the Film Already Started?       Genres: 
1726.  The Man Who Left His Will on Film   Genres: Drama
1900.  The Target Shoots First             Genres: Documentary
3059.  The Lumière Brothers' First Films   Genres: Documentary
2793.  Mail Early for Xmas                 Genres: Animation


In [20]:
UserId = '17'
recommend_movies_with_history(UserId, ratings_for_history_small, movies_5000, embedding, k=5, m=10)

Top 5 recommendations:
1123.  Ferpect Crime                       Genres: Thriller,Comedy,Crime
3724.  Buttwhistle                         Genres: Mystery,Romance,Comedy,Crime,Drama
657.   Above Suspicion                     Genres: Drama,Thriller
4265.  Captain Fantastic                   Genres: Adventure,Comedy,Drama,Romance
928.   Tales of Manhattan                  Genres: Comedy,Drama,Romance


In [21]:
genres = ['Thriller', 'Crime', 'Comedy', 'Romance']
rec_movies = genres_to_movies(genres, model_=model, movies_=movies_5000, embeddings=embedding, k=5)
rec_movies

Top 5 recommendations:
115.   Charade                             Genres: Comedy,Mystery,Romance,Thriller
85.    Captives                            Genres: Drama,Thriller,Mystery,Romance
4203.  Rajathandhiram                      Genres: Romance,Crime,Drama,Thriller,Comedy
1949.  Carancho                            Genres: Thriller,Drama,Romance
3337.  Amiche da morire                    Genres: Romance,Crime,Comedy


['Charade', 'Captives', 'Rajathandhiram', 'Carancho', 'Amiche da morire']

## Above is just testing my functions.
## Now we can finally fine tune the model!!!

#### The function compute the similarity of two movies:
Only consider genres, because there are way too many different keywords. But of course, if two movies have same keywords, which implies a much stronger correlation than merely having similar genres.

This can be used for improvement in the future.

In [22]:
def if_similar(movie1, movie2):
    genres1 = set(movie1['AllGenres'].strip().split(','))
    genres2 = set(movie2['AllGenres'].strip().split(','))
    
    min_len = min(len(genres1), len(genres2))  # the minimum of two lengths
    if min_len == 0:
        return 0.1  # I think return 0.1 instead of 0.0 is more appropriate (it's just my silly thought, maybe is not) 
    
    common_genres = genres1 & genres2  # Find the overlapped genres
    common_genres_num = len(common_genres)
    
    similarity = (common_genres_num / float(min_len))
    
    return round(similarity, 2)

In [23]:
class MyDataset(Dataset):
    def __init__(self, data):
        self.data = data
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx]

#### Now generate data for training. Randomly selecting 100 movies can actually generate (99 + 0) x 100 / 2 = 4950 data points
By the way, I'm using the subset of movies here, and we can also use the full movies' dataset. It's the same thing, because the number of generated data points only related to the "movies_num" variable here.

In [24]:
movies_num = 100
sample_index = np.random.choice(range(len(movies_5000)), size=movies_num, replace=False)
sample_index = np.sort(sample_index)

examples = []
for i in sample_index:
    movie1 = movies.iloc[i]
    for j in sample_index:
        if j > i:
            movie2 = movies.iloc[j]
            similarity = if_similar(movie1, movie2)
            
            examples.append(InputExample(texts=[movie1['title'] + ',' + movie1['AllGenres'] + ',' +  movie1['AllKeywords'], movie2['title'] + ',' + movie2['AllGenres'] + ',' + movie2['AllKeywords']], label=similarity))

print(f'{len(examples)} data points in total')
print(examples[2024])

4950 data points in total
<InputExample> label: 0.0, texts: Get on the Bus,Drama,washington d.c.,bus,march; Sleeper,Comedy,Romance,Science Fiction,cyrogenics,future,government,robot


In [25]:
text = examples[2024].texts[0]
text

'Get on the Bus,Drama,washington d.c.,bus,march'

In [26]:
movies_num_val = 30
sample_index_val = np.random.choice(range(len(movies_5000)), size=movies_num_val, replace=False)
sample_index_val = np.sort(sample_index_val)

examples_val = []
for i in sample_index_val:
    movie1 = movies_5000.iloc[i]
    for j in sample_index_val:
        if j > i:
            movie2 = movies_5000.iloc[j]
            similarity = if_similar(movie1, movie2)
            
            examples_val.append(InputExample(texts=[movie1['title'] + ',' + movie1['AllGenres'] + ',' + movie1['AllKeywords'], movie2['title'] + ',' + movie2['AllGenres'] + ',' + movie2['AllKeywords']], label=similarity))

print(f'{len(examples_val)} data points in total')
print(examples_val[200])

435 data points in total
<InputExample> label: 0.5, texts: Ask the Dust,Drama,Romance,racism,kiss,italy,beach; The Cheetah Girls 2,Comedy,Drama,Family,Music,TV Movie,girl group,sequel,spain,music festival


In [27]:
my_dataset = MyDataset(examples)
train_loader = DataLoader(my_dataset, batch_size=16, shuffle=True)

print(len(train_loader))

310


In [28]:
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])

In [29]:
evaluator = evaluation.EmbeddingSimilarityEvaluator.from_input_examples(examples_val, name='evaluator1')

#### The model.fit() here is FOR DEMONSTRATION ONLY.
#### The REAL model was trained on Google Colab. It was trained using movies_num = 300 (44,850 training data in total). movies_num_val = 70 (2,415 validation data in total), and the model performance was validated every 400 batch. 

In [28]:
train_loss = losses.CosineSimilarityLoss(model=model)
# optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

model.fit(
    train_objectives=[(train_loader, train_loss)],
    evaluator=evaluator,
    epochs=5,
    warmup_steps=100,
    optimizer_class=torch.optim.AdamW,
    optimizer_params={'lr': 1e-6},
    evaluation_steps=100,
    output_path='model/training_nli_distilbert-model'
)

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/310 [00:00<?, ?it/s]

2024-05-05 11:53:40 - EmbeddingSimilarityEvaluator: Evaluating the model on the evaluator1 dataset in epoch 0 after 100 steps:
2024-05-05 11:53:42 - Cosine-Similarity :	Pearson: 0.9287	Spearman: 0.8457
2024-05-05 11:53:42 - Manhattan-Distance:	Pearson: 0.9174	Spearman: 0.8459
2024-05-05 11:53:42 - Euclidean-Distance:	Pearson: 0.9170	Spearman: 0.8441
2024-05-05 11:53:42 - Dot-Product-Similarity:	Pearson: 0.9279	Spearman: 0.8452
2024-05-05 11:53:42 - Save model to model/training_nli_distilbert-model
2024-05-05 11:54:00 - EmbeddingSimilarityEvaluator: Evaluating the model on the evaluator1 dataset in epoch 0 after 200 steps:
2024-05-05 11:54:01 - Cosine-Similarity :	Pearson: 0.9267	Spearman: 0.8453
2024-05-05 11:54:01 - Manhattan-Distance:	Pearson: 0.9159	Spearman: 0.8453
2024-05-05 11:54:01 - Euclidean-Distance:	Pearson: 0.9157	Spearman: 0.8436
2024-05-05 11:54:01 - Dot-Product-Similarity:	Pearson: 0.9259	Spearman: 0.8449
2024-05-05 11:54:19 - EmbeddingSimilarityEvaluator: Evaluating the

Iteration:   0%|          | 0/310 [00:00<?, ?it/s]

2024-05-05 11:54:42 - EmbeddingSimilarityEvaluator: Evaluating the model on the evaluator1 dataset in epoch 1 after 100 steps:
2024-05-05 11:54:43 - Cosine-Similarity :	Pearson: 0.9287	Spearman: 0.8453
2024-05-05 11:54:43 - Manhattan-Distance:	Pearson: 0.9189	Spearman: 0.8459
2024-05-05 11:54:43 - Euclidean-Distance:	Pearson: 0.9185	Spearman: 0.8437
2024-05-05 11:54:43 - Dot-Product-Similarity:	Pearson: 0.9281	Spearman: 0.8448
2024-05-05 11:55:01 - EmbeddingSimilarityEvaluator: Evaluating the model on the evaluator1 dataset in epoch 1 after 200 steps:
2024-05-05 11:55:02 - Cosine-Similarity :	Pearson: 0.9282	Spearman: 0.8451
2024-05-05 11:55:02 - Manhattan-Distance:	Pearson: 0.9167	Spearman: 0.8458
2024-05-05 11:55:02 - Euclidean-Distance:	Pearson: 0.9161	Spearman: 0.8434
2024-05-05 11:55:02 - Dot-Product-Similarity:	Pearson: 0.9275	Spearman: 0.8448
2024-05-05 11:55:20 - EmbeddingSimilarityEvaluator: Evaluating the model on the evaluator1 dataset in epoch 1 after 300 steps:
2024-05-05 

Iteration:   0%|          | 0/310 [00:00<?, ?it/s]

2024-05-05 11:55:42 - EmbeddingSimilarityEvaluator: Evaluating the model on the evaluator1 dataset in epoch 2 after 100 steps:
2024-05-05 11:55:43 - Cosine-Similarity :	Pearson: 0.9270	Spearman: 0.8441
2024-05-05 11:55:43 - Manhattan-Distance:	Pearson: 0.9165	Spearman: 0.8451
2024-05-05 11:55:43 - Euclidean-Distance:	Pearson: 0.9160	Spearman: 0.8430
2024-05-05 11:55:43 - Dot-Product-Similarity:	Pearson: 0.9264	Spearman: 0.8438
2024-05-05 11:56:01 - EmbeddingSimilarityEvaluator: Evaluating the model on the evaluator1 dataset in epoch 2 after 200 steps:
2024-05-05 11:56:03 - Cosine-Similarity :	Pearson: 0.9278	Spearman: 0.8453
2024-05-05 11:56:03 - Manhattan-Distance:	Pearson: 0.9172	Spearman: 0.8456
2024-05-05 11:56:03 - Euclidean-Distance:	Pearson: 0.9168	Spearman: 0.8437
2024-05-05 11:56:03 - Dot-Product-Similarity:	Pearson: 0.9272	Spearman: 0.8448
2024-05-05 11:56:20 - EmbeddingSimilarityEvaluator: Evaluating the model on the evaluator1 dataset in epoch 2 after 300 steps:
2024-05-05 

Iteration:   0%|          | 0/310 [00:00<?, ?it/s]

2024-05-05 11:56:43 - EmbeddingSimilarityEvaluator: Evaluating the model on the evaluator1 dataset in epoch 3 after 100 steps:
2024-05-05 11:56:44 - Cosine-Similarity :	Pearson: 0.9283	Spearman: 0.8456
2024-05-05 11:56:44 - Manhattan-Distance:	Pearson: 0.9182	Spearman: 0.8459
2024-05-05 11:56:44 - Euclidean-Distance:	Pearson: 0.9176	Spearman: 0.8438
2024-05-05 11:56:44 - Dot-Product-Similarity:	Pearson: 0.9277	Spearman: 0.8451
2024-05-05 11:56:44 - Save model to model/training_nli_distilbert-model
2024-05-05 11:57:02 - EmbeddingSimilarityEvaluator: Evaluating the model on the evaluator1 dataset in epoch 3 after 200 steps:
2024-05-05 11:57:04 - Cosine-Similarity :	Pearson: 0.9292	Spearman: 0.8456
2024-05-05 11:57:04 - Manhattan-Distance:	Pearson: 0.9192	Spearman: 0.8461
2024-05-05 11:57:04 - Euclidean-Distance:	Pearson: 0.9186	Spearman: 0.8439
2024-05-05 11:57:04 - Dot-Product-Similarity:	Pearson: 0.9287	Spearman: 0.8450
2024-05-05 11:57:04 - Save model to model/training_nli_distilbert-

Iteration:   0%|          | 0/310 [00:00<?, ?it/s]

2024-05-05 11:57:44 - EmbeddingSimilarityEvaluator: Evaluating the model on the evaluator1 dataset in epoch 4 after 100 steps:
2024-05-05 11:57:45 - Cosine-Similarity :	Pearson: 0.9293	Spearman: 0.8453
2024-05-05 11:57:45 - Manhattan-Distance:	Pearson: 0.9191	Spearman: 0.8459
2024-05-05 11:57:45 - Euclidean-Distance:	Pearson: 0.9186	Spearman: 0.8439
2024-05-05 11:57:45 - Dot-Product-Similarity:	Pearson: 0.9287	Spearman: 0.8448
2024-05-05 11:58:03 - EmbeddingSimilarityEvaluator: Evaluating the model on the evaluator1 dataset in epoch 4 after 200 steps:
2024-05-05 11:58:04 - Cosine-Similarity :	Pearson: 0.9288	Spearman: 0.8453
2024-05-05 11:58:04 - Manhattan-Distance:	Pearson: 0.9183	Spearman: 0.8459
2024-05-05 11:58:04 - Euclidean-Distance:	Pearson: 0.9179	Spearman: 0.8439
2024-05-05 11:58:04 - Dot-Product-Similarity:	Pearson: 0.9282	Spearman: 0.8447
2024-05-05 11:58:23 - EmbeddingSimilarityEvaluator: Evaluating the model on the evaluator1 dataset in epoch 4 after 300 steps:
2024-05-05 

## Now try the trained model:

In [30]:
output_path='model/Final_model'

new_model = SentenceTransformer(output_path).to(device)

2024-05-07 16:55:55 - Load pretrained SentenceTransformer: model/Final_model
2024-05-07 16:55:56 - Use pytorch device_name: cuda


In [31]:
names_genres_keywords_all = movies['title'] + ',' + movies['AllGenres'] + ',' + movies['AllKeywords']
names_genres_keywords_all = names_genres_keywords_all.tolist()

new_embedding = model.encode(names_genres_keywords_all, show_progress_bar=True)
new_embedding = np.array(new_embedding)

Batches:   0%|          | 0/1420 [00:00<?, ?it/s]

In [32]:
recommend_movies_with_index(movies, new_embedding, 6790, k=5)

The movie 6790: Love Actually 	 Genres: Comedy,Romance,Drama

Top 5 recommendations:
8235.  It's All About Love                 Genres: Drama,Romance,Science Fiction,Thriller
12081. Lust, Caution                       Genres: Action,Drama,Romance,Thriller
8797.  Love at First Bite                  Genres: Romance,Comedy,Horror
7282.  Laws of Attraction                  Genres: Action,Comedy,Romance,Thriller
40797. Signed, Sealed, Delivered           Genres: Comedy,Drama,Romance,TV Movie


In [33]:
inputs = ['Calling Dr. Death', 'Rambo III', 'Morituri', 'Farscape: The Peacekeeper Wars']
recommend_movies_with_titles(inputs, movies, new_embedding, model_=new_model, k=5)

Top 5 recommendations:
44139. Ghost Recon: Alpha                  Genres: Action,Science Fiction,Thriller,War
5937.  Enigma                              Genres: Adventure,Drama,Action,Thriller,Foreign
37433. Beyond                              Genres: Romance,Science Fiction,Drama
32575. Deathline                           Genres: Action,Drama,Science Fiction,Thriller
44217. Riding with Death                   Genres: Science Fiction,TV Movie,Action,Thriller


In [34]:
UserId = '17'
recommend_movies_with_history(UserId, ratings_for_history, movies, new_embedding, k=5)

Top 5 recommendations:
28518. Arnold                              Genres: Thriller,Horror,Comedy,Mystery
35023. The Hollow                          Genres: Horror,TV Movie
19579. The Screaming Skull                 Genres: Thriller,Horror
7565.  Bedlam                              Genres: Horror,Drama,Thriller
43150. Eloise                              Genres: Horror,Mystery,Thriller
