In [1]:
import random
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

import torch
from torch.utils.data import Dataset, DataLoader, random_split
from sentence_transformers import SentenceTransformer, InputExample, losses, models, datasets, evaluation, util

#### 1. Input a movie index, return recommendations:

In [2]:
def recommend_movies_with_index(movies_, embeddings, movie_index, k=5):
    if movie_index >= movies_.shape[0]:
        print(f'Invalid movie index {movie_index}!')
        return
    
    k = 5 if k <= 0 else k
    
    movie_emb = embeddings[movie_index]  # find the corresponding embedding vector first 
    cos_simi = cosine_similarity([movie_emb], embeddings)[0]  # compute the similarity to all movies
    
    top_indices = np.argsort(-cos_simi)[1:k+1]  # because the largest cosine similarity must be with itself
    rec_movies = movies_.iloc[top_indices]  # get the recommendation movies -> DataFrame

    print(f'The movie {movie_index}: {movies_.title.iloc[movie_index]} \t Genres: {movies_.AllGenres.iloc[movie_index]}\n')
    print(f'Top {k} recommendations:')
    
    for index, movie in rec_movies.iterrows():
        print('{:<5} {:<35} {}'.format(f'{index}.', f'{movie["title"]}', f'Genres: {movie["AllGenres"]}'))

#### 2. Input a list of movie(s), return recommendations: 

In [39]:
def recommend_movies_with_titles(titles, movies_, embeddings, model_, k=5):
    if not titles:
        print('Enter at least ONE movie title!')
        return 
    
    k = 5 if k <= 0 else k
    
    movies_input = movies_[movies_['title'].isin(titles)]  # All input movies
    
    # If ANY movie title that the user enters is NOT in our dataset
    # The only useful information we have is those titles -> encode those titles
    if movies_input.empty:
        print('Those movies are not in our dataset currently, but we can recommend you the following: \n')
        
        movies_remaining = movies_  # Every movie in the dataset is the "remaining movie"
        
        user_inputs = ','.join(titles)
        movies_input_emb = model_.encode(user_inputs)
        combined_emb = np.array(movies_input_emb)
        
    # If the user enters valid titles, then just get the embeddings for the movies
    else:
        movies_remaining = movies_[~movies_['title'].isin(titles)]  # All remaining movies
    
        movies_input_emb = embeddings[movies_input.index]  # embeddings is np.array
        combined_emb = np.mean(movies_input_emb, axis=0)
    
    movies_remaining_emb = embeddings[movies_remaining.index]
    
    cos_simi = cosine_similarity([combined_emb], movies_remaining_emb)[0]
    
    top_indices = np.argsort(-cos_simi)[:k]
    recommend_movies = movies_remaining.iloc[top_indices]
    
    print(f'Top {k} recommendations:')
    for index, movie in recommend_movies.iterrows():
        print('{:<5} {:<35} {}'.format(f'{index}.', f'{movie["title"]}', f'Genres: {movie["AllGenres"]}'))

#### 3. Input User ID, return recommendations:

In [35]:
def recommend_movies_with_history(user_id, ratings_, movies_, embeddings, k=5, m=3):
    user_list = ratings_['userId'].unique().tolist()
    if user_id not in user_list:
        print(f'Cannot find the user with id: {user_id}')
        return 
    
    user_history = ratings_[ratings_['userId'] == user_id]  # get the user's history (all)
    
    m = 3 if m <= 0 else m 
    m = min(m, len(user_history))
    
    # Sort movies with the highest user ratings and recently watched
    sorted_history = user_history.sort_values(by=['rating', 'timestamp'])
    sorted_m_history = sorted_history.iloc[:m]  # select the first m movies
    
    user_history_id = sorted_m_history['movieId'].tolist()
    
    history_titles = movies_[movies_['id'].isin(user_history_id)]
    history_titles = history_titles['title'].tolist()
    
    recommend_movies_with_titles(history_titles, movies_, embeddings, k)

### Additional: 
#### 4. From genres to movies

In [48]:
def genres_to_movies(genres_, model_, movies_, embeddings, k=5):
    if not genres_:
        print('Enter at least ONE genre!')
        return
    
    k = 5 if k <= 0 else k
    
    all_genres = ','.join(genres_)
    all_genres_emb = model_.encode(all_genres)
    all_genres_emb = np.array(all_genres_emb)
    
    cos_simi = cosine_similarity([all_genres_emb], embeddings)[0]
    
    top_indices = np.argsort(-cos_simi)[:k]
    recommend_movies = movies_.iloc[top_indices]
    
    print(f'Top {k} recommendations:')
    for index, movie in recommend_movies.iterrows():
        print('{:<5} {:<35} {}'.format(f'{index}.', f'{movie["title"]}', f'Genres: {movie["AllGenres"]}'))
        
    return recommend_movies['title'].tolist()

#### Load the datasets:

In [7]:
movies_all_path = 'dataset/movies_with_keywords.csv'
movies_small_path = 'dataset/movies_5000.csv'
ratings_for_history_path = 'dataset/ratings_for_history.csv'
ratings_for_history_small_path = 'dataset/ratings_for_history_small.csv'

movies = pd.read_csv(movies_all_path, keep_default_na=False, dtype=str)
movies_5000 = pd.read_csv(movies_small_path, keep_default_na=False, dtype=str)
ratings_for_history = pd.read_csv(ratings_for_history_path, dtype=str)
ratings_for_history_small = pd.read_csv(ratings_for_history_small_path, dtype=str)

movies.head()

Unnamed: 0,id,title,AllGenres,AllKeywords
0,862,Toy Story,"Animation,Comedy,Family","rivalry,toy comes to life,boy next door,toy"
1,8844,Jumanji,"Adventure,Fantasy,Family",
2,15602,Grumpier Old Men,"Romance,Comedy","fishing,best friend,duringcreditsstinger,old men"
3,31357,Waiting to Exhale,"Comedy,Drama,Romance","divorce,chick flick,interracial relationship,s..."
4,11862,Father of the Bride Part II,Comedy,"mother daughter relationship,baby,aging,daughter"


In [8]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45429 entries, 0 to 45428
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           45429 non-null  object
 1   title        45429 non-null  object
 2   AllGenres    45429 non-null  object
 3   AllKeywords  45429 non-null  object
dtypes: object(4)
memory usage: 1.4+ MB


#### Load the model, and encode the words (descriptions -> titles + genres + keywords):

In [9]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = SentenceTransformer('distilbert-base-nli-mean-tokens').to(device)
print(device)

cuda:0


In [10]:
names_genres_keywords = movies_5000['title'] + ',' + movies_5000['AllGenres'] + ',' + movies_5000['AllKeywords']
names_genres_keywords = names_genres_keywords.tolist()
names_genres_keywords[:5]

['Grumpier Old Men,Romance,Comedy,fishing,best friend,duringcreditsstinger,old men',
 'Waiting to Exhale,Comedy,Drama,Romance,divorce,chick flick,interracial relationship,single mother',
 'GoldenEye,Adventure,Action,Thriller,red army,electromagnetic pulse,special car,computer virus',
 'Money Train,Action,Comedy,Crime,subway,brother brother relationship,new york subway,train robbery',
 'Twelve Monkeys,Science Fiction,Thriller,Mystery,monkey,dystopia,stockholm syndrome,subterranean']

In [11]:
embedding = model.encode(names_genres_keywords, show_progress_bar=True)
embedding = np.array(embedding)

Batches:   0%|          | 0/157 [00:00<?, ?it/s]

#### Test my functions:

In [52]:
recommend_movies_with_index(movies_5000, embedding, 888, 5)

The movie 888: The Manchurian Candidate 	 Genres: Drama,Thriller,Mystery

Top 5 recommendations:
2405. Calling Dr. Death                   Genres: Thriller,Mystery
4413. Psycho-Pass: The Movie              Genres: Science Fiction,Animation,Action
261.  Rambo III                           Genres: Action,Adventure,Thriller,War
875.  Morituri                            Genres: Action,Drama,Thriller,War
1451. Farscape: The Peacekeeper Wars      Genres: Action,Comedy,Romance,Science Fiction,Thriller


In [53]:
inputs = ['Calling Dr. Death', 'Rambo III', 'Morituri', 'Farscape: The Peacekeeper Wars']
recommend_movies_with_titles(inputs, movies_5000, embedding, model_=model, k=5)

Top 5 recommendations:
4858. Riding with Death                   Genres: Science Fiction,TV Movie,Action,Thriller
1860. The Kings of Mykonos                Genres: Action,Comedy,Thriller,Science Fiction
888.  The Manchurian Candidate            Genres: Drama,Thriller,Mystery
4675. The Tribe                           Genres: Thriller,Drama,Science Fiction
3485. Spectral                            Genres: Thriller,Action,Science Fiction


In [56]:
inputs_2 = ['A movie I just made up', 'Only for the test']
recommend_movies_with_titles(inputs_2, movies_5000, embedding, model_=model, k=5)

Those movies are not in our dataset currently, but we can recommend you the following: 

Top 5 recommendations:
4663. Has the Film Already Started?       Genres: 
1726. The Man Who Left His Will on Film   Genres: Drama
1900. The Target Shoots First             Genres: Documentary
3059. The Lumière Brothers' First Films   Genres: Documentary
2793. Mail Early for Xmas                 Genres: Animation


In [57]:
UserId = '17'
recommend_movies_with_history(UserId, ratings_for_history_small, movies_5000, embedding, k=5, m=10)

Top 5 recommendations:
1123. Ferpect Crime                       Genres: Thriller,Comedy,Crime
3724. Buttwhistle                         Genres: Mystery,Romance,Comedy,Crime,Drama
657.  Above Suspicion                     Genres: Drama,Thriller
4265. Captain Fantastic                   Genres: Adventure,Comedy,Drama,Romance
928.  Tales of Manhattan                  Genres: Comedy,Drama,Romance


In [58]:
genres = ['Thriller', 'Crime', 'Comedy', 'Romance']
rec_movies = genres_to_movies(genres, model_=model, movies_=movies_5000, embeddings=embedding, k=5)
rec_movies

Top 5 recommendations:
115.  Charade                             Genres: Comedy,Mystery,Romance,Thriller
85.   Captives                            Genres: Drama,Thriller,Mystery,Romance
4203. Rajathandhiram                      Genres: Romance,Crime,Drama,Thriller,Comedy
1949. Carancho                            Genres: Thriller,Drama,Romance
3337. Amiche da morire                    Genres: Romance,Crime,Comedy


['Charade', 'Captives', 'Rajathandhiram', 'Carancho', 'Amiche da morire']

## Above is just testing my functions.
## Now we can finally fine tune the model!!!

#### The function compute the similarity of two movies:
Only consider genres, because there are way too many different keywords. But of course, if two movies have same keywords, which implies a much stronger correlation than merely having similar genres.

This can be used for improvement in the future.

In [13]:
def if_similar(movie1, movie2):
    genres1 = set(movie1['AllGenres'].strip().split(','))
    genres2 = set(movie2['AllGenres'].strip().split(','))
    
    min_len = min(len(genres1), len(genres2))  # the minimum of two lengths
    if min_len == 0:
        return 0.1  # I think return 0.1 instead of 0.0 is more appropriate (it's just my silly thought, maybe is not) 
    
    common_genres = genres1 & genres2  # Find the overlapped genres
    common_genres_num = len(common_genres)
    
    similarity = (common_genres_num / float(min_len))
    
    return round(similarity, 2)

In [14]:
class MyDataset(Dataset):
    def __init__(self, data):
        self.data = data
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx]

In [15]:
# class MyDataset(Dataset):
#     def __init__(self, data):
#         super(MyDataset, self).__init__()
#         self.data = data
# 
#     def __len__(self):
#         return len(self.data)
# 
#     def __getitem__(self, idx):
#         example = self.data[idx]
#         texts = (example.texts[0], example.texts[1])
#         label = example.label
#         return texts, label

#### Now generate data for training. Randomly selecting 100 movies can actually generate (99 + 0) x 100 / 2 = 4950 data points
By the way, I'm using the subset of movies here, and we can also use the full movies' dataset. It's the same thing, because the number of generated data points only related to the "movies_num" variable here.

In [16]:
movies_num = 100
sample_index = np.random.choice(range(len(movies_5000)), size=movies_num, replace=False)
sample_index = np.sort(sample_index)

examples = []
for i in sample_index:
    movie1 = movies.iloc[i]
    for j in sample_index:
        if j > i:
            movie2 = movies.iloc[j]
            similarity = if_similar(movie1, movie2)
            
            examples.append(InputExample(texts=[movie1['title'] + ',' + movie1['AllGenres'] + ',' +  movie1['AllKeywords'], movie2['title'] + ',' + movie2['AllGenres'] + ',' + movie2['AllKeywords']], label=similarity))

print(f'{len(examples)} data points in total')
print(examples[2024])

4950 data points in total
<InputExample> label: 0.0, texts: Glengarry Glen Ross,Crime,Drama,Mystery,company,real estate,shop,pressure; Escape from New York,Science Fiction,Action,reluctant hero,anti hero,police operation,hostage


In [17]:
text = examples[2024].texts[0]
text

'Glengarry Glen Ross,Crime,Drama,Mystery,company,real estate,shop,pressure'

In [18]:
my_dataset = MyDataset(examples)
train_loader = DataLoader(my_dataset, batch_size=16, shuffle=True)

print(len(train_loader))

310


In [19]:
# my_dataset = MyDataset(examples)
# 
# train_size = int(0.9 * len(my_dataset))
# val_size = len(my_dataset) - train_size
# 
# train_data, val_data = random_split(my_dataset, [train_size, val_size])
# 
# train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
# val_loader = DataLoader(val_data, batch_size=16, shuffle=False)

In [20]:
train_loss = losses.CosineSimilarityLoss(model=model)

model.fit(
    train_objectives=[(train_loader, train_loss)],
    epochs=10,
    warmup_steps=100,
    output_path='model/training_nli_distilbert-model'
)

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/310 [00:00<?, ?it/s]

Iteration:   0%|          | 0/310 [00:00<?, ?it/s]

Iteration:   0%|          | 0/310 [00:00<?, ?it/s]

Iteration:   0%|          | 0/310 [00:00<?, ?it/s]

Iteration:   0%|          | 0/310 [00:00<?, ?it/s]

Iteration:   0%|          | 0/310 [00:00<?, ?it/s]

Iteration:   0%|          | 0/310 [00:00<?, ?it/s]

Iteration:   0%|          | 0/310 [00:00<?, ?it/s]

Iteration:   0%|          | 0/310 [00:00<?, ?it/s]

Iteration:   0%|          | 0/310 [00:00<?, ?it/s]

In [21]:
# loss_func = losses.CosineSimilarityLoss(model=model)
# optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
# 
# all_loss = []
# epochs = 15
# 
# for epoch in range(epochs):
#     model.train()
#     
#     for texts, labels in train_loader:
#         optimizer.zero_grad()
#         
#         texts1, texts2 = texts
#         embed1 = model.encode(list(texts1), convert_to_tensor=True, show_progress_bar=False).to(device)
#         embed2 = model.encode(list(texts2), convert_to_tensor=True, show_progress_bar=False).to(device)
# 
#         labels = torch.tensor(labels, dtype=torch.float).to(device)
#  
#         features = {'sentence_embedding0': embed1, 'sentence_embedding1': embed2}
#         loss = loss_func(features, labels)
#         
#         loss.backward()
#         optimizer.step()
#     
#     model.eval()
#     with torch.no_grad():
#         total_loss = 0
#         total = 0
#         
#         for texts, labels in train_loader:
#             texts1, texts2 = texts
#             embed1 = model.encode(list(texts1), convert_to_tensor=True, show_progress_bar=False).to(device)
#             embed2 = model.encode(list(texts2), convert_to_tensor=True, show_progress_bar=False).to(device)
# 
#             labels = torch.tensor(labels, dtype=torch.float).to(device)
# 
#             features = {'sentence_embedding0': embed1, 'sentence_embedding1': embed2}
#             val_loss = loss_func(features, labels)
#             
#             total_loss += val_loss.item() * len(labels)
#             total += len(labels)
#         
#         all_loss.append(total_loss / total)
#         print(f'Epoch: {epoch+1:02d}, Loss: {(total_loss / total):.4f}')
#     
#     if (epoch + 1) % 5 == 0:
#         torch.save(model.state_dict(), f'models/model_in_epoch_{epoch+1}.pth')  # save the model every 5 epochs
#         # later use "model.load_state_dict(torch.load('path_to_save_model.pth'))" to get the model
# 
# print('Finished Training')

## Now try the trained model:

In [59]:
output_path='model/training_nli_distilbert-model'

new_model = SentenceTransformer(output_path).to(device)

In [60]:
new_embedding = model.encode(names_genres_keywords, show_progress_bar=True)
new_embedding = np.array(new_embedding)

Batches:   0%|          | 0/157 [00:00<?, ?it/s]

In [64]:
recommend_movies_with_index(movies_5000, new_embedding, 679, k=5)

The movie 679: The Lawless Heart 	 Genres: Drama,Comedy,Romance

Top 5 recommendations:
3238. Banning                             Genres: Romance,Drama
993.  The Night We Never Met              Genres: Romance,Comedy,Drama
4464. Freelance                           Genres: Comedy,Romance,Drama
1781. Heartbreaker                        Genres: Romance,Comedy
11.   Mr. Wrong                           Genres: Comedy,Romance


In [62]:
inputs = ['Calling Dr. Death', 'Rambo III', 'Morituri', 'Farscape: The Peacekeeper Wars']
recommend_movies_with_titles(inputs, movies_5000, embedding, model_=model, k=5)

Top 5 recommendations:
4858. Riding with Death                   Genres: Science Fiction,TV Movie,Action,Thriller
1860. The Kings of Mykonos                Genres: Action,Comedy,Thriller,Science Fiction
888.  The Manchurian Candidate            Genres: Drama,Thriller,Mystery
4675. The Tribe                           Genres: Thriller,Drama,Science Fiction
3485. Spectral                            Genres: Thriller,Action,Science Fiction


In [65]:
UserId = '17'
recommend_movies_with_history(UserId, ratings_for_history_small, movies_5000, embedding, 5)

Top 5 recommendations:
3049. Dark Dungeons                       Genres: Drama,Fantasy,Horror
1063. Boogeyman                           Genres: Thriller,Horror,Drama,Mystery
3779. Touch of Death                      Genres: Comedy,Horror
230.  The 'Burbs                          Genres: Comedy,Horror,Thriller
517.  Thir13en Ghosts                     Genres: Horror,Thriller
