In [1]:
import os

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.metrics.pairwise import cosine_distances
from tqdm import tqdm_notebook
from tqdm.auto import tqdm

In [2]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

def mean_pooling(model_output, attention_mask):
    """Average token embeddings over dim 1, counting only unmasked positions.

    Args:
        model_output: Indexable model result; element 0 holds the token embeddings.
        attention_mask: Mask tensor. It receives a trailing axis, is expanded
            to the size of the token embeddings and cast to float.

    Returns:
        The mask-weighted sum of the embeddings over dim 1 divided by the mask
        sum over dim 1. The divisor is clamped to at least 1e-9 so a fully
        masked row does not divide by zero.
    """
    embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    weighted_total = (embeddings * mask).sum(1)
    mask_total = mask.sum(1).clamp(min=1e-9)
    return weighted_total / mask_total


# Example inputs to embed
sentences = ['This is an example sentence', 'Each sentence is converted']

# Tokenizer and encoder are loaded from the same HuggingFace Hub checkpoint
MODEL_ID = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModel.from_pretrained(MODEL_ID)

# Encode the sentences as one padded, truncated batch of PyTorch tensors
encoded_input = tokenizer(
    sentences,
    padding=True,
    truncation=True,
    return_tensors='pt',
)

# Forward pass with gradient tracking switched off
with torch.no_grad():
    model_output = model(**encoded_input)

# Mask-aware mean pooling, then L2 normalisation along dim 1
pooled_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
sentence_embeddings = F.normalize(pooled_embeddings, p=2, dim=1)

# To inspect the result, uncomment:
#print("Sentence embeddings:")
#print(sentence_embeddings)

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/86.7M [00:00<?, ?B/s]

In [3]:

#tokenizer.save_pretrained("./models/tokenizer/")
#tokenizer2 = AutoTokenizer.from_pretrained("./models/tokenizer/")

#model.save_pretrained("./models/model/")
#model2 = AutoModel.from_pretrained("./models/model/")

In [4]:
from scipy import spatial

# Sanity check: compute the similarity of two sentence embeddings three
# different ways and print the results side by side for comparison.
sentences = ['This is the second sentence', 'This is the first sentence']

encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

# Perform pooling
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

# Normalize embeddings
sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

# Convert each embedding to a 1-D NumPy vector once instead of per call.
first_vec = sentence_embeddings[0].numpy()
second_vec = sentence_embeddings[1].numpy()

# For equal-length 1-D inputs and the default mode, np.correlate yields a
# single-element array containing their dot product.
my_rho = np.correlate(first_vec, second_vec)
# scipy's cosine distance is documented for 1-D vectors, so pass them as-is
# rather than as (n, 1) column matrices, which current SciPy rejects.
my_cos = 1 - spatial.distance.cosine(first_vec, second_vec)
# sklearn's pairwise helper works on 2-D (n_samples, n_features) arrays.
my_cos_sklearn = 1 - cosine_distances(first_vec.reshape(1, -1), second_vec.reshape(1, -1))
print(my_rho, my_cos, my_cos_sklearn)

[0.887133] 0.8871327042579651 [[0.88713276]]


In [5]:
#sentence_embeddings[1].numpy().reshape(1,-1)

In [6]:
#model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
#embeddings = model.encode(sentences)
#print(embeddings)

In [7]:
# Read the IMDB top-1000 CSV and keep only the title, genre and overview columns.
IMDB_CSV_PATH = "../input/imdb-dataset-of-top-1000-movies-and-tv-shows/imdb_top_1000.csv"
KEPT_COLUMNS = ["Series_Title", "Genre", "Overview"]

data = pd.read_csv(IMDB_CSV_PATH)[KEPT_COLUMNS]
data.head()

Unnamed: 0,Series_Title,Genre,Overview
0,The Shawshank Redemption,Drama,Two imprisoned men bond over a number of years...
1,The Godfather,"Crime, Drama",An organized crime dynasty's aging patriarch t...
2,The Dark Knight,"Action, Crime, Drama",When the menace known as the Joker wreaks havo...
3,The Godfather: Part II,"Crime, Drama",The early life and career of Vito Corleone in ...
4,12 Angry Men,"Crime, Drama",A jury holdout attempts to prevent a miscarria...


In [8]:
# Produce recommendations for one specific film, chosen by positional row index.
pred_item = 786

# Look the columns up by name so the prints do not depend on column order.
pred_row = data.iloc[pred_item]
print(pred_row["Series_Title"])
print(pred_row["Overview"])

overview_list = list(data["Overview"])
pred_item_overview = overview_list[pred_item]

# One [candidate overview, query overview] pair per film.
pair_list = [[item, pred_item_overview] for item in overview_list]

# NOTE: the pairs are intentionally not tokenized here with a single batched
# tokenizer(pair_list, ...) call. A batch of 2-element lists is encoded as
# joint text pairs (one sequence per pair), which does not give the two
# separate sentence embeddings that are compared later, and `overview_tokens`
# is rebuilt pair by pair in the following cell.

Good Bye Lenin!
In 1990, to protect his fragile mother from a fatal shock after a long coma, a young man must keep her from learning that her beloved nation of East Germany as she knew it has disappeared.


In [9]:
# Tokenize every entry of pair_list with its own tokenizer call.
overview_tokens = [
    tokenizer(pair, padding=True, truncation=True, return_tensors='pt')
    for pair in pair_list
]

In [10]:
# Compute the cosine similarity between the embedding of each overview and
# the embedding of the selected film's overview.

rho_list = []

# tqdm (from tqdm.auto) replaces the deprecated tqdm_notebook wrapper.
for encoded_input in tqdm(overview_tokens):

    with torch.no_grad():
        model_output = model(**encoded_input)

    # Perform pooling
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    # Normalize embeddings
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

    # The vectors are L2-normalised, so their dot product is the cosine
    # similarity. The value is kept as a 1-element array so that consumers
    # indexing each entry with [0] keep working.
    my_rho = np.atleast_1d(
        np.dot(sentence_embeddings[0].numpy(), sentence_embeddings[1].numpy())
    )

    rho_list.append(my_rho)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


  0%|          | 0/1000 [00:00<?, ?it/s]

In [11]:
# Unwrap the first element of every rho_list entry, store the values in a
# column named after the query row, and show the frame sorted by that column
# in descending order.
score_column = f"rho_{pred_item}"
rho_list_meanings = [entry[0] for entry in rho_list]
data[score_column] = rho_list_meanings
data.sort_values(by=score_column, ascending=False)

Unnamed: 0,Series_Title,Genre,Overview,rho_786
786,Good Bye Lenin!,"Comedy, Drama, Romance","In 1990, to protect his fragile mother from a ...",1.000000
282,Zerkalo,"Biography, Drama",A dying man in his forties remembers his past....,0.468973
936,Un long dimanche de fiançailles,"Drama, Mystery, Romance",Tells the story of a young woman's relentless ...,0.458085
897,Philomena,"Biography, Comedy, Drama",A world-weary political journalist picks up th...,0.431673
938,21 Grams,"Crime, Drama, Thriller",A freak accident brings together a critically ...,0.431545
...,...,...,...,...
823,Glengarry Glen Ross,"Crime, Drama, Mystery",An examination of the machinations behind the ...,-0.031893
512,Nueve reinas,"Crime, Drama, Thriller",Two con artists try to swindle a stamp collect...,-0.034521
635,Walk the Line,"Biography, Drama, Music",A chronicle of country music legend Johnny Cas...,-0.034901
441,The Killing,"Crime, Drama, Film-Noir",Crook Johnny Clay assembles a five man team to...,-0.041132
