In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate



In [180]:
info = pd.read_csv('info.csv')

# по описанию + TF-IDF

In [181]:
info['description'].fillna(' ', inplace=True)

In [182]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(info['description'])

In [183]:
cosine_sim1 = linear_kernel(tfidf_matrix, tfidf_matrix)

In [184]:
info = info.reset_index()
titles = info['title']
indices = pd.Series(info.index, index=info['title'])

In [186]:
def get_recommendations(title,cosine_sim):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [187]:
get_recommendations('My Neighbor Totoro (Tonari no Totoro) (1988)',cosine_sim1).head(10)

1088                        Whole Wide World, The (1996)
9289                               World of Glory (1991)
5446                                   Pickpocket (1959)
2753                               Private School (1983)
1617                       NeverEnding Story, The (1984)
3661                                 Dragonslayer (1981)
9512                                     The Hero (2017)
1018    Blood for Dracula (Andy Warhol's Dracula) (1974)
8844                    Girltrash: All Night Long (2014)
8674            Stuart Little 3: Call of the Wild (2005)
Name: title, dtype: object

Нет мультфильмов, вообще не очень похоже на исходный фильм. Мне бы не понравились такие рекомендации.

# по тегам, жанрам, актёрам и режиссёру

In [188]:
tags = pd.read_csv('tags.csv')

In [189]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [190]:
info['tags'] = pd.Series([np.NaN] * len(info), index=info.index)

In [191]:
for index, row in tags.iterrows():
    movie_id = row['movieId']
    tag = row['tag']
    if pd.isna(info.loc[info['movieId']==movie_id, 'tags']).iloc[0]:  
        info.loc[info['movieId']==movie_id, 'tags'] = tag  
    else:
        info.loc[info['movieId']==movie_id, 'tags'] = info.loc[info['movieId']==movie_id, 'tags'] + ', ' + tag  


In [192]:
info['actors'] = info['actors'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))

In [193]:
info['director'] = info['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
info['director'] = info['director'].apply(lambda x: ','.join([x, x,x]))

In [194]:
info['tags'] = info['tags'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))

In [195]:
info['genres'] = info['genres'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))

In [197]:
info['full_tags'] = info['actors'] + info['director'] + info['tags'] + info['genres'] 


In [198]:
count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
count_matrix = count.fit_transform(info['full_tags'])

In [199]:
info['full_tags'][0]

'tomhanks,timallen,donrickles,jimvarneyjohnlasseter,johnlasseter,johnlasseterpixar,pixar,funadventure,animation,children,comedy,fantasy'

In [200]:
count_matrix.toarray()[0].sum()

23

In [201]:
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [203]:
info = info.reset_index()
titles = info['title']
indices = pd.Series(info.index, index=info['title'])

In [204]:
get_recommendations('My Neighbor Totoro (Tonari no Totoro) (1988)',cosine_sim2).head(10)

5546    Kiki's Delivery Service (Majo no takkyûbin) (1...
6014                                    MirrorMask (2005)
7161                     Where the Wild Things Are (2009)
7181                            Christmas Carol, A (2009)
6685                    Spiderwick Chronicles, The (2008)
6411                                   The Odyssey (1997)
6946                 Earthsea (Legend of Earthsea) (2004)
7601                             Idiots and Angels (2008)
4769    Nausicaä of the Valley of the Wind (Kaze no ta...
1596                                Watership Down (1978)
Name: title, dtype: object

Здесь уже есть несколько детских мультфильмов, среди которых даже есть работы Миядзаки.

# по описанию и всей остальной информации с помощью SBERT

In [207]:
# скинула всё, что есть по фильмам в одну кучу
info['trash'] = info['description'] + info['full_tags']
info['trash'] = info['trash'].astype('str').apply(lambda x: str.lower(x))

In [208]:
info['trash'] 

0       a cowboy doll is profoundly threatened and jea...
1       when two kids find and play a magical board ga...
2       john and max resolve to save their beloved bai...
3       based on terry mcmillan's novel, this film fol...
4       george banks must deal not only with his daugh...
                              ...                        
9737    a young lord and his demon butler board a luxu...
9738    adaption of the sixth light novel of series, i...
9739    a woman deals with the toxic water scandal in ...
9740    the armed detective agency investigates a biza...
9741    outrageous, misogynistic and vulgar-to-the-max...
Name: trash, Length: 9742, dtype: object

In [209]:
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')


embeddings = model.encode(info['trash'], show_progress_bar=True)

cosine_sim3 = cosine_similarity(embeddings, embeddings)

Batches:   0%|          | 0/305 [00:00<?, ?it/s]

In [211]:
get_recommendations('My Neighbor Totoro (Tonari no Totoro) (1988)',cosine_sim3).head(10)

3984    Spirited Away (Sen to Chihiro no kamikakushi) ...
7607                               Mothra (Mosura) (1961)
9239                                Tears for Sale (2008)
2771                                  Sleepwalkers (1992)
5546    Kiki's Delivery Service (Majo no takkyûbin) (1...
1943    Dreamlife of Angels, The (Vie rêvée des anges,...
9572                         Belladonna of Sadness (1973)
7569                                     Insidious (2010)
2941                            Digimon: The Movie (2000)
3543                        Fat Girl (À ma soeur!) (2001)
Name: title, dtype: object

Также есть работы Миядзаки и мультфильмы - нормально..