In [None]:
import os

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors
from tqdm import tqdm_notebook
from tqdm.auto import tqdm

%matplotlib inline

In [None]:
!wget https://github.com/ALKONDR/netology-recsys/archive/refs/heads/master.zip
!unzip master.zip

--2023-11-24 16:56:25--  https://github.com/ALKONDR/netology-recsys/archive/refs/heads/master.zip
Resolving github.com (github.com)... 140.82.121.3
Connecting to github.com (github.com)|140.82.121.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://codeload.github.com/ALKONDR/netology-recsys/zip/refs/heads/master [following]
--2023-11-24 16:56:26--  https://codeload.github.com/ALKONDR/netology-recsys/zip/refs/heads/master
Resolving codeload.github.com (codeload.github.com)... 140.82.121.9
Connecting to codeload.github.com (codeload.github.com)|140.82.121.9|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/zip]
Saving to: ‘master.zip’

master.zip              [ <=>                ]   1.06M  6.09MB/s    in 0.2s    

2023-11-24 16:56:26 (6.09 MB/s) - ‘master.zip’ saved [1111929]

Archive:  master.zip
dfe2a910caf170a1f0fd2174867169ce737c9dc7
   creating: netology-recsys-master/
   creating: netology-recsy

In [None]:
# Directory (inside the unpacked archive) that holds the lecture-1 CSV files.
prefix = 'netology-recsys-master/lecture-1'

# Load the four tables in one pass; each name is bound to the DataFrame read
# from the CSV file of the same name.
links, movies, ratings, tags = (
    pd.read_csv(os.path.join(prefix, csv_name))
    for csv_name in ('links.csv', 'movies.csv', 'ratings.csv', 'tags.csv')
)

In [None]:
movies.head(10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [None]:
def change_string(s):
    """Turn a pipe-delimited label string into space-separated tokens.

    Spaces and hyphens are removed first, so a multi-word or hyphenated label
    (e.g. ``Sci-Fi``) collapses into a single token; every ``|`` separator is
    then turned into one space.
    """
    squeezed = s.replace('-', '').replace(' ', '')
    return squeezed.replace('|', ' ')

In [None]:
# Per the original author's note, this variant is the faster one.
# Normalise every genre string and collect the results in a plain Python list.
movie_genres = movies['genres'].map(change_string).tolist()

In [None]:
# movie_genres = [change_string(g) for g in movies.genres.values]

In [None]:
movie_genres[:10]

['Adventure Animation Children Comedy Fantasy',
 'Adventure Children Fantasy',
 'Comedy Romance',
 'Comedy Drama Romance',
 'Comedy',
 'Action Crime Thriller',
 'Comedy Romance',
 'Adventure Children',
 'Action',
 'Action Adventure Thriller']

In [None]:
# Bag-of-words over the genre strings: fit_transform learns the vocabulary from
# movie_genres and returns the per-movie token-count matrix (sparse).
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(movie_genres)

In [None]:
# Slice the sparse row first and densify only that row, instead of converting
# the whole matrix to dense just to look at one movie.
X_train_counts[0].todense()

matrix([[0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [None]:
# Slice the sparse row first and densify only that row, instead of converting
# the whole matrix to dense just to look at one movie.
X_train_counts[1].todense()

matrix([[0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [None]:
# The shape of a single row is available on the sparse slice directly; no need
# to build a dense copy of the matrix.
X_train_counts[0].shape

(1, 20)

In [None]:
movie_genres[0]

In [None]:
# To see which tag (token) each coordinate of the count vector stands for:
# vocabulary_ maps every learned token to its column index.
count_vect.vocabulary_

{'adventure': 1,
 'animation': 2,
 'children': 3,
 'comedy': 4,
 'fantasy': 8,
 'romance': 15,
 'drama': 7,
 'action': 0,
 'crime': 5,
 'thriller': 17,
 'horror': 10,
 'mystery': 13,
 'scifi': 16,
 'war': 18,
 'musical': 12,
 'documentary': 6,
 'imax': 11,
 'western': 19,
 'filmnoir': 9,
 'nogenreslisted': 14}

In [None]:
# Re-weight the raw genre counts with TF-IDF. The fitted transformer is kept so
# that the same weighting can be applied to query vectors in later cells.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [None]:
# Nearest-neighbour index over the TF-IDF genre vectors: 7 neighbours per query,
# Euclidean distance, n_jobs=-1 (use all available processors for the search).
neigh = NearestNeighbors(n_neighbors=7, n_jobs=-1, metric='euclidean')
neigh.fit(X_train_tfidf)

In [None]:
# Build a query from a pipe-delimited genre string and push it through the same
# pipeline as the training data: normalise -> counts -> TF-IDF -> neighbour lookup.
test = change_string("Adventure|Comedy|Fantasy|Crime")

# transform (not fit_transform): reuse the vocabulary and IDF weights learned above.
predict = count_vect.transform([test])
X_tfidf2 = tfidf_transformer.transform(predict)

# With return_distance=True the result is a (distances, indices) pair;
# later cells read the neighbour row positions from res[1][0].
res = neigh.kneighbors(X_tfidf2, return_distance=True)

In [None]:
res[1][0]

array([6774, 9096, 3582,  863, 3576, 3376, 2302])

In [None]:
"Adventure|Comedy|Fantasy|Crime"

In [None]:
movies.iloc[res[1][0]]

Unnamed: 0,movieId,title,genres
6774,60074,Hancock (2008),Action|Adventure|Comedy|Crime|Fantasy
9096,143559,L.A. Slasher (2015),Comedy|Crime|Fantasy
3582,4911,Jabberwocky (1977),Adventure|Comedy|Fantasy
863,1136,Monty Python and the Holy Grail (1975),Adventure|Comedy|Fantasy
3576,4899,Black Knight (2001),Adventure|Comedy|Fantasy
3376,4591,Erik the Viking (1989),Adventure|Comedy|Fantasy
2302,3052,Dogma (1999),Adventure|Comedy|Fantasy


In [None]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [None]:
# Attach user tags to movies by movieId. DataFrame.join is a left join by default,
# so a movie is repeated once per matching tag row (the left index is repeated too),
# and a movie with no tags keeps a single row with NaN in the tag-side columns.
movies_with_tags = movies.join(tags.set_index('movieId'), on='movieId')

In [None]:
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar,1139046000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar,1137207000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567.0,fun,1525286000.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,fantasy,1528844000.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,magic board game,1528844000.0


In [None]:
movies_with_tags[movies_with_tags.title == 'Toy Story (1995)']

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar,1139046000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar,1137207000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567.0,fun,1525286000.0


In [None]:
movies_with_tags['tag'].unique().shape

(1590,)

In [None]:
# Discard every row that has a missing value in any column, rebinding the name
# to the filtered frame.
movies_with_tags = movies_with_tags.dropna()

In [None]:
movies_with_tags.title.unique().shape

(1572,)

In [None]:
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar,1139046000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar,1137207000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567.0,fun,1525286000.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,fantasy,1528844000.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,magic board game,1528844000.0


In [None]:
# Collapse the per-tag rows into one space-separated tag string per title.
# NOTE: `movies` is rebound here from the movies DataFrame to a plain list of
# titles. Later cells index this list by position, so the name is kept, but
# cells above that expect the DataFrame need the CSV-loading cell re-run first.
tag_strings = []
movies = []

# groupby sorts by key by default, so the groups arrive in title order and
# position i in both lists refers to the same film.
# `tqdm` (from tqdm.auto) replaces the deprecated `tqdm_notebook` alias.
for movie, group in tqdm(movies_with_tags.groupby('title')):
    # Strip spaces and hyphens inside each tag so a multi-word tag stays one token.
    tag_strings.append(' '.join(str(s).replace(' ', '').replace('-', '') for s in group.tag.values))
    movies.append(movie)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for movie, group in tqdm_notebook(movies_with_tags.groupby('title')):


  0%|          | 0/1572 [00:00<?, ?it/s]

In [None]:
# One row per title: the film name next to its space-joined tag string.
movies_tags_filtered = pd.DataFrame(dict(movie=movies, tag=tag_strings))

In [None]:
movies_tags_filtered.head()

Unnamed: 0,movie,tag
0,(500) Days of Summer (2009),artistic Funny humorous inspiring intelligent ...
1,...And Justice for All (1979),lawyers
2,10 Cloverfield Lane (2016),creepy suspense
3,10 Things I Hate About You (1999),Shakespearesortof
4,101 Dalmatians (1996),dogs remake


In [None]:
movies_tags_filtered[movies_tags_filtered.movie.str.contains('Toy')].iloc[0]['tag']

'pixar pixar fun'

In [None]:
# Bag-of-words over the per-movie tag strings (rebinds count_vect / X_train_counts
# that previously held the genre-based model).
# The tag strings already contain exactly one whitespace-separated token per tag
# (spaces and hyphens inside a tag were stripped when the strings were built), so
# tokenise on whitespace only. The default pattern r"(?u)\b\w\w+\b" splits on
# punctuation and drops one-character tokens, which turns a tag such as "L.A."
# into an empty (all-zero) vector and makes it unsearchable.
count_vect = CountVectorizer(token_pattern=r"\S+")
X_train_counts = count_vect.fit_transform(movies_tags_filtered.tag.values)

In [None]:
# TF-IDF weighting for the tag counts. Rebinds tfidf_transformer / X_train_tfidf,
# replacing the genre-based objects fitted earlier in the notebook.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [None]:
# Nearest-neighbour index over the TF-IDF tag vectors: 10 neighbours per query,
# Euclidean distance, n_jobs=-1 (use all available processors for the search).
# Rebinds `neigh`, replacing the genre-based index.
neigh = NearestNeighbors(n_neighbors=10, n_jobs=-1, metric='euclidean')
neigh.fit(X_train_tfidf)

In [None]:
# Query the tag-based model with a single tag, using the same steps as for
# genres: normalise -> counts -> TF-IDF -> neighbour lookup.
test = change_string('L.A.')

# transform (not fit_transform): reuse the fitted vocabulary and IDF weights.
predict = count_vect.transform([test])
X_tfidf2 = tfidf_transformer.transform(predict)

# res is a (distances, indices) pair; the next cell iterates over res[1][0].
res = neigh.kneighbors(X_tfidf2, return_distance=True)

In [None]:
# Print each neighbour's title followed by its full tag string.
# `movies` and `tag_strings` are the parallel lists built from the per-title
# groupby, so the neighbour index i addresses the same film in both.
for i in res[1][0]:
    print(movies[i], tag_strings[i])

Magnolia (1999) L.A.
In a Lonely Place (1950) L.A.
Pulp Fiction (1994) gooddialogue greatsoundtrack nonlinear cultfilm drugs QuentinTarantino Tarantino hitmen 1990s achronological action actionpacked aggressive amazing amazingdialogues anthology assassin atmospheric AWESOME badass badlanguage badass bible biblicalreferences bigboyswithguns bignameactors Blackcomedy blackhumor blackhumour blood bloodsplatters bloody brucewillis brutality casualviolence characterdevelopment characters classic classicmovie coke comedy conversation cool coolstyle crime crimescenescrubbing cult cultclassic cultfilm dance dancing dark darkcomedy darkhumor dialogue different diner disjointedtimeline disturbing drama drugoverdose drugs drugs&music ensemblecast entertaining entirelydialogue episodic exciting fastpaced fastpaced filmnoir filmnoir foullanguage fun funny gangster gangsters genius goldenwatch gooddialogue goodmusic gore greatacting greatdialogue greatsoundtrack gritty guns HarveyKeitel heroin Highl

In [None]:
# Query the tag-based model with several tags at once, pipe-delimited
# (here the same tag set that Toy Story carries in the data).
test = change_string('pixar|pixar|fun')

# transform (not fit_transform): reuse the fitted vocabulary and IDF weights.
predict = count_vect.transform([test])
X_tfidf2 = tfidf_transformer.transform(predict)

# res is a (distances, indices) pair; the next cell iterates over res[1][0].
res = neigh.kneighbors(X_tfidf2, return_distance=True)

In [None]:
# Print each neighbour's title followed by its full tag string.
# `movies` and `tag_strings` are the parallel lists built from the per-title
# groupby, so the neighbour index i addresses the same film in both.
for i in res[1][0]:
    print(movies[i], tag_strings[i])

Toy Story (1995) pixar pixar fun
Bug's Life, A (1998) Pixar
Toy Story 2 (1999) animation Disney funny original Pixar sequel TomHanks Pixar
Magnolia (1999) L.A.
In a Lonely Place (1950) L.A.
Guardians of the Galaxy 2 (2017) fun
Up (2009) adventure BechdelTest:Fail cartoon children computeranimation divorce dogs dreams Pixar bittersweet emotional heartbreaking touching
Avengers, The (2012) CaptainAmerica silly superhero superheroteam fun greathumor visuallyappealing
The Lego Movie (2014) cheeky clever colorful feelgood fun imaginative quirky
Pulp Fiction (1994) gooddialogue greatsoundtrack nonlinear cultfilm drugs QuentinTarantino Tarantino hitmen 1990s achronological action actionpacked aggressive amazing amazingdialogues anthology assassin atmospheric AWESOME badass badlanguage badass bible biblicalreferences bigboyswithguns bignameactors Blackcomedy blackhumor blackhumour blood bloodsplatters bloody brucewillis brutality casualviolence characterdevelopment characters classic classicmo