In [1]:
import pandas as pd
import ast
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack, csr_matrix
from sklearn.cluster import MiniBatchKMeans

In [2]:
movies_metadata = pd.read_csv('datasets/movies_metadata.csv', low_memory=False)
keywords = pd.read_csv('datasets/keywords.csv', low_memory=False)

In [3]:
keywords[keywords.duplicated()]

Unnamed: 0,id,keywords
1465,105045,"[{'id': 7059, 'name': 'anti-communism'}, {'id'..."
9165,5511,"[{'id': 90, 'name': 'paris'}, {'id': 1010, 'na..."
9327,23305,[]
12066,14788,"[{'id': 9826, 'name': 'murder'}, {'id': 10183,..."
13375,141971,[]
...,...,...
40994,298721,"[{'id': 1227, 'name': 'cemetery'}, {'id': 1261..."
41230,97995,[]
45774,10991,"[{'id': 1262, 'name': 'mountain'}, {'id': 2802..."
45779,12600,"[{'id': 9663, 'name': 'sequel'}, {'id': 11451,..."


In [4]:
movies_metadata[movies_metadata.duplicated()]

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
1465,False,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,105045,tt0111613,de,Das Versprechen,"East-Berlin, 1961, shortly after the erection ...",...,1995-02-16,0.0,115.0,"[{'iso_639_1': 'de', 'name': 'Deutsch'}]",Released,"A love, a hope, a wall.",The Promise,False,5.0,1.0
9165,False,,0,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",,5511,tt0062229,fr,Le Samouraï,Hitman Jef Costello is a perfectionist who alw...,...,1967-10-25,39481.0,105.0,"[{'iso_639_1': 'fr', 'name': 'Français'}]",Released,There is no solitude greater than that of the ...,Le Samouraï,False,7.9,187.0
9327,False,,0,"[{'id': 12, 'name': 'Adventure'}, {'id': 16, '...",,23305,tt0295682,en,The Warrior,"In feudal India, a warrior (Khan) who renounce...",...,2001-09-23,0.0,86.0,"[{'iso_639_1': 'hi', 'name': 'हिन्दी'}]",Released,,The Warrior,False,6.3,15.0
13375,False,,0,"[{'id': 53, 'name': 'Thriller'}, {'id': 9648, ...",,141971,tt1180333,fi,Blackout,Recovering from a nail gun shot to the head an...,...,2008-12-26,0.0,108.0,"[{'iso_639_1': 'fi', 'name': 'suomi'}]",Released,Which one is the first to return - memory or t...,Blackout,False,6.7,3.0
16764,False,,0,"[{'id': 53, 'name': 'Thriller'}, {'id': 9648, ...",,141971,tt1180333,fi,Blackout,Recovering from a nail gun shot to the head an...,...,2008-12-26,0.0,108.0,"[{'iso_639_1': 'fi', 'name': 'suomi'}]",Released,Which one is the first to return - memory or t...,Blackout,False,6.7,3.0
21165,False,,0,"[{'id': 14, 'name': 'Fantasy'}, {'id': 18, 'na...",,119916,tt0080000,en,The Tempest,"Prospero, the true Duke of Milan is now living...",...,1980-02-27,0.0,123.0,[],Released,,The Tempest,False,0.0,0.0
21854,False,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 878, 'nam...",,152795,tt1821641,en,The Congress,More than two decades after catapulting to sta...,...,2013-05-16,455815.0,122.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,The Congress,False,6.4,165.0
22151,False,,0,"[{'id': 28, 'name': 'Action'}, {'id': 27, 'nam...",http://www.daysofdarknessthemovie.com/,18440,tt0499456,en,Days of Darkness,When a comet strikes Earth and kicks up a clou...,...,2007-01-01,0.0,89.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Days of Darkness,False,5.0,5.0
23044,False,,0,"[{'id': 18, 'name': 'Drama'}]",,25541,tt1327820,da,Broderskab,Former Danish servicemen Lars and Jimmy are th...,...,2009-10-21,0.0,90.0,"[{'iso_639_1': 'da', 'name': 'Dansk'}]",Released,,Brotherhood,False,7.1,21.0
24844,False,,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",http://www.dealthemovie.com/,11115,tt0446676,en,Deal,As an ex-gambler teaches a hot-shot college ki...,...,2008-01-29,0.0,85.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Deal,False,5.2,22.0


In [5]:
movies_metadata['id'] = pd.to_numeric(movies_metadata['id'], errors='coerce')
keywords['id'] = pd.to_numeric(keywords['id'], errors='coerce')

movies_metadata = movies_metadata.dropna(subset=['id'])
keywords = keywords.dropna(subset=['id'])

movies_metadata['id'] = movies_metadata['id'].astype(int)
keywords['id'] = keywords['id'].astype(int)

movies_metadata = movies_metadata.drop_duplicates()
keywords = keywords.drop_duplicates()

movies = movies_metadata.merge(keywords, on='id', how='left') 
movies

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,keywords
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45441,False,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n...",http://www.imdb.com/title/tt6209470/,439050,tt6209470,fa,رگ خواب,Rising and falling between a man and woman.,...,0.0,90.0,"[{'iso_639_1': 'fa', 'name': 'فارسی'}]",Released,Rising and falling between a man and woman,Subdue,False,4.0,1.0,"[{'id': 10703, 'name': 'tragic love'}]"
45442,False,,0,"[{'id': 18, 'name': 'Drama'}]",,111109,tt2028550,tl,Siglo ng Pagluluwal,An artist struggles to finish his work while a...,...,0.0,360.0,"[{'iso_639_1': 'tl', 'name': ''}]",Released,,Century of Birthing,False,9.0,3.0,"[{'id': 2679, 'name': 'artist'}, {'id': 14531,..."
45443,False,,0,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",,67758,tt0303758,en,Betrayal,"When one of her hits goes wrong, a professiona...",...,0.0,90.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,A deadly game of wits.,Betrayal,False,3.8,6.0,[]
45444,False,,0,[],,227506,tt0008536,en,Satana likuyushchiy,"In a small town live two brothers, one a minis...",...,0.0,87.0,[],Released,,Satan Triumphant,False,0.0,0.0,[]


In [6]:
movies['popularity'] = pd.to_numeric(movies['popularity'], errors='coerce')
movies['release_date'] = pd.to_datetime(movies['release_date'], errors='coerce')
movies['release_year'] = movies['release_date'].dt.year
movies = movies.drop('release_date', axis=1)

cols_to_scale = ['vote_average', 'popularity', 'runtime', 'release_year']

for col in cols_to_scale:
    median_val = movies[col].median()
    movies[col] = movies[col].fillna(median_val)

scaler = StandardScaler()

movies[cols_to_scale] = scaler.fit_transform(movies[cols_to_scale])
movies

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,keywords,release_year
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,-0.342834,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,1.081941,5415.0,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...",0.128966
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,0.257656,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,0.666142,2413.0,"[{'id': 10090, 'name': 'board game'}, {'id': 1...",0.128966
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,0.179332,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,0.458243,92.0,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392...",0.128966
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,0.858146,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,0.250344,34.0,"[{'id': 818, 'name': 'based on novel'}, {'id':...",0.128966
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,0.309873,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,0.042445,173.0,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...",0.128966
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45441,False,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n...",http://www.imdb.com/title/tt6209470/,439050,tt6209470,fa,رگ خواب,Rising and falling between a man and woman.,...,-0.107859,"[{'iso_639_1': 'fa', 'name': 'فارسی'}]",Released,Rising and falling between a man and woman,Subdue,False,-0.841127,1.0,"[{'id': 10703, 'name': 'tragic love'}]",0.378624
45442,False,,0,"[{'id': 18, 'name': 'Drama'}]",,111109,tt2028550,tl,Siglo ng Pagluluwal,An artist struggles to finish his work while a...,...,6.941370,"[{'iso_639_1': 'tl', 'name': ''}]",Released,,Century of Birthing,False,1.757613,3.0,"[{'id': 2679, 'name': 'artist'}, {'id': 14531,...",0.794720
45443,False,,0,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",,67758,tt0303758,en,Betrayal,"When one of her hits goes wrong, a professiona...",...,-0.107859,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,A deadly game of wits.,Betrayal,False,-0.945076,6.0,[],0.461843
45444,False,,0,[],,227506,tt0008536,en,Satana likuyushchiy,"In a small town live two brothers, one a minis...",...,-0.186184,[],Released,,Satan Triumphant,False,-2.920119,0.0,[],-3.116588


In [7]:
movies['genres'] = movies['genres'].apply(ast.literal_eval)
movies['keywords'] = movies['keywords'].fillna('[]').apply(ast.literal_eval)

In [8]:
genres_count = (
    movies['genres']
    .explode()
    .dropna()
    .apply(lambda x: x.get('name') if isinstance(x, dict) else None)
    .dropna()
    .value_counts()
)

genres = genres_count[genres_count > 1].index.to_list()

genres_sets = movies['genres'].apply(
    lambda genres: {genre.get('name') for genre in genres
                    if isinstance(genre, dict) and genre.get('name')
                }
)

for g in genres:
    movies[f'genre_{g}'] = genres_sets.apply(lambda genre: int(g in genre))

movies['genres'] = genres_sets
movies

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,genre_Family,genre_Mystery,genre_Fantasy,genre_Animation,genre_Foreign,genre_Music,genre_History,genre_War,genre_Western,genre_TV Movie
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"{Animation, Family, Comedy}",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1,0,0,1,0,0,0,0,0,0
1,False,,65000000,"{Adventure, Fantasy, Family}",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1,0,1,0,0,0,0,0,0,0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"{Comedy, Romance}",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,0,0,0,0,0,0,0,0,0,0
3,False,,16000000,"{Romance, Drama, Comedy}",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,0,0,0,0,0,0,0,0,0,0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,{Comedy},,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45441,False,,0,"{Family, Drama}",http://www.imdb.com/title/tt6209470/,439050,tt6209470,fa,رگ خواب,Rising and falling between a man and woman.,...,1,0,0,0,0,0,0,0,0,0
45442,False,,0,{Drama},,111109,tt2028550,tl,Siglo ng Pagluluwal,An artist struggles to finish his work while a...,...,0,0,0,0,0,0,0,0,0,0
45443,False,,0,"{Thriller, Action, Drama}",,67758,tt0303758,en,Betrayal,"When one of her hits goes wrong, a professiona...",...,0,0,0,0,0,0,0,0,0,0
45444,False,,0,{},,227506,tt0008536,en,Satana likuyushchiy,"In a small town live two brothers, one a minis...",...,0,0,0,0,0,0,0,0,0,0


In [9]:
movies['keywords'] = movies['keywords'].apply(
    lambda words: ' '.join([word['name'].lower() for word in words 
                            if isinstance(word, dict) and 'name' in word])
)

movies

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,genre_Family,genre_Mystery,genre_Fantasy,genre_Animation,genre_Foreign,genre_Music,genre_History,genre_War,genre_Western,genre_TV Movie
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"{Animation, Family, Comedy}",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1,0,0,1,0,0,0,0,0,0
1,False,,65000000,"{Adventure, Fantasy, Family}",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1,0,1,0,0,0,0,0,0,0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"{Comedy, Romance}",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,0,0,0,0,0,0,0,0,0,0
3,False,,16000000,"{Romance, Drama, Comedy}",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,0,0,0,0,0,0,0,0,0,0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,{Comedy},,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45441,False,,0,"{Family, Drama}",http://www.imdb.com/title/tt6209470/,439050,tt6209470,fa,رگ خواب,Rising and falling between a man and woman.,...,1,0,0,0,0,0,0,0,0,0
45442,False,,0,{Drama},,111109,tt2028550,tl,Siglo ng Pagluluwal,An artist struggles to finish his work while a...,...,0,0,0,0,0,0,0,0,0,0
45443,False,,0,"{Thriller, Action, Drama}",,67758,tt0303758,en,Betrayal,"When one of her hits goes wrong, a professiona...",...,0,0,0,0,0,0,0,0,0,0
45444,False,,0,{},,227506,tt0008536,en,Satana likuyushchiy,"In a small town live two brothers, one a minis...",...,0,0,0,0,0,0,0,0,0,0


___
____

In [10]:
stemmer = SnowballStemmer('english')
stop_words = set(stopwords.words('english'))

def clean_text(text: str) -> str:
    if not isinstance(text, str):
        return ''
    
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    words = text.split()
    cleaned_words = [stemmer.stem(w) for w in words if w not in stop_words]

    return ' '.join(cleaned_words)

In [11]:
movies['overview'] = movies['overview'].fillna('')
movies['tagline'] = movies['tagline'].fillna('')
movies['title'] = movies['title'].fillna('')

movies['overview_clean'] = movies['overview'].apply(clean_text)
movies['tagline_clean'] = movies['tagline'].apply(clean_text)
movies['title_clean'] = movies['title'].apply(clean_text)

movies['soup'] = (
    movies['keywords'] + ' ' + 
    movies['overview_clean'] + ' ' + 
    movies['tagline_clean'] + ' ' +
    movies['title_clean']
)

movies['soup']

0        jealousy toy boy friendship friends rivalry bo...
1        board game disappearance based on children's b...
2        fishing best friend duringcreditsstinger old m...
3        based on novel interracial relationship single...
4        baby midlife crisis confidence aging daughter ...
                               ...                        
45441    tragic love rise fall man woman rise fall man ...
45442    artist play pinoy artist struggl finish work s...
45443     one hit goe wrong profession assassin end sui...
45444     small town live two brother one minist one hu...
45445     50 year decriminalis homosexu uk director dai...
Name: soup, Length: 45446, dtype: object

____
____

In [12]:
tfidf = TfidfVectorizer(
    max_features=10000,
    strip_accents='unicode',
    ngram_range=(1, 2)
)

tfidf_matrix = tfidf.fit_transform(movies['soup'])

In [13]:
genres_cols = [f'genre_{genre}' for genre in genres]
cols_to_add = genres_cols + cols_to_scale

additional_features = movies[cols_to_add].values
additional_features_sparse = csr_matrix(additional_features)
final_matrix = hstack([tfidf_matrix, additional_features_sparse])

___
___

In [14]:
n_clusters = 20
kmeans = MiniBatchKMeans(
    n_clusters=n_clusters, 
    random_state=42, 
    batch_size=2048,
    n_init=10
)

kmeans.fit(final_matrix)
movies['cluster'] = kmeans.labels_

In [18]:
for i in range(n_clusters):
    print(f"\n--- Cluster {i} ---")
    examples = movies[movies['cluster'] == i][['title', 'genres']].head(20)
    print(examples)


--- Cluster 0 ---
                               title                     genres
731          The Man from Down Under               {War, Drama}
838                             Wife                    {Drama}
1120                   Love In Bloom                         {}
3021             Tarzan the Fearless        {Adventure, Action}
3052                    Hi-Yo Silver                  {Western}
3213              The Legend of Lobo        {Family, Adventure}
3262                 Song of Freedom                         {}
3339   Hillbillys in a Haunted House    {Music, Horror, Comedy}
3656               Project Moon Base          {Science Fiction}
6364                  Ring of Terror         {Thriller, Horror}
7090        The Beast of Yucca Flats  {Horror, Science Fiction}
7478            Hallelujah I'm a Bum           {Music, Romance}
8301             The Man from Beyond                         {}
8313               Old San Francisco  {Romance, Drama, History}
8367           Revolt

##### Вывод:
Результаты кластеризации показывает, что комбинированный подход (TF-IDF по описаниям + метаданные) работает эффективно, создавая логичные группы фильмов, опираясь на два доминирующих фактора: жанры и год выпуска

Ключевые наблюдения:
- Временное разделение: Признак release_year сыграл одну из ключевых ролей. Фильмы 90х отделились от классики 30-50х, даже если у них схожие жанры. Это может оказаться полезно для рекомендаций типа "в том же стиле", хотя и может мешать для поиска кросс-временных аналого
- Узкие жанры: для них сформировались практически идеальные кластеры (документалка, хорроры, экшн/триллеры)
- Проблема массовых жанров: драммы и комедии оказались размыты, как самые распространенные жанры

In [19]:
n_clusters = 30
kmeans_30 = MiniBatchKMeans(
    n_clusters=n_clusters, 
    random_state=42, 
    batch_size=2048,
    n_init=10
)

kmeans_30.fit(final_matrix)
movies['cluster_30'] = kmeans_30.labels_

In [20]:
for i in range(n_clusters):
    print(f"\n--- Cluster {i} ---")
    examples = movies[movies['cluster_30'] == i][['title', 'genres']].head(15)
    print(examples)


--- Cluster 0 ---
                   title                                genres
0              Toy Story           {Animation, Family, Comedy}
1                Jumanji          {Adventure, Fantasy, Family}
5                   Heat      {Thriller, Action, Crime, Drama}
9              GoldenEye         {Adventure, Action, Thriller}
20            Get Shorty             {Thriller, Crime, Comedy}
22             Assassins  {Adventure, Action, Crime, Thriller}
31        Twelve Monkeys  {Mystery, Thriller, Science Fiction}
46                 Se7en            {Mystery, Thriller, Crime}
49    The Usual Suspects              {Thriller, Crime, Drama}
51      Mighty Aphrodite                     {Romance, Comedy}
69   From Dusk Till Dawn     {Thriller, Action, Crime, Horror}
75             Screamers             {Horror, Science Fiction}
93          Broken Arrow  {Adventure, Action, Drama, Thriller}
110  Rumble in the Bronx     {Thriller, Action, Crime, Comedy}
130                 Jade  {Mystery, 

Сравнение с разделением на 20 кластеров:
- Стабильность документалок: В обоих случаях они (Cluster 2 здесь и Cluster 16 ранее) выделяются в идеально чистую группу. Это самый надежный кластер
- Дрейф "Блокбастеров": Если раньше хиты 90-х дробились на семейные и боевики, то в текущем Cluster 0 они объединились в одну группу. Это может говорить о том, что признак "популярности" или "года" перевесил жанровые различия между мультиком и боевиком
- Безжанровые: В новом разделении более явно прослеживается проблема фильмов с отсутствующими жанрами. Раньше они были сильнее размыты по разным группам, сейчас же алгоритм четче собрал "неизвестное/неполное" в отдельные группы.
- Ретро-смешение: Группа классики здесь стала более смешанной, объединив комедии и ужасы. В прошлом варианте классические хорроры и старые комедии разделялись лучше

Текущая модель стала более "грубой" в различении жанров внутри одной эпохи (смешала семейное и боевики 90-х), но зато лучше изолировала фильмы с неполными данными