# Chapter 7 - K-Means Clustering

This chapter switches to a Jupyter notebook because the fit operations start to take noticeably longer. In Jupyter, the results of each code cell stay in memory, so you only need to run a slow cell (and wait for it) once. After that, later cells can work with those in-memory results directly.

## Feature Engineering

In [15]:
import numpy as np
import pandas as pd
import nltk
from sklearn.feature_extraction.text import CountVectorizer

# NLTK's English stop-word list, extended with a few extra filler tokens
stop_words = nltk.corpus.stopwords.words('english')
stop_words = stop_words + ['one', 'two', 'get']

# load the corpus created in b_movie_recommender.py
# dropna() with no arguments removes every row that has a missing value in ANY
# column; it is chained onto the load so the cell never mutates a frame in place.
# NOTE(review): if only a missing description should disqualify a row,
# dropna(subset=['description']) would keep more movies -- confirm intent.
df = pd.read_csv('./data/norm_corpus.csv').dropna()
# info() prints its own report and returns None, so wrapping it in print()
# would emit a stray "None" line after the summary
df.info()

norm_corpus = df['description']

# Bag-of-words counts over unigrams and bigrams; terms appearing in fewer than
# 10 documents or in more than 80% of documents are discarded
cv = CountVectorizer(ngram_range=(1, 2), min_df=10, max_df=0.8,
                     stop_words=stop_words)
cv_matrix = cv.fit_transform(norm_corpus)
print('Norm corpus matrix shape:\n', cv_matrix.shape, '\n')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3959 entries, 0 to 4798
Data columns (total 7 columns):
Unnamed: 0     3959 non-null int64
title          3959 non-null object
tagline        3959 non-null object
overview       3959 non-null object
genres         3959 non-null object
popularity     3959 non-null float64
description    3959 non-null object
dtypes: float64(1), int64(1), object(5)
memory usage: 247.4+ KB
None
Norm corpus matrix shape:
 (3959, 2551) 



## K-Means Clustering

In [16]:
# Kmeans clustering starting on page 502
from sklearn.cluster import KMeans

# Number of clusters requested from K-Means; reused by the reporting loops below
NUM_CLUSTERS = 6

# Build the estimator first, then fit it on the count-vector matrix.
# random_state is pinned so repeated runs give the same clustering.
km = KMeans(
    n_clusters=NUM_CLUSTERS,
    max_iter=10000,
    n_init=50,
    random_state=42,
)
km.fit(cv_matrix)
print('K-Means:\n', km, '\n')

K-Means:
 KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=10000,
    n_clusters=6, n_init=50, n_jobs=None, precompute_distances='auto',
    random_state=42, tol=0.0001, verbose=0) 



In [17]:
# Store the cluster id that K-Means assigned to each row as a new column
df['kmeans_cluster'] = km.labels_

# viewing distribution of movies across the cluster
from collections import Counter
# .tolist() turns the NumPy integer labels into plain Python ints so the Counter
# prints as {2: 2187, ...} on every NumPy version (NumPy >= 2 would otherwise
# render each key as np.int32(2))
print('Labels:\n', Counter(km.labels_.tolist()))

Labels:
 Counter({2: 2187, 3: 524, 1: 441, 4: 419, 0: 387, 5: 1})


In [22]:
# Keep the 20 most popular movies of every cluster: sort by cluster id and
# popularity (both descending), then take the first 20 rows of each group
movie_clusters = (df[['title', 'kmeans_cluster', 'popularity']]
                 .sort_values(by=['kmeans_cluster', 'popularity'], ascending=False)
                 .groupby('kmeans_cluster').head(20))
movie_clusters = movie_clusters.copy(deep=True)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() when the installed version provides it and
# fall back to the old method on older releases
if hasattr(cv, 'get_feature_names_out'):
    feature_names = list(cv.get_feature_names_out())
else:
    feature_names = cv.get_feature_names()
topn_features = 15
# For each centroid, column indices ordered from the largest weight to the smallest
ordered_centroids = km.cluster_centers_.argsort()[:, ::-1]

# get key features for each cluster
# get movies belonging to each cluster
for cluster_num in range(NUM_CLUSTERS):
    key_features = [feature_names[index] for index in ordered_centroids[cluster_num, :topn_features]]
    movies = movie_clusters[movie_clusters['kmeans_cluster'] == cluster_num]['title'].values.tolist()
    # Clusters are displayed 1-based, while the labels stored in df are 0-based
    print('CLUSTER #' + str(cluster_num+1))
    print('Key Features:', key_features)
    print('Popular Movies:', movies)
    print('-'*80)

CLUSTER #1
Key Features: ['love', 'life', 'story', 'find', 'man', 'young', 'falls', 'true', 'woman', 'finds', 'fall', 'father', 'never', 'new', 'falls love']
Popular Movies: ['Pirates of the Caribbean: The Curse of the Black Pearl', 'Frozen', 'Forrest Gump', 'Pirates of the Caribbean: On Stranger Tides', 'Twilight', 'Spider-Man 3', 'Bruce Almighty', 'Quantum of Solace', 'The Twilight Saga: Eclipse', 'The Twilight Saga: New Moon', 'Aladdin', 'The Age of Adaline', 'The Fault in Our Stars', 'Amélie', 'Sex Tape', 'Million Dollar Baby', 'The Hunger Games', 'Grease', 'Troy', 'Room']
--------------------------------------------------------------------------------
CLUSTER #2
Key Features: ['new', 'york', 'new york', 'city', 'young', 'family', 'york city', 'years', 'friends', 'man', 'must', 'town', 'find', 'back', 'home']
Popular Movies: ['Terminator Genisys', 'Fight Club', 'Teenage Mutant Ninja Turtles', 'Pixels', 'Despicable Me 2', 'Avengers: Age of Ultron', 'Night at the Museum: Secret of th

## More Movie Groupings

In [23]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the rows (movies) of the count matrix;
# each movie's row of similarities is used as its feature vector below
cosine_sim_features = cosine_similarity(cv_matrix)
# NOTE: this rebinds `km`, replacing the model fitted on the raw count vectors.
# Cells that read km.cluster_centers_ as term weights must not be re-run after
# this one without re-fitting the earlier model first.
km = KMeans(n_clusters=NUM_CLUSTERS, max_iter=10000, n_init=50, random_state=42).fit(cosine_sim_features)
# .tolist() yields plain Python ints so the Counter prints the same on every
# NumPy version (NumPy >= 2 would otherwise show keys such as np.int32(3))
print('Labels:\n', Counter(km.labels_.tolist()))

Labels:
 Counter({3: 2030, 2: 576, 4: 438, 5: 390, 1: 280, 0: 245})


In [24]:
# Replace the stored cluster ids with the labels of the currently bound model
df['kmeans_cluster'] = km.labels_

# Order by cluster id, then popularity (both descending), and keep the first
# 20 rows of each cluster, i.e. its most popular movies
ranked_movies = df[['title', 'kmeans_cluster', 'popularity']].sort_values(
    by=['kmeans_cluster', 'popularity'], ascending=False)
movie_clusters = ranked_movies.groupby('kmeans_cluster').head(20)

# get movies belonging to each cluster
for cluster_num in range(NUM_CLUSTERS):
    in_cluster = movie_clusters['kmeans_cluster'] == cluster_num
    movies = movie_clusters.loc[in_cluster, 'title'].values.tolist()
    # Displayed cluster numbers are 1-based; the stored labels are 0-based
    print('CLUSTER #' + str(cluster_num + 1))
    print('Popular Movies:', movies)
    print('-' * 80)

CLUSTER #1
Popular Movies: ['The Imitation Game', 'Maleficent', 'Titanic', '12 Years a Slave', 'The Prestige', 'The Grand Budapest Hotel', 'The Fault in Our Stars', 'Catch Me If You Can', 'Cloud Atlas', 'The Conjuring 2', 'Apollo 13', 'Aliens', 'The Usual Suspects', 'GoodFellas', 'Straight Outta Compton', "The Huntsman: Winter's War", 'Mary Poppins', 'The Lego Movie', 'Starship Troopers', 'The Big Short']
--------------------------------------------------------------------------------
CLUSTER #2
Popular Movies: ['Pirates of the Caribbean: The Curse of the Black Pearl', 'Frozen', 'Forrest Gump', 'Pirates of the Caribbean: On Stranger Tides', 'Spider-Man 3', 'Quantum of Solace', 'The Twilight Saga: New Moon', 'Aladdin', 'Sex Tape', 'Grease', 'Troy', 'The Princess and the Frog', '50 First Dates', 'The Theory of Everything', 'Braveheart', 'Slumdog Millionaire', 'Cast Away', 'Moulin Rouge!', "There's Something About Mary", 'Need for Speed']
--------------------------------------------------

## Affinity Propagation - Starting on Page 510