# Kmeans Clustering

In [1]:
import pyspark
from pyspark.sql import SparkSession
import numpy as np
from pyspark.sql.functions import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

In [2]:
# Change the number of cores in this code block
# by setting `spark.master` to `local[n]` where
# n is the number of cores

conf = pyspark.SparkConf().setAll([('spark.master', 'local[4]'),
                                   ('spark.app.name', 'Basic Setup')])
spark = SparkSession.builder.config(conf=conf).getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
2022-05-18 19:16:42,912 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2022-05-18 19:16:44,449 WARN util.Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
2022-05-18 19:16:44,449 WARN util.Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
2022-05-18 19:16:44,451 WARN util.Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


In [3]:
df = spark.read.option("header",True).csv("file:///home/work/data/final_movies_df_10.csv", inferSchema=True)
df.printSchema()

[Stage 1:>                                                          (0 + 1) / 1]

root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- avg_rating: double (nullable = true)
 |-- movieYear: integer (nullable = true)
 |-- genres: string (nullable = true)
 |-- Crime: double (nullable = true)
 |-- Romance: double (nullable = true)
 |-- Thriller: double (nullable = true)
 |-- Adventure: double (nullable = true)
 |-- Drama: double (nullable = true)
 |-- War: double (nullable = true)
 |-- Documentary: double (nullable = true)
 |-- Fantasy: double (nullable = true)
 |-- Mystery: double (nullable = true)
 |-- Musical: double (nullable = true)
 |-- Animation: double (nullable = true)
 |-- Film-Noir: double (nullable = true)
 |-- Horror: double (nullable = true)
 |-- Western: double (nullable = true)
 |-- Comedy: double (nullable = true)
 |-- Children: double (nullable = true)
 |-- Action: double (nullable = true)
 |-- Sci-Fi: double (nullable = true)



                                                                                

In [4]:
df.show(5)

+-------+--------------------+------------------+---------+--------------------+-----+-------+--------+---------+-----+---+-----------+-------+-------+-------+---------+---------+------+-------+------+--------+------+------+
|movieId|               title|        avg_rating|movieYear|              genres|Crime|Romance|Thriller|Adventure|Drama|War|Documentary|Fantasy|Mystery|Musical|Animation|Film-Noir|Horror|Western|Comedy|Children|Action|Sci-Fi|
+-------+--------------------+------------------+---------+--------------------+-----+-------+--------+---------+-----+---+-----------+-------+-------+-------+---------+---------+------+-------+------+--------+------+------+
|   1088|Dirty Dancing (1987)|  3.25002094679514|     1987|Drama|Musical|Rom...|  0.0|    1.0|     0.0|      0.0|  1.0|0.0|        0.0|    0.0|    0.0|    1.0|      0.0|      0.0|   0.0|    0.0|   0.0|     0.0|   0.0|   0.0|
|   1580|Men in Black (a.k...|3.5817083457378187|     1997|Action|Comedy|Sci-Fi|  0.0|    0.0|     0

*Checking null values in each column*

In [5]:
df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df.columns]).show()

[Stage 3:>                                                          (0 + 1) / 1]

+-------+-----+----------+---------+------+-----+-------+--------+---------+-----+---+-----------+-------+-------+-------+---------+---------+------+-------+------+--------+------+------+
|movieId|title|avg_rating|movieYear|genres|Crime|Romance|Thriller|Adventure|Drama|War|Documentary|Fantasy|Mystery|Musical|Animation|Film-Noir|Horror|Western|Comedy|Children|Action|Sci-Fi|
+-------+-----+----------+---------+------+-----+-------+--------+---------+-----+---+-----------+-------+-------+-------+---------+---------+------+-------+------+--------+------+------+
|      0|    0|         0|       24|     0|    0|      0|       0|        0|    0|  0|          0|      0|      0|      0|        0|        0|     0|      0|     0|       0|     0|     0|
+-------+-----+----------+---------+------+-----+-------+--------+---------+-----+---+-----------+-------+-------+-------+---------+---------+------+-------+------+--------+------+------+



                                                                                

*Printing movies without year in it*

In [6]:
#Doing toPandas() for pretty printing
df.filter(col('movieYear').isNull()).select('title').toPandas()

Unnamed: 0,title
0,The King
1,Science Fiction Volume One: The Osiris Child
2,Mara
3,Angel Has Fallen
4,Trophy Kids
5,Falling Inn Love
6,The Perfect Date
7,In the Tall Grass
8,Tau
9,Ready Player One


In [7]:
df.describe().toPandas()

                                                                                

Unnamed: 0,summary,movieId,title,avg_rating,movieYear,genres,Crime,Romance,Thriller,Adventure,...,Mystery,Musical,Animation,Film-Noir,Horror,Western,Comedy,Children,Action,Sci-Fi
0,count,24009.0,24009,24009.0,23985.0,24009,24009.0,24009.0,24009.0,24009.0,...,24009.0,24009.0,24009.0,24009.0,24009.0,24009.0,24009.0,24009.0,24009.0,24009.0
1,mean,70788.93344162605,,3.2003455170428925,1994.750760892224,,0.1074597026115206,0.1481527760423174,0.1737265192219584,0.0915906535049356,...,0.0574784455829064,0.0301137073597401,0.0574367945353825,0.010745970261152,0.1104169269857136,0.0191594818609688,0.3237119413553251,0.0558957057770002,0.1429047440543129,0.0758049064933983
2,stddev,62574.92667477875,,0.4939954796797395,22.038548812964635,,0.3097032610945801,0.3552587616392982,0.3788820328483236,0.2884532393338362,...,0.2327593830593779,0.1709037405567116,0.2326801756902985,0.1031064361423084,0.3134152526066884,0.1370882156546442,0.4679013134320066,0.2297249963049888,0.3499829709256203,0.2646912177941029
3,min,1.0,"""""""Great Performances"""" Cats (1998)""",0.6785714285714286,1874.0,Action,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,max,208737.0,貞子3D (2012),4.483096085409253,2019.0,Western,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [8]:
df.columns

['movieId',
 'title',
 'avg_rating',
 'movieYear',
 'genres',
 'Crime',
 'Romance',
 'Thriller',
 'Adventure',
 'Drama',
 'War',
 'Documentary',
 'Fantasy',
 'Mystery',
 'Musical',
 'Animation',
 'Film-Noir',
 'Horror',
 'Western',
 'Comedy',
 'Children',
 'Action',
 'Sci-Fi']

### Creating features

In [9]:
feat_cols = [
 'avg_rating',
 'movieYear',
 'Crime',
 'Romance',
 'Thriller',
 'Adventure',
 'Drama',
 'War',
 'Documentary',
 'Fantasy',
 'Mystery',
 'Musical',
 'Animation',
 'Film-Noir',
 'Horror',
 'Western',
 'Comedy',
 'Children',
 'Action',
 'Sci-Fi']

### Vector assembler to create feature vector column

In [10]:
assembler = VectorAssembler(inputCols=feat_cols, outputCol='features', handleInvalid='skip')
final_df = assembler.transform(df)

In [11]:
final_df.show(5)

2022-05-18 19:17:03,912 WARN util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+-------+--------------------+------------------+---------+--------------------+-----+-------+--------+---------+-----+---+-----------+-------+-------+-------+---------+---------+------+-------+------+--------+------+------+--------------------+
|movieId|               title|        avg_rating|movieYear|              genres|Crime|Romance|Thriller|Adventure|Drama|War|Documentary|Fantasy|Mystery|Musical|Animation|Film-Noir|Horror|Western|Comedy|Children|Action|Sci-Fi|            features|
+-------+--------------------+------------------+---------+--------------------+-----+-------+--------+---------+-----+---+-----------+-------+-------+-------+---------+---------+------+-------+------+--------+------+------+--------------------+
|   1088|Dirty Dancing (1987)|  3.25002094679514|     1987|Drama|Musical|Rom...|  0.0|    1.0|     0.0|      0.0|  1.0|0.0|        0.0|    0.0|    0.0|    1.0|      0.0|      0.0|   0.0|    0.0|   0.0|     0.0|   0.0|   0.0|(20,[0,1,3,6,11],...|
|   1580|Men in 

#### We might not need scaling, but did it anyway for learning purpose

In [12]:
from pyspark.ml.feature import StandardScaler
scaler = StandardScaler(inputCol='features',
                       outputCol='scaled_feat',
                       withStd=True,
                       withMean=False
                       )
scaled_model = scaler.fit(final_df)
cluster_df = scaled_model.transform(final_df)

                                                                                

#### Cluster Evaluator to check how close the points in individual clusters are ?

In [13]:
eval = ClusteringEvaluator(predictionCol='prediction',
                          featuresCol='scaled_feat',
                          metricName='silhouette',
                          distanceMeasure='squaredEuclidean')

In [14]:
### KMeans clustering 

In [15]:
silhouette_score = []
print("""
Silhouette Scores for K Means Clustering
========================================
Model\tScore\t
=====\t=====\t
""")
for k in range(8,23):
    kmeans_algo = KMeans(featuresCol='scaled_feat',k=k)
    kmeans_fit = kmeans_algo.fit(cluster_df)
    output = kmeans_fit.transform(cluster_df)
    score = eval.evaluate(output)
    silhouette_score.append(score)
    print(f"K{k}\t{np.round(score,3)}\t")


Silhouette Scores for K Means Clustering
Model	Score	
=====	=====	



                                                                                

K8	0.271	


                                                                                

K9	0.287	


                                                                                

K10	0.286	


                                                                                

K11	0.297	


                                                                                

K12	0.302	


                                                                                

K13	0.31	


                                                                                

K14	0.363	


                                                                                

K15	0.346	


                                                                                

K16	0.356	
K17	0.364	


                                                                                

K18	0.345	


                                                                                

K19	0.245	


                                                                                

K20	0.393	


                                                                                

K21	0.398	
K22	0.361	


#### The above score should give us idea if points inside clusters are close to each other.

In [16]:
spark.stop()