# Hands-On: Clustering

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from notebooks import utils
%matplotlib inline

In [2]:
# Create (or reuse) the Spark session for this notebook, then read the weather CSV.
spark = (
    SparkSession.builder
    .appName("clustering")
    .getOrCreate()
)
# header=True: the first row supplies the column names.
# inferSchema=True: Spark infers each column's type from the data.
df = spark.read.csv(
    'data/minute_weather.csv',
    header=True,
    inferSchema=True,
)

In [3]:
# Number of rows loaded from the CSV.
df.count()

1587257

In [4]:
# Downsample: keep only the rows whose rowID is a multiple of 10,
# then display how many rows are left.
filteredDF = df.where(df['rowID'] % 10 == 0)
filteredDF.count()

158726

In [5]:
# Summary statistics (count, mean, stddev, min, max) for the downsampled data,
# pulled into pandas and transposed so that each column becomes one row.
filteredDF.describe().toPandas().T

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
rowID,158726,793625.0,458203.9375103623,0,1587250
air_pressure,158726,916.8301614102413,3.051716552830777,905.0,929.5
air_temp,158726,61.851589153636084,11.833569210641707,31.64,99.5
avg_wind_direction,158680,162.15610032770354,95.27820101905918,0.0,359.0
avg_wind_speed,158680,2.7752148979077367,2.057623969742644,0.0,31.9
max_wind_direction,158680,163.46214393748426,92.45213853838698,0.0,359.0
max_wind_speed,158680,3.400557726241551,2.4188016208098855,0.1,36.0
min_wind_direction,158680,166.77401688933702,97.44110914784568,0.0,359.0
min_wind_speed,158680,2.134664103856835,1.742112505242438,0.0,31.6


In [6]:
# How many rows report exactly zero rain accumulation?
filteredDF.where(filteredDF['rain_accumulation'] == 0.0).count()

157812

In [7]:
# How many rows report exactly zero rain duration?
filteredDF.where(filteredDF['rain_duration'] == 0.0).count()

157237

In [8]:
# Remove the two rain columns and the timestamp column; all other columns carry over.
workingDF = filteredDF.drop(
    'rain_accumulation',
    'rain_duration',
    'hpwren_timestamp',
)

In [9]:
# Drop rows with missing values and display how many rows were removed.
#
# The baseline count is taken from filteredDF rather than workingDF: workingDF is
# overwritten below, so counting it here would make this cell report 0 dropped rows
# whenever it is re-run. workingDF is filteredDF with columns removed, and removing
# columns does not change the number of rows, so the two counts agree on a first run.
before = filteredDF.count()
# Discard every row that has a null in any of the remaining columns.
# Applying this to an already-cleaned frame removes nothing, so a re-run is harmless.
workingDF = workingDF.na.drop()
after = workingDF.count()
before - after

46

In [10]:
# Show which columns are left in workingDF.
workingDF.columns

['rowID',
 'air_pressure',
 'air_temp',
 'avg_wind_direction',
 'avg_wind_speed',
 'max_wind_direction',
 'max_wind_speed',
 'min_wind_direction',
 'min_wind_speed',
 'relative_humidity']

In [11]:
# Columns used as clustering features.
# NOTE(review): rowID, min_wind_direction and min_wind_speed exist in workingDF
# but are not part of this list.
featuresUsed = [
    'air_pressure',
    'air_temp',
    'avg_wind_direction',
    'avg_wind_speed',
    'max_wind_direction',
    'max_wind_speed',
    'relative_humidity',
]
# Pack the selected columns into a single vector column named 'features_unscaled'.
assembler = VectorAssembler(
    inputCols=featuresUsed,
    outputCol='features_unscaled',
)
assembled = assembler.transform(workingDF)

In [12]:
# Standardize the assembled feature vector: withMean=True centers each feature,
# withStd=True scales it to unit standard deviation.
scaler = StandardScaler(
    inputCol="features_unscaled",
    outputCol="features",
    withMean=True,
    withStd=True,
)
# Fit the scaler on the assembled data, then apply it to the same data;
# the result gains the scaled "features" column.
scalerModel = scaler.fit(assembled)
scaledData = scalerModel.transform(assembled)

In [13]:
# Keep only the scaled feature vector and the row id.
scaledData = scaledData.select("features", "rowID")
# Rows whose rowID is divisible by 3 form the smaller set passed to the elbow search;
# only the feature vector is kept for it.
elbowset = scaledData.where(scaledData["rowID"] % 3 == 0).select("features")
# Ask Spark to keep this subset cached, since it is reused for every cluster count.
elbowset.persist()

DataFrame[features: vector]

In [None]:
# Candidate cluster counts: 2 through 30 inclusive.
clusters = range(2,31)
# utils.elbow is a project helper. Judging by the printed output it trains once per
# cluster count and reports a WSSE value each time; presumably those values are what
# it returns -- TODO confirm against notebooks/utils.
wsseList = utils.elbow(elbowset,clusters)

Training for cluster size 2 
......................WSSE = 114993.08947326531 
Training for cluster size 3 
......................WSSE = 103421.37370631342 
Training for cluster size 4 
......................WSSE = 95150.55982034055 
Training for cluster size 5 
......................WSSE = 87993.46098416099 
Training for cluster size 6 
......................WSSE = 84411.7705174048 
Training for cluster size 7 


In [None]:
# Project helper; presumably plots the WSSE values against the cluster counts so the
# "elbow" can be read off -- TODO confirm against notebooks/utils.
utils.elbow_plot(wsseList,clusters)

In [None]:
# Full scaled dataset reduced to just the feature vector, which is what KMeans is fit on.
scaledDataFeat = scaledData.select("features")
# Ask Spark to keep it cached for the model fit and transform that follow.
scaledDataFeat.persist()

In [None]:
# Fit k-means with 12 clusters on the scaled features; the seed is fixed at 1.
kmeans = KMeans(
    k=12,
    seed=1,
)
model = kmeans.fit(scaledDataFeat)
# Run the fitted model over the same rows it was trained on.
transformed = model.transform(scaledDataFeat)

In [None]:
# Display the fitted cluster centers. The model was fit on the standardized
# "features" column, so the centers are in scaled units, not raw measurements.
model.clusterCenters()

In [None]:
# Project helper that combines the feature names with the cluster centers.
# Presumably returns a pandas DataFrame with one row per center and one column per
# feature (later cells index it as P['relative_humidity'] and P.iloc) -- TODO confirm.
P = utils.pd_centers(featuresUsed, model.clusterCenters())

In [None]:
# Centers with a scaled relative humidity below -0.5, plotted with P passed
# alongside as the second argument.
low_humidity = P[P['relative_humidity'] < -0.5]
utils.parallel_plot(low_humidity, P)

In [None]:
# Centers with a scaled air temperature above 0.5, plotted with P passed
# alongside as the second argument.
high_temp = P[P['air_temp'] > 0.5]
utils.parallel_plot(high_temp, P)

In [None]:
# Centers that combine a scaled relative humidity above 0.5 with a scaled
# air temperature below 0.5.
is_humid = P['relative_humidity'] > 0.5
is_not_hot = P['air_temp'] < 0.5
utils.parallel_plot(P[is_humid & is_not_hot], P)

In [None]:
# Plot just the entry at position 2 of P. The double brackets pass a list to iloc,
# so (assuming P is a pandas DataFrame) the selection stays a one-row frame rather
# than collapsing to a Series.
# NOTE(review): the index 2 is hard-coded; which cluster it denotes depends on the fit.
utils.parallel_plot(P.iloc[[2]],P)