In [None]:
from pyspark.sql import SQLContext
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from notebooks import utils
%matplotlib inline

In [None]:
# Build the SQL entry point on the already-running SparkContext `sc`.
sqlContext = SQLContext(sc)

# Read the minute-level weather CSV: first row is the header and column
# types are inferred from the data.
df = (
    sqlContext.read
    .format('com.databricks.spark.csv')
    .options(header='true', inferSchema='true')
    .load('file:///home/cloudera/Downloads/big-data-4/minute_weather.csv')
)

In [None]:
# Number of rows in the loaded DataFrame.
df.count()

In [None]:
# Down-sample: keep only rows whose rowID is a multiple of 10,
# then show how many rows are left.
filteredDF = df.where(df['rowID'] % 10 == 0)
filteredDF.count()

In [None]:
# Summary statistics from describe(), collected into pandas and transposed
# so that each described column becomes a row.
filteredDF.describe().toPandas().T

In [None]:
# How many sampled rows have rain_accumulation exactly equal to 0.0.
filteredDF.where(filteredDF['rain_accumulation'] == 0.0).count()

In [None]:
# How many sampled rows have rain_duration exactly equal to 0.0.
filteredDF.where(filteredDF['rain_duration'] == 0.0).count()

In [None]:
# Remove the two rain columns and the timestamp column, one drop per column.
workingDF = filteredDF
for unused_col in ('rain_accumulation', 'rain_duration', 'hpwren_timestamp'):
    workingDF = workingDF.drop(unused_col)

In [None]:
# Row count before discarding incomplete rows.
before = workingDF.count()

# Discard every row that contains a null (dropna() is the DataFrame-level
# alias of na.drop()).
# NOTE: this rebinds workingDF, so re-running the cell reports 0 dropped rows.
workingDF = workingDF.dropna()

# Row count afterwards; the cell displays how many rows were removed.
after = workingDF.count()
before - after

In [None]:
# Column names remaining in the working DataFrame.
workingDF.columns

In [None]:
# Correlation between minimum and average wind speed
# (DataFrame.corr is the same computation as DataFrame.stat.corr).
workingDF.corr('min_wind_speed', 'avg_wind_speed')

In [None]:
# Columns fed to clustering. The order here is the order of the components
# in the assembled vector, so it must not be changed casually.
featuresUsed = [
    'air_pressure',
    'air_temp',
    'avg_wind_direction',
    'avg_wind_speed',
    'max_wind_direction',
    'max_wind_speed',
    'relative_humidity',
]

# Pack the selected columns into a single vector column 'features_unscaled'.
assembler = (
    VectorAssembler()
    .setInputCols(featuresUsed)
    .setOutputCol('features_unscaled')
)
assembled = assembler.transform(workingDF)

In [None]:
# Standardize the assembled vector: subtract the mean and divide by the
# standard deviation, writing the result to the 'features' column.
scaler = (
    StandardScaler(withMean=True, withStd=True)
    .setInputCol('features_unscaled')
    .setOutputCol('features')
)

# Fit the scaling statistics on the assembled data, then apply them to it.
scalerModel = scaler.fit(assembled)
scaledData = scalerModel.transform(assembled)

In [None]:
# Keep only the scaled feature vector and the row identifier.
scaledData = scaledData.select('features', 'rowID')

# Smaller subset for the elbow analysis: rows whose rowID is a multiple of 3,
# reduced to the feature vector alone.
elbowset = scaledData.where(scaledData['rowID'] % 3 == 0).select('features')

# Mark the subset for caching so repeated passes over it can reuse it.
elbowset.persist()

In [None]:
# Candidate cluster counts: k = 2 through 30 (range's upper bound is exclusive).
clusters = range(2,31)
# NOTE(review): utils.elbow is a project helper not visible here; presumably it
# fits k-means on elbowset for each k and returns one cost value per k -- confirm.
wsseList = utils.elbow(elbowset,clusters)

In [None]:
# Full scaled dataset reduced to the feature vector column only.
scaledDataFeat = scaledData.select(scaledData['features'])

# Mark it for caching ahead of model fitting.
scaledDataFeat.persist()

In [None]:
# k-means with 20 clusters and a fixed seed so the fit is repeatable.
kmeans = KMeans().setK(20).setSeed(1)

# Fit on the scaled feature vectors.
model = kmeans.fit(scaledDataFeat)

# Apply the fitted model back to the same data.
transformed = model.transform(scaledDataFeat)

In [None]:
# Cluster centers of the fitted model; the bare name on the last line displays them.
centers = model.clusterCenters()
centers

In [None]:
# NOTE(review): utils.pd_centers is a project helper not visible here; presumably it
# returns a pandas DataFrame of the centers with featuresUsed as column names
# (later cells index P by feature name and with .iloc) -- confirm.
P = utils.pd_centers(featuresUsed,centers)

In [None]:
# Plot the cluster centers whose scaled relative_humidity is below -0.5.
utils.parallel_plot(P.loc[P['relative_humidity'] < -0.5], P)

In [None]:
# Plot the cluster centers whose scaled air_temp is above 0.5.
utils.parallel_plot(P.loc[P['air_temp'] > 0.5], P)

In [None]:
# Plot the cluster centers with scaled relative_humidity above 0.5
# and scaled air_temp below 0.5.
utils.parallel_plot(
    P.loc[(P['relative_humidity'] > 0.5) & (P['air_temp'] < 0.5)],
    P,
)

In [None]:
# Plot the centers at positional rows 7, 8 and 11 of P.
# NOTE(review): these positions are hard-coded; presumably they were chosen by
# inspecting one particular fit -- re-check them if k, the seed, or the data changes.
utils.parallel_plot(P.iloc[[7,8,11]],P)