In [18]:
# Load IPython's autoreload extension.
# NOTE(review): no `%autoreload <mode>` call is visible in this cell, so automatic
# reloading may not actually be switched on — confirm.
%load_ext autoreload
# Set SPARK_HOME for this kernel (presumably the HDP Spark 2 client install — confirm).
%env SPARK_HOME=/usr/hdp/current/spark2-client

# findspark.init() has to run before the `import pyspark` further down: it makes the
# Spark installation's Python libraries importable (presumably located via the
# SPARK_HOME value set above).
import findspark
findspark.init()

print('findspark initialized ...')

import pyspark
from pyspark.sql import SparkSession
# NOTE: `max` and `min` imported here shadow the Python builtins for the rest of the
# notebook. `abs` is deliberately NOT imported so that the builtin stays available.
from pyspark.sql.functions import (
    avg,
    col,
    column,
    expr,
    kurtosis,
    max,
    min,
    skewness,
    stddev,
    variance,
)

# Create the notebook's SparkSession; getOrCreate() hands back an already-running
# session if one exists instead of starting a second one.
spark = (
    SparkSession.builder
    .appName('mlonspark')
    .config('spark.executor.instances', '7')
    .getOrCreate()
)

print('pyspark ready ...')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
env: SPARK_HOME=/usr/hdp/current/spark2-client
findspark initialized ...
pyspark ready ...


In [19]:
# Load the listen-count table and show its columns. No format is passed to load(),
# so the session's default data source is used.
cooDataPath = "/data/lastfm-dataset-360K/coo-data.parquet"
df = spark.read.load(cooDataPath)
df.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- artistId: integer (nullable = true)
 |-- listenCount: float (nullable = true)



In [20]:
from pyspark.sql import DataFrameStatFunctions as statFunc
# Approximate 5th and 95th percentiles of listenCount (relative error 0.001).
# DataFrame.approxQuantile is the same computation the DataFrameStatFunctions
# wrapper delegates to.
quantiles = df.approxQuantile("listenCount", [0.05, 0.95], 0.001)
quantiles

[6.0, 755.0]

In [21]:
# Trim both tails: keep rows whose listenCount lies strictly between the two
# quantile cut-offs (rows equal to either cut-off are dropped).
lowerCut = quantiles[0]
upperCut = quantiles[1]
listens = col("listenCount")
dfFiltered = df.where((listens > lowerCut) & (listens < upperCut))
dfFiltered.count()

15704882

In [22]:
# Compute mean and sample standard deviation in ONE aggregation so the filtered
# data is scanned once instead of once per statistic.
mean, stdDev = dfFiltered.select(
    avg("listenCount"),
    stddev("listenCount"),
).first()

# An empty input yields None for both statistics and a constant column yields a
# zero deviation; either would produce meaningless z-scores.
if mean is None or not stdDev:
    raise ValueError(
        "cannot standardize listenCount: mean={!r}, stdDev={!r}".format(mean, stdDev)
    )


def zscore(listenCount, **model_kargs):
    """Standardize a listen count using the notebook-level `mean` and `stdDev`.

    Args:
        listenCount: value to standardize; called below with the
            `dfFiltered.listenCount` column, so the result is a column expression.
        **model_kargs: accepted for call compatibility and ignored.

    Returns:
        (listenCount - mean) / stdDev
    """
    return (listenCount - mean) / stdDev


dfStd = dfFiltered.withColumn("stdCount", zscore(dfFiltered.listenCount))

In [23]:
# Fetch the minimum standardized count with a single Spark job. `min` here is
# pyspark.sql.functions.min, which the import cell binds over the builtin.
stdMin = dfStd.select(min("stdCount")).first()[0]
print("min(stdCount) = {}".format(stdMin))

# `abs` is the Python builtin (it is not imported from pyspark.sql.functions),
# so it operates on the plain float collected above.
absStdMin = abs(stdMin)
print(absStdMin)


def shift(stdCount, **model_kargs):
    """Offset a standardized count by the notebook-level `absStdMin`.

    Args:
        stdCount: value to shift; called below with the `dfStd.stdCount` column,
            so the result is a column expression.
        **model_kargs: accepted for call compatibility and ignored.

    Returns:
        stdCount + absStdMin
    """
    return stdCount + absStdMin


dfStdPos = dfStd.withColumn("stdCountPos", shift(dfStd.stdCount))

+-------------------+
|      min(stdCount)|
+-------------------+
|-0.9370185692150685|
+-------------------+

0.9370185692150685


In [24]:
# Summary statistics of the shifted column, computed in a single select.
# `min` / `max` are the pyspark.sql.functions aggregates, not the builtins.
statsColumn = "stdCountPos"
summaryAggregates = [
    min(statsColumn),
    max(statsColumn),
    avg(statsColumn),
    variance(statsColumn),
    stddev(statsColumn),
    skewness(statsColumn),
    kurtosis(statsColumn),
]
dfStdPos.select(*summaryAggregates).show()

+----------------+-----------------+------------------+---------------------+------------------------+---------------------+---------------------+
|min(stdCountPos)| max(stdCountPos)|  avg(stdCountPos)|var_samp(stdCountPos)|stddev_samp(stdCountPos)|skewness(stdCountPos)|kurtosis(stdCountPos)|
+----------------+-----------------+------------------+---------------------+------------------------+---------------------+---------------------+
|             0.0|4.898816629880709|0.9370185692149076|   1.0000000000000246|      1.0000000000000122|    1.643280216705236|   2.4117649777425028|
+----------------+-----------------+------------------+---------------------+------------------------+---------------------+---------------------+

