In [None]:
import pandas as pd
import numpy as np
import datetime as dt
import os

import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *#avg, count, expr
from pyspark.sql.types import *

In [None]:
# initialize
# Build (or reuse) the session through the builder so that the application
# name is part of the configuration the context is created with. Assigning
# sparkContext.appName after the fact only sets an attribute on the Python
# object and does not rename the running application.
# getOrCreate() also makes this cell safe to re-run: constructing a second
# SparkContext while one is alive raises an error.
spark = SparkSession.builder.appName('joinAppend').getOrCreate()
# keep the context bound to `sc`; the last cell of the notebook stops it by that name
sc = spark.sparkContext
# show the number of cores
# NOTE(review): getExecutorMemoryStatus() appears to be keyed by block manager,
# so this likely counts executors (driver included) rather than CPU cores - confirm.
print('%d cores'%spark._jsc.sc().getExecutorMemoryStatus().keySet().size())
spark

In [None]:
# create a dataframe
# get some data: n rows holding p uniform random feature columns plus one
# extra column (index p) that is thresholded into a boolean flag
n = 100
p = 5
# dedicated, seeded generator so Restart & Run All reproduces the same data
rng = np.random.RandomState(42)
# the pandas frame gets its own name; `randData` is reserved for the Spark
# dataframe that every later cell works with
randPdf = pd.DataFrame(rng.rand(n, p+1))
# index with p rather than a literal 5 so the flag column follows p if it changes
randPdf[p] = randPdf[p] > 0.5
display(randPdf.head())

# define the schema: p non-nullable float columns followed by the boolean flag
schem = StructType([StructField('col_%d'%i, FloatType(), False) for i in range(p)])
schem = schem.add(StructField('Flag', BooleanType(), False))

# create the dataframe
randData = spark.createDataFrame(randPdf, schema=schem).repartition('Flag')

# talk
randData.printSchema()
display(randData.limit(5).toPandas())

In [None]:
# walk the schema one field at a time and print the skewness of each float
# column - one aggregation per column here, though in reality we'd do this
# in a single transaction
for field in randData.schema.fields:
    print('Processing %s'%field.name)
    if field.dataType != FloatType():
        # non-float columns are announced but not aggregated
        continue
    skew = randData.agg(skewness(col(field.name))).collect()
    print(skew)

In [None]:
# peel the collected result apart one layer at a time
print(skew)
print('Notice that it is a single-element list of a Row object')
onlyRow = skew[0]
print(onlyRow)
print('Notice that it is now just a row object with a single column')
onlyValue = onlyRow[0]
print(onlyValue)
print('Now we finally have just the numeric value')

In [None]:
# in reality, would compute the skews for a set of columns like this - all at once:
# build one skewness expression per float column, then hand them to a single agg().
# The collected result still needs the double indexing (list -> Row -> value).
skewExprs = []
for field in randData.schema.fields:
    if field.dataType == FloatType():
        skewExprs.append(skewness(col(field.name)))
skew = randData.agg(*skewExprs).collect()
print(skew)

In [None]:
# report mean, variance, skewness and kurtosis for every float column
momentFuncs = (mean, variance, skewness, kurtosis)
for field in randData.schema.fields:
    if field.dataType != FloatType():
        continue
    # apply each aggregate to the column in one agg() call, then take the
    # first row out of the collected list
    collected = randData.agg(*[fn(col(field.name)) for fn in momentFuncs]).collect()
    moments = collected[0]
    print('%s: Mean = %0.2f, Variance = %0.2f, Skewness = %0.2f, Kurtosis = %0.2f'%(field.name, *moments))

In [None]:
# be sure to stop the spark context at the end
# (calls stop() on the SparkContext bound to `sc` in the initialization cell)
sc.stop()