In [2]:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *
import numpy as np
from scipy import stats

In [4]:
def col_mode(df, col_name):
  """Show and return the most frequent value (mode) of ``df[col_name]``.

  Args:
    df: Spark DataFrame containing the column.
    col_name: Name of the column to summarise.

  Returns:
    A one-row DataFrame with columns ``observed`` (the modal value) and
    ``count`` (how many times it occurs). The same row is printed with
    ``.show()``. For an empty input the returned DataFrame has no rows.
  """
  observed_df = df.select(df[col_name].alias('observed'))
  agg_mode = observed_df.groupBy('observed').agg(count(lit(1)).alias('count'))

  # Grouping agg_mode by 'observed' a second time would leave one row per
  # distinct value, so the mode is selected by ranking on the count instead.
  # Ties are broken by the smallest observed value so repeated runs agree.
  # To list every value tied for the mode, filter agg_mode to the rows whose
  # 'count' equals the maximum count instead of applying limit(1).
  mode_df = agg_mode.orderBy(agg_mode['count'].desc(), agg_mode['observed'].asc()).limit(1)
  mode_df.show()
  return mode_df

In [5]:
# Kinds of values the user may choose to treat as nulls.
null_opt = ['Negatives', 'NaN', 'null']
# Notebook multiselect widget named 'Denote Nulls', offering the choices in
# null_opt with 'null' passed as the default selection.
dbutils.widgets.multiselect('Denote Nulls', 'null', null_opt)
# Current widget selection; col_non_null iterates over this value.
# NOTE(review): dbutils.widgets.get presumably returns one comma-separated
# string for a multiselect rather than a list -- confirm before iterating.
null_list = dbutils.widgets.get('Denote Nulls')

def col_non_null(df, col_name):
  """Count the rows of ``df[col_name]`` that are not flagged as null-like.

  Which values count as null-like is read from the module-level ``null_list``
  (the 'Denote Nulls' widget selection). Recognised entries:

    * 'Negatives' -- keep only rows where the value is >= 0. Rows where the
      comparison is not true (including SQL nulls) are removed.
    * 'NaN'       -- remove floating-point NaN values.
    * 'null'      -- remove SQL null values.

  Unrecognised entries are ignored.

  Args:
    df: Spark DataFrame containing the column.
    col_name: Name of the column to inspect.

  Returns:
    The number of remaining rows, as an int.
  """
  # The widget value may arrive as a single comma-separated string; iterating
  # that directly would walk it character by character, so normalise to a list.
  if isinstance(null_list, str):
    selections = [item.strip() for item in null_list.split(',') if item.strip()]
  else:
    selections = list(null_list)

  filtered_df = df.select(df[col_name].alias('observed'))
  observed = filtered_df['observed']

  # filter() removes rows; select() with a condition would only replace the
  # column with a boolean and leave the row count unchanged.
  if 'Negatives' in selections:
    filtered_df = filtered_df.filter(observed >= 0)
  if 'NaN' in selections:
    filtered_df = filtered_df.filter(~isnan(observed))
  if 'null' in selections:
    filtered_df = filtered_df.filter(observed.isNotNull())

  return filtered_df.count()

In [6]:
# Normal (Gaussian) probability density.
# Full-precision constants, kept under their original module-level names.
e = np.e
pi = np.pi


def normal_pdf(x, mean, stddev):
  """Evaluate the normal probability density at ``x``.

  Computes ``1 / (stddev * sqrt(2*pi)) * exp(-(x - mean)**2 / (2 * stddev**2))``.

  The parameter names shadow the PySpark ``mean`` / ``stddev`` functions only
  inside this function. Usage: ``y = normal_pdf(x, mean_value, stddev_value)``.

  Args:
    x: Point(s) at which to evaluate the density; scalar or array-like.
    mean: Mean of the distribution.
    stddev: Standard deviation of the distribution; must be positive.

  Returns:
    The density value(s), as a NumPy float or array matching ``x``.

  Raises:
    ValueError: If ``stddev`` is not strictly positive.
  """
  if np.any(np.asarray(stddev) <= 0):
    raise ValueError('stddev must be positive')
  x = np.asarray(x, dtype=float)
  coefficient = 1.0 / (stddev * np.sqrt(2 * np.pi))
  # The exponent is negative: the density decays away from the mean.
  exponent = -((x - mean) ** 2) / (2 * stddev ** 2)
  return coefficient * np.exp(exponent)

In [7]:
def col_skew(df, col_name):
  """Average the cubed z-scores of ``df[col_name]``.

  Args:
    df: Spark DataFrame containing the column.
    col_name: Name of the column to summarise.

  Returns:
    The collected aggregation result: the output of ``.collect()`` on a
    single ``avg`` over the 'z-cubed' column produced by ``col_zscores``.
  """
  zscore_df = col_zscores(df, col_name)
  mean_of_cubes = avg(zscore_df['z-cubed'])
  return zscore_df.agg(mean_of_cubes).collect()

In [8]:
def col_zscores(df, col_name):
  """Build a DataFrame of z-scores and their powers for ``df[col_name]``.

  Args:
    df: Spark DataFrame containing the column.
    col_name: Name of the column to standardise.

  Returns:
    A DataFrame with the original column plus 'z-score'
    ((value - mean) / standard deviation), 'z-cubed' (z-score ** 3) and
    'z-quartic' (z-score ** 4).
  """
  # col_mean / col_stdev are defined elsewhere in the notebook; presumably
  # they return plain scalars that lit() can wrap -- TODO confirm.
  z_mean = col_mean(df, col_name)
  z_sd = col_stdev(df, col_name)

  z_df = df.select(df[col_name])
  # Use the mean and standard deviation as literal expressions instead of
  # temporary 'm' / 'sd' columns: withColumn replaces a same-named column, so
  # an input column called 'm' or 'sd' would have been overwritten.
  z_score = (z_df[col_name] - lit(z_mean)) / lit(z_sd)
  z_df = z_df.withColumn('z-score', z_score)
  z_df = z_df.withColumn('z-cubed', z_df['z-score'] ** 3)
  z_df = z_df.withColumn('z-quartic', z_df['z-score'] ** 4)
  return z_df

* Implicit ordering depending on which functions call which others; that way we don't have to re-call functions inside functions.
* Use the `.na` function to get all NAs.