## The Z-Score Zone
---
* col_make_std_zscores
* col_make_mod_zscores
---
* col_get_std_zscores
* col_get_mod_zscores
---
* col_get_z_vals
* col_get_mod_z_vals
---
* get_z_outliers
* get_mod_outliers
---
* remove_z_outliers
* remove_mod_outliers

In [None]:
# Make a standard z-score table with associated column values:
def col_make_std_zscores(df, col_name):
  '''
  Build a table holding the col_name values next to their standard z-scores.

  The z-score of each value is (value - mean) / stddev, where the mean and the
  standard deviation are obtained from the col_mean and col_stddev helpers.
  The returned dataframe has two columns: col_name and 'z_score'.
  '''
  center = col_mean(df, col_name)
  spread = col_stddev(df, col_name)
  standardized = (df[col_name] - center) / spread
  return df.select(col_name).withColumn('z_score', standardized)


# Make a modified z-score table with associated column values:
def col_make_mod_zscores(df, col_name):
  '''
  Build a table holding the col_name values next to their modified z-scores.

  The modified z-score of each value is 0.6745 * (value - median) / MAD, where the
  median and the median absolute deviation are obtained from the col_median and
  col_median_absolute_deviation helpers. 0.6745 is the constant used in calculating
  modified z-scores.
  The returned dataframe has two columns: col_name and 'mod_z_score'.
  '''
  center = col_median(df, col_name)
  spread = col_median_absolute_deviation(df, col_name)
  deviation = df[col_name] - center
  scaled = (0.6745 * deviation) / spread
  return df.select(df[col_name]).withColumn('mod_z_score', scaled)

In [None]:
# Get min and max standard z-scores from the std z-score table:
def col_get_std_zscores(df,col_name):
  '''
  Return a (min, max) pair for the standard z-scores of a column.

  The z-score table is built with col_make_std_zscores; the extremes of its
  'z_score' column are then read with the col_min and col_max helpers.
  '''
  scored = col_make_std_zscores(df, col_name)
  lowest = col_min(scored, 'z_score')
  highest = col_max(scored, 'z_score')
  return lowest, highest


# Get min and max modified z-scores from the mod z-score table:
def col_get_mod_zscores(df, col_name):
  '''
  Return a (min, max) pair for the modified z-scores of a column.

  The table is built with col_make_mod_zscores; the extremes of its
  'mod_z_score' column are then read with the col_min and col_max helpers.
  '''
  scored = col_make_mod_zscores(df, col_name)
  lowest = col_min(scored, 'mod_z_score')
  highest = col_max(scored, 'mod_z_score')
  return lowest, highest

In [None]:
# Get the values associated with min and max standard z-scores:
def col_get_z_vals(df, col_name):
  '''
  This function returns the column values associated with the min and max standard z-scores.

  Returns a tuple (value_at_min_z_score, value_at_max_z_score). An element is None
  when no row matches the corresponding extreme (presumably only for empty or
  all-null input - verify against col_min / col_max).
  '''
  z_df = col_make_std_zscores(df, col_name)
  # The score column has to be looked up by its string name; the bare name
  # z_score used previously was never defined and raised a NameError.
  z_score = z_df['z_score']

  def value_at(target_score):
    # Filter while 'z_score' is still present, then keep only the value column.
    row = z_df.filter(z_score == target_score).select(col_name).first()
    return row[0] if row is not None else None

  z_min_val = value_at(col_min(z_df, 'z_score'))
  z_max_val = value_at(col_max(z_df, 'z_score'))
  return z_min_val, z_max_val

# Get the values associated with min and max modified z-scores:
def col_get_mod_z_vals(df, col_name):
  '''
  This function returns the column values associated with the min and max modified z-scores.

  Returns a tuple (value_at_min_mod_z_score, value_at_max_mod_z_score). An element is
  None when no row matches the corresponding extreme (presumably only for empty or
  all-null input - verify against col_min / col_max).
  '''
  mod_z_df = col_make_mod_zscores(df, col_name)
  # The table built above names its score column 'mod_z_score'; the previous
  # lookup used an undefined bare name (z_score) and would have raised a NameError.
  mod_z_score = mod_z_df['mod_z_score']

  def value_at(target_score):
    # Filter while 'mod_z_score' is still present, then keep only the value column.
    row = mod_z_df.filter(mod_z_score == target_score).select(col_name).first()
    return row[0] if row is not None else None

  mod_min_val = value_at(col_min(mod_z_df, 'mod_z_score'))
  mod_max_val = value_at(col_max(mod_z_df, 'mod_z_score'))
  return mod_min_val, mod_max_val

In [None]:
def get_z_outliers(df, col_name,threshold=3.0):
  '''
  Less robust z-score outlier detection using mean and sd.
  This function returns a dataframe of outliers: the rows of the z-score table
  (columns col_name and 'z_score') whose absolute z-score exceeds threshold.
  Default threshold = 3.0 for standard outlier-finding using z-scores.
  '''
  z_df = col_make_std_zscores(df, col_name)
  z_score = z_df['z_score']
  # |z| > threshold, spelled as two comparisons so the result does not depend on
  # whether `abs` is the Python builtin or the Spark SQL function in this notebook.
  # Filtering the table directly replaces the invalid z_df.select(z_df) call.
  z_outliers = z_df.filter((z_score > threshold) | (z_score < -threshold))
  return z_outliers
  
def get_mod_outliers(df, col_name,threshold=3.5):
  '''
  More robust z-score outlier detection using mad.
  This function returns a dataframe of outliers: the rows of the modified z-score
  table (columns col_name and 'mod_z_score') whose absolute modified z-score
  exceeds threshold.
  Default threshold = 3.5 for modified z-score outlier-finding.
  '''
  mod_z_df = col_make_mod_zscores(df, col_name)
  mod_z_score = mod_z_df['mod_z_score']
  # |score| > threshold, spelled as two comparisons so the result does not depend on
  # whether `abs` is the Python builtin or the Spark SQL function in this notebook.
  mod_outliers = mod_z_df.filter((mod_z_score > threshold) | (mod_z_score < -threshold))
  return mod_outliers


# Scratch note: absolute deviation of a column from its median.
# Kept as a comment because the expression's result was discarded and `median`
# is not defined in the visible part of this notebook:
# abs(col(col_name)- median)

In [None]:
# Returns the column after it has had standard z-score outliers removed:
def remove_z_outliers(df, col_name,threshold=3.0):
  '''
  Less robust z-score outlier detection using mean and sd.
  This function returns the column after removing outliers, i.e. a single-column
  dataframe (col_name) keeping only the rows whose absolute z-score is at most
  threshold. Rows whose z-score is null do not satisfy the filter and are dropped too.
  Default threshold = 3.0 for standard outlier-finding using z-scores.
  '''
  z_df = col_make_std_zscores(df, col_name)
  z_score = z_df['z_score']
  # Keep |z| <= threshold. The earlier version compared z-scores against raw column
  # values taken at the extreme z-scores and never used threshold at all.
  within_threshold = (z_score <= threshold) & (z_score >= -threshold)
  clean_z_df = z_df.filter(within_threshold).select(col_name)
  return clean_z_df


# Returns the column after it has had modified z-score outliers removed:
def remove_mod_outliers(df, col_name,threshold=3.5):
  '''
  More robust z-score outlier detection using mad.
  This function returns the column after removing outliers, i.e. a single-column
  dataframe (col_name) keeping only the rows whose absolute modified z-score is at
  most threshold. Rows whose modified z-score is null do not satisfy the filter and
  are dropped too.
  Default threshold = 3.5 for modified z-score outlier-finding.
  '''
  mod_z_df = col_make_mod_zscores(df, col_name)
  mod_z_score = mod_z_df['mod_z_score']
  # Keep |score| <= threshold. The earlier version compared scores against raw column
  # values taken at the extreme scores and never used threshold at all.
  within_threshold = (mod_z_score <= threshold) & (mod_z_score >= -threshold)
  clean_mod_df = mod_z_df.filter(within_threshold).select(col_name)
  return clean_mod_df

## Testing Zone:

In [None]:
# 1 - preview the first five rows of the standard and modified z-score tables
print("Z-score tables:")
for make_table in (col_make_std_zscores, col_make_mod_zscores):
  make_table(df, col_name).show(5)

In [None]:
# 2 - print the (min, max) pair of standard, then modified, z-scores
print("Min and max z-scores:")
for get_extremes in (col_get_std_zscores, col_get_mod_zscores):
  print(get_extremes(df, col_name))

In [None]:
# 3 - print the column values found at the min and max z-scores
# (this cell previously re-ran the min/max z-score functions from cell 2)
print("Associated values from min and max z-scores:")
print(col_get_z_vals(df, col_name))
print(col_get_mod_z_vals(df, col_name))

In [None]:
# 4 - display the outlier rows; without .show() the first result is discarded
# and the second is only echoed as a dataframe repr, not as rows
get_z_outliers(df, col_name).show()
get_mod_outliers(df, col_name).show()

In [None]:
#5 - display the column after outlier removal; without .show() the first result
# is discarded and the second is only echoed as a dataframe repr, not as rows
remove_z_outliers(df, col_name).show()
remove_mod_outliers(df, col_name).show()