In [1]:
"""
Te version 2.7
Zone 5, step 3:  column imputation
							
	Task description						Automation
	After imputing at the raw column level, create table with row-wise listing of:						
1	Column name						Table 1
2	Imputation approach (Te version 2.7 implements only zero of forward fill)						Table 1
3	Column descriptive statistics:						Table 1
	minimum						
	maximum						
	mean						
	standard deviation						
	Median							
				
Input:  General preprocessing dataframe
Output: of the script should be in excell format and should looks like table bellow:
         minimum	maximum	mean	standard deviation	median	imputation approach
Column 1	1	     13	     7	      4	       8	     5            ffill
Column 2	1	     6	     4	      2	       4	     3            bfill
Column 3	101	     112	 106	  4	       107       7            bfill 	
…	…	…	…	…	…	
"""

from sys import argv
import pyspark
from pyspark.sql import SparkSession


def start_spark_session():
    """
    Starting spark session
    """

    spark = SparkSession \
        .builder \
        .appName("Python Spark") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()

    return spark 


def calc_summary_func(df, column, func):
    """
    for a column, calculate a statistical function
    """

    if func == 'median':
        #summary_value  = df.approxQuantile(column, [0.5], 0.25) # need to verify this function
        summary_value = 0 # temporary zero. will ber removed when verify function above  
    else: 
        summary_value = df.agg({column : func}).collect()[0][0]
    return summary_value


def get_summary_stats_for_every_column(df, columns):
    """
    Input: Input df, and columns of that dataframe
    Calculate summary statistics for every column 
    Output: 
    """
   
    columns_summary_stats = [] # append tuples to a list, later to create a spark df
    for col in columns: # for each column calculate stat values
        maximum = calc_summary_func(df, col, 'max')
        minimum = calc_summary_func(df, col, 'min')
        mean = calc_summary_func(df, col, 'avg')
        stddev = calc_summary_func(df, col, 'stddev')
        median = calc_summary_func(df, col, 'median')
      
        columns_summary_stats.append((col,maximum, minimum, mean, stddev, median))
    
    return columns_summary_stats



"""
****** MAIN ******
1. Create spark session 
2. Read the file into a dataframe
3. a. Calculate statistical summary for every column in a dataframe
   b. Get imputation approach from the te_constants.py ?
4. Create a new dataframe that has columns as rows and stats summaries as values
5. Save it as an excel tab 
"""

#file_name = "../example.csv"
file_name = "../general_preprocessing_2020_06_30_1.csv"
spark = start_spark_session()
gen_pre_df = spark.read.format("csv").option("header", "true").load(file_name) 
columns = gen_pre_df.schema.names
columns_summary_stats = get_summary_stats_for_every_column(gen_pre_df, columns)
excel_ready_df = spark.createDataFrame( columns_summary_stats, ['column', 'max', 'min', 'mean', 'stddev', 'median' ] )
excel_ready_df.show()


+--------------------+--------------------+--------------------+--------------------+------------------+------+
|              column|                 max|                 min|                mean|            stddev|median|
+--------------------+--------------------+--------------------+--------------------+------------------+------+
|                 _c0|                9999|                   0|             45659.0|26361.668953235872|     0|
|             level_0|                9999|                   0|             45659.0|26361.668953235872|     0|
|               index|                9999|                   0|             45659.0|26361.668953235872|     0|
|          Unnamed: 0|                9999|                   0|             45659.0|26361.668953235872|     0|
|            party_id|                 999|                   1|  501.15303496534125|288.93613666356515|     0|
|          event_date|          2020-06-29|          2019-01-01|                null|              null|