In [1]:
"""
Te version 2.7
Zone 5, step 3:  column imputation
							
	Task description						Automation
	After imputing at the raw column level, create table with row-wise listing of:						
1	Column name						Table 1
2	Imputation approach (Te version 2.7 implements only zero or forward fill)						Table 1
3	Column descriptive statistics:						Table 1
	minimum						
	maximum						
	mean						
	standard deviation						
	Median							
				
Input:  General preprocessing dataframe
Output: the script's output should be in Excel format and should look like the table below:
         minimum	maximum	mean	standard deviation	median	imputation approach
Column 1	1	     13	     7	      4	       8	     5            ffill
Column 2	1	     6	     4	      2	       4	     3            bfill
Column 3	101	     112	 106	  4	       107       7            bfill 	
…	…	…	…	…	…	
"""

from sys import argv
import pyspark
from pyspark.sql import SparkSession

from col_stats import *



def start_spark_session():
    """
    Return the SparkSession used by this notebook.

    The session is requested from ``SparkSession.builder`` with the
    application name "Python Spark" and a single config entry, then
    obtained through ``getOrCreate()``.

    Returns:
        Whatever ``SparkSession.builder...getOrCreate()`` yields.
    """
    session_builder = SparkSession.builder.appName("Python Spark")
    # NOTE(review): this key/value looks like the placeholder from Spark's
    # getting-started example -- confirm whether it is actually needed.
    session_builder = session_builder.config("spark.some.config.option", "some-value")
    return session_builder.getOrCreate()




def get_summary_stats_for_every_column(df):
    """
    Build one summary-statistics row per column of ``df``.

    Each column is selected on its own (``df.select(name)``) and the
    resulting single-column frame is passed, in this order, to
    ``calc_column_max``, ``calc_column_min``, ``calc_column_avg``,
    ``calc_column_stddev`` and ``calc_column_median``.
    (Those helpers are presumably supplied by the ``col_stats`` star
    import -- verify.)

    Input:
        df: dataframe exposing ``.columns`` and ``.select(name)``.
    Output:
        List of tuples ``(column name, max, min, mean, stddev, median)``,
        in the order of ``df.columns``; an empty list when there are no
        columns.
    """

    def build_row(column_name):
        # Work on a single-column projection so every helper sees only
        # the column being summarised.
        single_column_df = df.select(column_name)
        # Tuple items are evaluated left to right, which fixes the order
        # in which the helpers run: max, min, avg, stddev, median.
        return (
            column_name,
            calc_column_max(single_column_df),
            calc_column_min(single_column_df),
            calc_column_avg(single_column_df),
            calc_column_stddev(single_column_df),
            calc_column_median(single_column_df),
        )

    # Rows are collected as plain tuples so the caller can turn them into
    # a dataframe afterwards.
    return [build_row(column_name) for column_name in df.columns]



"""
****** MAIN ******
1. Create spark session 
2. Read the file into a dataframe
3. a. Calculate statistical summary for every column in a dataframe
   b. Get imputation approach from the te_constants.py ?
4. Create a new dataframe that has columns as rows and stats summaries as values
5. Save it as an excel tab 
"""

# Input CSV for this run. The full export lives at
# "../data/general_preprocessing_2020_06_30_1.csv".
file_name = "../data/example.csv"

spark = start_spark_session()

# The first CSV line is treated as the header (column names).
# NOTE(review): no schema is supplied or inferred here, so Spark most likely
# loads every column as a string -- confirm the calc_* helpers cast to a
# numeric type, otherwise min/max may compare lexicographically.
gen_pre_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load(file_name)
)

# One tuple per source column: (name, max, min, mean, stddev, median).
columns_summary_stats = get_summary_stats_for_every_column(gen_pre_df)

# Column labels line up positionally with the tuples built above.
summary_column_names = ['column', 'max', 'min', 'mean', 'stddev', 'median']
excel_ready_df = spark.createDataFrame(columns_summary_stats, summary_column_names)
excel_ready_df.show()


+--------+---+---+------------------+------------------+------+
|  column|max|min|              mean|            stddev|median|
+--------+---+---+------------------+------------------+------+
|  event1|  6|  1|               3.5|1.8708286933869707|   2.0|
|  event2| 15| 10|              12.5|1.8708286933869707|  11.0|
|party_id|  2|  0|1.3333333333333333| 0.816496580927726|   1.0|
|     CTU|  3|  0|               1.5|1.0488088481701516|   1.0|
+--------+---+---+------------------+------------------+------+



In [2]:
# trying to work with only one column in spark so we can simplify functions