In [2]:
"""
Zone 5, step 5: CTU report.
For every CTU we compute summary statistics (min, max, mean, stddev, ...) for each column.

	Create a table with row-wise listing of:			
1	CTU			Table 1
2	Min and max event dates in each CTU			Table 1
3	Length of each CTU in days			Table 2
4	Event descriptive stats across CTUs and all batches			Table 1
	Minimum			
	Maximum			
	Mean			
5	Target event stats across CTUs and all batches:			Table 1
	Minimum			
	Maximum			
	Mean	
    
    
"""

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import concat, col, lit

def start_spark_session():
    """
    Obtain the SparkSession used by this notebook.

    The session is requested through ``SparkSession.builder`` with a fixed
    application name and one placeholder config entry, then ``getOrCreate()``.

    Returns:
        The SparkSession produced by ``getOrCreate()``.
    """
    builder = SparkSession.builder
    builder = builder.appName("Python Spark SQL basic example")
    builder = builder.config("spark.some.config.option", "some-value")
    return builder.getOrCreate()


def calc_summary_func(df, column, func, relative_error=0.01):
    """
    Compute one summary statistic for a single column of a Spark DataFrame.

    Args:
        df: Spark DataFrame to aggregate.
        column: name of the column to summarise.
        func: aggregate name accepted by ``DataFrame.agg`` in its dict form
            (e.g. 'max', 'min', 'avg'), or the special value 'median'.
        relative_error: accuracy passed to ``DataFrame.approxQuantile`` when
            ``func == 'median'`` (0 requests an exact quantile at higher cost).
            Ignored for every other function.

    Returns:
        The scalar aggregate value (first cell of the single result row).
        For 'median', the approximate 0.5 quantile, or None when
        ``approxQuantile`` returns no value.
    """
    if func == 'median':
        # This branch used to return a hard-coded 0 placeholder, which silently
        # reported a wrong median. approxQuantile needs a numeric column, so the
        # column is cast to double first (CSV-loaded columns may be strings).
        # A fixed alias avoids name-resolution issues with unusual column names.
        numeric_df = df.selectExpr(f"cast(`{column}` as double) as median_input")
        quantiles = numeric_df.approxQuantile("median_input", [0.5], relative_error)
        # An empty result means there was nothing to compute a quantile from.
        return quantiles[0] if quantiles else None

    return df.agg({column: func}).collect()[0][0]


def create_summary_stats_df(all_columns_summary_stats, CTU_num, spark_session=None):
    """
    Build a two-column Spark DataFrame holding the summary stats of one CTU.

    The result has one row per (column, statistic) pair:

        column_plus_summary   <CTU_num>
        event1_min            1
        event1_max            5
        event2_min            101
        event2_max            199
        ...

    Args:
        all_columns_summary_stats: list of 2-tuples such as
            [('event1_min', '1'), ('event1_max', '5')].
        CTU_num: CTU identifier; ``str(CTU_num)`` becomes the name of the
            value column.
        spark_session: session used to create the DataFrame. When omitted, the
            notebook-level ``spark`` variable is used, so it must already exist.

    Returns:
        The DataFrame created by ``createDataFrame``.
    """
    # Prefer an explicitly supplied session; fall back to the module-level
    # `spark` so existing two-argument calls keep working unchanged.
    session = spark if spark_session is None else spark_session
    column_names = ['column_plus_summary', str(CTU_num)]
    return session.createDataFrame(all_columns_summary_stats, column_names)


def get_ctu_summary_stats_for_every_column(df):
    """
    Collect max, min and mean for every column of a DataFrame.

    Args:
        df: Spark DataFrame whose columns (``df.schema.names``) are summarised.

    Returns:
        A flat list of 2-tuples ``(<column>_<stat>, <value as str>)``, emitted
        per column in the order max, min, mean, e.g.
        [('column1_max', '5'), ('column1_min', '1'), ('column1_mean', '3.0'), ...]
    """
    # (label suffix, aggregate name) pairs, in the order emitted for each column.
    stat_specs = (('_max', 'max'), ('_min', 'min'), ('_mean', 'avg'))

    stats_rows = []
    for column_name in df.schema.names:
        for suffix, aggregate in stat_specs:
            value = calc_summary_func(df, column_name, aggregate)
            # Values are stringified so every stat fits one string-typed column.
            stats_rows.append((column_name + suffix, str(value)))

    return stats_rows



def drop_garbage_cols(df):
    """
    Return ``df`` without the leftover index-style columns.

    The columns 'level_0', 'index', 'Unnamed: 0' and '_c0' are passed to
    ``drop``; all other columns are kept.

    Args:
        df: input DataFrame.

    Returns:
        The DataFrame resulting from ``df.select('*').drop(...)``.
    """
    garbage_columns = ('level_0', 'index', 'Unnamed: 0', '_c0')
    return df.select('*').drop(*garbage_columns)



def join_summary_stats_dfs(cols_summary_stats_all_ctus):
    """
    Combine the per-CTU summary DataFrames side by side into one DataFrame.

    Args:
        cols_summary_stats_all_ctus: non-empty list of DataFrames, one per CTU,
            each exposing a ``column_plus_summary`` key column plus one value
            column.

    Returns:
        A DataFrame built by joining every list element onto the first one on
        equal ``column_plus_summary`` (using ``join``'s default join type) and
        dropping the right-hand copy of the key after each join.
    """
    # The first CTU frame seeds the result; the rest are joined onto it in order.
    joined = cols_summary_stats_all_ctus[0].select("*")
    remaining = cols_summary_stats_all_ctus[1:]

    for next_ctu_df in remaining:
        same_stat = joined.column_plus_summary == next_ctu_df.column_plus_summary
        joined = joined.join(next_ctu_df, same_stat)
        # Keep a single key column: remove the one contributed by the right side.
        joined = joined.drop(next_ctu_df.column_plus_summary)

    return joined


# ***** MAIN *****
# Build one summary-stats DataFrame per CTU, join them side by side and show
# the result.

#file_name = "../example.csv"
file_name = "../imputed_predict_2020_06_30_1.csv"
spark = start_spark_session()
# No schema inference is requested, so every column is loaded as a string.
df = spark.read.format("csv").option("header", "true").load(file_name)

# Determine the number of CTUs from the largest CTU value. The comparison is
# done on a numeric cast: on the raw string column 'max' is lexicographic and
# would rank "9.0" above "10.0".
max_CTU = df.selectExpr("max(cast(CTU as double))").collect()[0][0]
if max_CTU is None:
    raise ValueError(f"No numeric CTU values found in {file_name}")
max_CTU_int = int(max_CTU)  # e.g. 9.0 -> 9

# NOTE(review): the per-column 'max'/'min' stats computed below still run on
# string columns and therefore compare values as text; consider casting the
# numeric columns before aggregating - confirm against the expected report.
cols_summary_stats_all_ctus = []
# range() excludes its stop value, so add 1 to include the highest CTU as well.
for CTU_num in range(max_CTU_int + 1):
    df_single_CTU = df.filter(f"CTU == { CTU_num }")  # rows of one CTU only
    df_single_CTU_clean = drop_garbage_cols(df_single_CTU)
    # list of (column_stat, value) tuples for this CTU
    cols_summary_stats = get_ctu_summary_stats_for_every_column(df_single_CTU_clean)
    cols_summary_stats_df = create_summary_stats_df(cols_summary_stats, CTU_num)
    cols_summary_stats_all_ctus.append(cols_summary_stats_df)

joined_ctu_summary_stats_df = join_summary_stats_dfs(cols_summary_stats_all_ctus)
# Fixed call syntax: the row count and `truncate` are separate arguments.
joined_ctu_summary_stats_df.select(['column_plus_summary', '0', '1', '2', '4']).show(200, truncate=False)


+--------------------------------+------------------+------------------+------------------+------------------+
|column_plus_summary             |0                 |1                 |2                 |4                 |
+--------------------------------+------------------+------------------+------------------+------------------+
|cai_ins_grs_erc_max             |99.0              |99.0              |99.0              |99.0              |
|cai_factor_1_mean               |3.9377252703243895|3.8631802392898495|3.9347247800286476|3.9361902963941855|
|cai_ins_grs_evmc_max            |99.50067060851471 |97.71413097044277 |99.88873821354582 |99.96931720656956 |
|td_last_cai_ins_grs_evnt_3_mean |47.46776131357629 |47.620802778849864|47.7646818088807  |47.69378893713423 |
|cai_factor_age_max              |59.0              |59.0              |59.0              |59.0              |
|expanding_cai_ins_grs_mrc_mean  |3840.47456948338  |3585.975299112312 |3343.8246367914876|2863.385690013215 |
|

In [4]:
# Display up to 200 untruncated rows of the joined report for the listed CTU
# columns (kept in this exact order, which the rendered table reflects).
joined_ctu_summary_stats_df.select(
    ['column_plus_summary', '0', '1', '3', '2', '4']
).show(n=200, truncate=False)

+---------------------------------+-----------------------+----------------------+----------------------+-----------------------+-----------------------+
|column_plus_summary              |0                      |1                     |3                     |2                      |4                      |
+---------------------------------+-----------------------+----------------------+----------------------+-----------------------+-----------------------+
|cai_ins_grs_erc_max              |99.0                   |99.0                  |99.0                  |99.0                   |99.0                   |
|cai_factor_1_mean                |3.9377252703243895     |3.8631802392898495    |3.9379072441548484    |3.9347247800286476     |3.9361902963941855     |
|cai_ins_grs_evmc_max             |99.50067060851471      |97.71413097044277     |99.72268507175163     |99.88873821354582      |99.96931720656956      |
|td_last_cai_ins_grs_evnt_3_mean  |47.46776131357629      |47.62080277884986