In [6]:
"""
Zone 5, step 6: CTU imputations

After imputing rows for missing CTUs, create table with row-wise listing:		
Column name		Table 1
Imputation approach (min, max, median, cumsum, zero, FFill, etc.)		Table 1
Proportion of accounts that have more than:		Table 1
99% missing		
75% missing		
50% missing		
25% missing		
		
Differences in descriptive statistics between steps 6 and 3		Table 2
Delta min		
Delta max		
Delta mean		
Delta std		
Delta median	

Input files:
imputed_train_<date>.csv and preprocessing_<date>.csv
		
        
TO DO:
Sort by delta max desc
"""

from pyspark.sql import SparkSession
from pyspark.sql.functions import col

def start_spark_session():
    """
    Build the SparkSession used by this script.

    The session is configured with the application name
    "Python Spark SQL basic example" and a single placeholder config
    entry, then obtained through ``getOrCreate()``.

    Returns:
        The SparkSession object produced by the builder.
    """
    builder = SparkSession.builder
    builder = builder.appName("Python Spark SQL basic example")
    builder = builder.config("spark.some.config.option", "some-value")
    return builder.getOrCreate()

def calc_column_func(df, column, func):
    """
    Compute a single aggregate value for one column.

    Args:
        df: object exposing ``agg(dict)`` whose result exposes ``collect()``
            (a Spark DataFrame in this script).
        column: name of the column to aggregate.
        func: name of the aggregate function, e.g. 'max', 'min' or 'avg'.

    Returns:
        The first field of the first collected row of the aggregation.
    """
    aggregated = df.agg({column: func})
    first_row = aggregated.collect()[0]
    return first_row[0]


def get_descriptive_statistics_for_columns(df):
    """
    Collect the maximum, minimum and mean of every column of ``df``.

    Args:
        df: DataFrame whose columns are summarised; it must expose
            ``schema.names`` and be usable with ``calc_column_func``.

    Returns:
        A list with one tuple per column, in schema order, shaped as
        ``(column_name, maximum, minimum, mean)`` -- e.g.
        ``('event1', 3, 1, 2.0)`` -- suitable as input rows for
        ``spark.createDataFrame``.
    """
    # Take the column list from the DataFrame that was passed in. The
    # previous version read it from the module-level ``preprocessing_df``,
    # so every other DataFrame was summarised over the wrong column set.
    # The loop variable is deliberately not called ``col`` so that it does
    # not shadow ``pyspark.sql.functions.col`` imported at file level.
    return [
        (
            column_name,
            calc_column_func(df, column_name, 'max'),
            calc_column_func(df, column_name, 'min'),
            calc_column_func(df, column_name, 'avg'),
        )
        for column_name in df.schema.names
    ]


def drop_garbage_cols(df):
    """
    Return a copy of ``df`` without the leftover index-like columns.

    The columns removed are 'level_0', 'index', 'Unnamed: 0' and '_c0'.

    Args:
        df: object exposing ``select`` and ``drop`` (a Spark DataFrame in
            this script).

    Returns:
        The result of selecting all columns and then dropping the listed
        ones.
    """
    unwanted = ('level_0', 'index', 'Unnamed: 0', '_c0')
    return df.select('*').drop(*unwanted)


def get_delta_columns_df(joined_df):
    """
    Add delta columns comparing the preprocessing and imputed summaries.

    For each of min, max and mean, the value in the un-suffixed column is
    subtracted from the value in the matching ``*_pre`` column.

    Args:
        joined_df: DataFrame holding 'min_pre', 'max_pre', 'mean_pre' and
            'min', 'max', 'mean' columns.

    Returns:
        ``joined_df`` extended with 'delta_min', 'delta_max' and
        'delta_mean' columns.
    """
    return (
        joined_df
        .withColumn("delta_min", col("min_pre") - col("min"))
        .withColumn("delta_max", col("max_pre") - col("max"))
        .withColumn("delta_mean", col("mean_pre") - col("mean"))
    )
    
 
"""
**** MAIN *****
"""


# Session and the two CSV inputs that are compared against each other.
spark = start_spark_session()
preprocessing_file_name = "../preprocessing_2020_06_30_1.csv"
imputed_file_name = "../imputed_train_2020_06_30_1.csv"
#example_file_name = "../example1.csv"

# Column names of the per-column summary frames built below.
stats_column_names = ['column', 'max', 'min', 'mean']

# NOTE(review): the CSVs are loaded with only the "header" option, so no
# schema inference is requested; presumably every column is read as a
# string and 'max'/'min' compare lexicographically -- confirm this is
# intended.

# Summary of the preprocessing file.
preprocessing_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load(preprocessing_file_name)
)
preprocessing_columns_with_stats = get_descriptive_statistics_for_columns(preprocessing_df)
preprocessing_cols_stats_df = spark.createDataFrame(
    preprocessing_columns_with_stats,
    stats_column_names,
)
print(preprocessing_cols_stats_df.count())

# Summary of the imputed file.
imputed_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load(imputed_file_name)
)
imputed_columns_with_stats = get_descriptive_statistics_for_columns(imputed_df)
imputed_cols_stats_df = spark.createDataFrame(
    imputed_columns_with_stats,
    stats_column_names,
)
print(imputed_cols_stats_df.count())

# Suffix the preprocessing summary columns with '_pre' so they remain
# distinguishable after the join, then match rows on the column name.
pre_suffixed_columns = [
    col(name).alias(name + '_pre')
    for name in preprocessing_cols_stats_df.columns
]
preprocessing_cols_stats_df_re = preprocessing_cols_stats_df.select(*pre_suffixed_columns)
join_condition = (
    preprocessing_cols_stats_df_re["column_pre"] == imputed_cols_stats_df["column"]
)
joined_df = preprocessing_cols_stats_df_re.join(imputed_cols_stats_df, join_condition)

# Per-column deltas (preprocessing minus imputed), printed untruncated.
delta_columns_df = get_delta_columns_df(joined_df)
delta_report = delta_columns_df.select('column', 'delta_min', 'delta_max', 'delta_mean')
delta_report.show(n=45, truncate=False)


45
45
+----------------------------+---------+--------------------+----------------------+
|column                      |delta_min|delta_max           |delta_mean            |
+----------------------------+---------+--------------------+----------------------+
|expanding_cai_ins_grs_mrc   |0.0      |0.0                 |242.164572469429      |
|yr_month                    |0.0      |2.0                 |8.248874504497508     |
|event_inbound_interactions  |0.0      |0.0                 |0.0038466376698513827 |
|month                       |0.0      |0.0                 |-0.006062903445177348 |
|td_last_cai_ins_grs_vmc     |0.0      |0.0                 |0.0014165845494440443 |
|td_last_cai_ins_grs_vuc     |0.0      |0.0                 |5.114062678986507     |
|cai_ins_grs_mrc             |0.0      |0.0                 |-0.016975163022877382 |
|cai_ins_grs_rand            |0.0      |0.0                 |6.496808137061365E-4  |
|CTU                         |-1.0     |0.0                