In [26]:
"""
Zone 5 stage I aggregation step 2
Task description	Automation
After aggregating data to the day-level, create a table with row-wise listing of:	
1. Event column name	Table1
2. Total number of events for column over total number of events for ALL columns	Table1
	
Rationale: Proportion of events for a particular event column. Expected to be similar to zone 4 proportions	

Table 1	
	
	Num events / Total num events 
Column 1	2%
Column 2	5%
Column 3	3%
Column 4	5%
Column 5	1%
…	…
Example 
Date  	Purchase	Calls	Money spend
1/1/00	0	3	50
1/2/00	1	0	51
1/3/00	0	4	52
1/4/00	0	0	32
1/5/00	1	1	0
	   0.40	0.6	0.80

"""

from sys import argv
import pyspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

def start_spark_session():
    """
    Build and return the SparkSession used by this script.

    The session is named "Python Spark". getOrCreate() hands back an
    already-running session when there is one instead of starting another.
    """
    builder = SparkSession.builder.appName("Python Spark")
    # NOTE(review): this key/value pair looks like a documentation
    # placeholder rather than a real Spark setting -- confirm before relying on it.
    builder = builder.config("spark.some.config.option", "some-value")
    return builder.getOrCreate()

def drop_garbage_cols(df):
    """
    Drop the columns that are not wanted in the event-rate table.

    The columns removed are: 'level_0', 'index', 'Unnamed: 0', '_c0',
    'party_id', 'event_date', 'CTU' and 'event_id'.

    Parameters:
        df: frame exposing ``drop(*column_names)``; the main script passes
            a Spark DataFrame.

    Returns:
        The result of ``df.drop(...)``. For a Spark DataFrame this is a new
        DataFrame; listed names that are absent from ``df`` are ignored by
        Spark's ``drop``, so any subset of them may be present.
    """
    columns_to_drop = (
        'level_0', 'index', 'Unnamed: 0', '_c0',
        'party_id', 'event_date', 'CTU', 'event_id',
    )
    # drop() already returns a new frame and leaves `df` untouched, so the
    # previous df.select('*') copy was redundant and has been removed.
    return df.drop(*columns_to_drop)

"""
*** MAIN ***
"""

# Alternative input file, kept for reference:
#file_name = "../general_preprocessing_2020_06_30_1.csv"
file_name = "../preprocessing_2020_06_30_1.csv"
spark = start_spark_session()

# Load the CSV, taking the column names from its header row.
# NOTE(review): no schema or inferSchema option is given, so the columns are
# presumably loaded as strings and the `!= 0` test below relies on Spark's
# implicit casting -- confirm this matches the data.
gen_pre_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load(file_name)
)
num_rows = gen_pre_df.count()

# One aggregate per input column: the number of rows whose value is != 0,
# divided by the total row count. when() without otherwise() yields NULL for
# rows that fail the condition, and count() skips NULLs.
event_rate_df = gen_pre_df.select(
    [
        (F.count(F.when(gen_pre_df[name] != 0, name)) / num_rows).alias(name)
        for name in gen_pre_df.columns
    ]
)
event_rate_df_clean = drop_garbage_cols(event_rate_df)

# Turn the single-row result into a two-column table (column name, rate).
# The expression is not assigned; it is the value shown when run as a
# notebook cell.
(
    event_rate_df_clean
    .toPandas()
    .transpose()
    .reset_index()
    .rename(columns={0: 'Column event rate ', 'index': 'Column names'})
)

Unnamed: 0,Column names,Column event rate
0,cai_ins_grs_vmc,0.858343
1,cai_ins_grs_mrc,0.882095
2,cai_ins_grs_erc,0.934077
3,cai_ins_grs_evmc,0.107688
4,cai_ins_grs_vuc,0.049201
5,cai_ins_grs_evnt_1,0.109933
6,cai_ins_grs_evnt_2,0.109616
7,cai_ins_grs_evnt_3,0.109616
8,cai_ins_grs_rand,0.31759
9,cai_factor_age,0.998861
