# ANOVOS - Statistic Generator
The following notebook lists the functions of the "stats generator" module provided in the ANOVOS package and shows how each of them can be invoked.
* [Global Summary](#Global-Summary)
* [Measures of Counts](#Measures-of-Counts)
* [Measures of Central Tendency](#Measures-of-Central-Tendency)
* [Measures of Cardinality](#Measures-of-Cardinality)
* [Measures of Dispersion](#Measures-of-Dispersion)
* [Measures of Percentiles](#Measures-of-Percentiles)
* [Measures of Shape](#Measures-of-Shape)

**Setting Spark Session**

In [1]:
# Set the run type variable; the session-setup cell below branches on this value.
# "ak8s" selects the Azure Kubernetes configuration, every other value uses the shared anovos session.
run_type = "local" # "local", "emr", "databricks", "ak8s"

In [3]:
# Spark session setup.
# - run_type "ak8s" (Azure Kubernetes): build the Spark configuration explicitly and create the session here.
# - any other run_type: import the session objects from anovos.shared.spark.
import os
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

if run_type == "ak8s":
    # Placeholders - replace every value below before running on Azure Kubernetes.
    # fs_path is the Hadoop configuration KEY under which the SAS token (auth_key) is stored;
    # master_url is the Kubernetes master that Spark submits to. These are two different
    # settings and must not share one variable: a "spark.hadoop.fs..." key is not a valid master URL.
    fs_path="<insert conf spark.hadoop.fs key here> ex: spark.hadoop.fs.azure.sas.<container>.<account_name>.blob.core.windows.net"
    auth_key="<insert value of sas_token here>"
    master_url="<insert kubernetes master url here> ex: k8s://https://<api_server_host>:<port>"
    docker_image="<insert name docker image here>"
    kubernetes_namespace ="<insert kubernetes namespace here>"

    # Create Spark config for our Kubernetes based cluster manager
    sparkConf = SparkConf()
    sparkConf.setMaster(master_url)
    sparkConf.setAppName("Anovos_pipeline")
    sparkConf.set("spark.submit.deployMode","client")
    sparkConf.set("spark.kubernetes.container.image", docker_image)
    sparkConf.set("spark.kubernetes.namespace", kubernetes_namespace)
    sparkConf.set("spark.executor.instances", "4")
    sparkConf.set("spark.executor.cores", "4")
    sparkConf.set("spark.executor.memory", "16g")
    sparkConf.set("spark.kubernetes.pyspark.pythonVersion", "3")
    sparkConf.set("spark.kubernetes.authenticate.driver.serviceAccountName", "spark")
    # Register the SAS token for the Azure blob container under its Hadoop configuration key
    sparkConf.set(fs_path,auth_key)
    sparkConf.set("spark.kubernetes.authenticate.serviceAccountName", "spark")
    sparkConf.set("spark.jars.packages", "org.apache.hadoop:hadoop-azure:3.2.0,com.microsoft.azure:azure-storage:8.6.3,io.github.histogrammar:histogrammar_2.12:1.0.20,io.github.histogrammar:histogrammar-sparksql_2.12:1.0.20,org.apache.spark:spark-avro_2.12:3.2.1")

    # Initialize our Spark cluster, this will actually
    # generate the worker nodes.
    spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()
    sc = spark.sparkContext

# For other run types import from anovos.shared.
else:
    # NOTE(review): the wildcard import is expected to provide `spark` and `sc`,
    # which later cells rely on - confirm against anovos.shared.spark.
    from anovos.shared.spark import *
    auth_key = "NA"

In [3]:
import warnings

# Keep cell outputs readable: suppress Python warnings and restrict Spark logging to errors.
warnings.filterwarnings('ignore')
sc.setLogLevel("ERROR")

**Input/Output Path**

In [4]:
# Path of the CSV dataset that is loaded with read_dataset in the cells below
inputPath = "../data/income_dataset/csv"
# Presumably the output directory for data analyzer results; not referenced by the cells visible in this notebook
outputPath = "../output/income_dataset/data_analyzer"

In [5]:
from anovos.data_ingest.data_ingest import read_dataset

In [6]:
# Load the CSV dataset from inputPath (header row, comma delimiter, schema inference enabled)
# and discard the "dt_1" and "dt_2" columns in the same expression.
df = read_dataset(
    spark,
    file_path=inputPath,
    file_type="csv",
    file_configs={"header": "True", "delimiter": ",", "inferSchema": "True"},
).drop("dt_1", "dt_2")
# Preview the first 5 rows (note: toPandas() converts the whole DataFrame before head(5) trims it)
df.toPandas().head(5)

Unnamed: 0,ifa,age,workclass,fnlwgt,logfnl,empty,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,1a,,State-gov,77516.0,4.889391,,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,UnitedStates,<=50K
1,2a,,Self-emp-not-inc,83311.0,4.920702,,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,UnitedStates,<=50K
2,3a,38.0,Private,215646.0,5.333741,,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,UnitedStates,<=50K
3,4a,53.0,Private,234721.0,5.370552,,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,UnitedStates,<=50K
4,5a,,Private,338409.0,5.529442,,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K


# Global Summary
- API specification of function **global_summary** can be found <a href="https://docs.anovos.ai/api/data_analyzer/stats_generator.html">here</a>

In [7]:
from anovos.data_analyzer.stats_generator import global_summary

In [8]:
# Example 1 - with mandatory arguments only (all other arguments keep their default values)
odf = global_summary(spark, df)
odf.toPandas()

[Stage 6:>                                                        (0 + 12) / 12]                                                                                

Unnamed: 0,metric,value
0,rows_count,32561
1,columns_count,18
2,numcols_count,7
3,numcols_name,"education-num, fnlwgt, hours-per-week, logfnl,..."
4,catcols_count,11
5,catcols_name,"education, marital-status, workclass, empty, n..."
6,othercols_count,0
7,othercols_name,


In [9]:
# Example 2 - 'all' columns, excluding those listed in drop_cols (here: 'ifa')
odf = global_summary(spark, idf = df, list_of_cols='all', drop_cols=['ifa'])
odf.toPandas()

Unnamed: 0,metric,value
0,rows_count,32561
1,columns_count,17
2,numcols_count,7
3,numcols_name,"education-num, fnlwgt, hours-per-week, logfnl,..."
4,catcols_count,10
5,catcols_name,"education, marital-status, workclass, empty, n..."
6,othercols_count,0
7,othercols_name,


In [10]:
# Example 3 - selected columns, passed explicitly through list_of_cols
odf = global_summary(spark, idf = df, list_of_cols= ['age','sex','race','workclass','fnlwgt'])
odf.toPandas()

Unnamed: 0,metric,value
0,rows_count,32561
1,columns_count,5
2,numcols_count,2
3,numcols_name,"fnlwgt, age"
4,catcols_count,3
5,catcols_name,"workclass, sex, race"
6,othercols_count,0
7,othercols_name,


# Measures of Counts

- API specification of function **measures_of_counts** can be found <a href="https://docs.anovos.ai/api/data_analyzer/stats_generator.html">here</a>
- Non-zero count/% is calculated only for numerical columns

In [11]:
from anovos.data_analyzer.stats_generator import measures_of_counts, nonzeroCount_computation

In [12]:
# Example 1 - with mandatory arguments only (all other arguments keep their default values)
odf = measures_of_counts(spark, df)
odf.toPandas()



Unnamed: 0,attribute,fill_count,fill_pct,missing_count,missing_pct,nonzero_count,nonzero_pct
0,age,32500,0.9981,61,0.0019,32500.0,0.9981
1,capital-gain,32548,0.9996,13,0.0004,2710.0,0.0832
2,capital-loss,32549,0.9996,12,0.0004,1519.0,0.0467
3,education,32040,0.984,521,0.016,,
4,education-num,32530,0.999,31,0.001,32530.0,0.999
5,empty,0,0.0,32561,1.0,,
6,fnlwgt,32546,0.9995,15,0.0005,32546.0,0.9995
7,hours-per-week,32452,0.9967,109,0.0033,32452.0,0.9967
8,ifa,32561,1.0,0,0.0,,
9,income,32561,1.0,0,0.0,,


In [13]:
# Example 2 - 'all' columns, excluding those listed in drop_cols (here: 'ifa')
odf = measures_of_counts(spark, idf = df, list_of_cols='all', drop_cols=['ifa'])
odf.toPandas()



Unnamed: 0,attribute,fill_count,fill_pct,missing_count,missing_pct,nonzero_count,nonzero_pct
0,age,32500,0.9981,61,0.0019,32500.0,0.9981
1,capital-gain,32548,0.9996,13,0.0004,2710.0,0.0832
2,capital-loss,32549,0.9996,12,0.0004,1519.0,0.0467
3,education,32040,0.984,521,0.016,,
4,education-num,32530,0.999,31,0.001,32530.0,0.999
5,empty,0,0.0,32561,1.0,,
6,fnlwgt,32546,0.9995,15,0.0005,32546.0,0.9995
7,hours-per-week,32452,0.9967,109,0.0033,32452.0,0.9967
8,income,32561,1.0,0,0.0,,
9,logfnl,12168,0.3737,20393,0.6263,12168.0,0.3737


In [14]:
# Example 3 - selected columns, passed explicitly through list_of_cols
odf = measures_of_counts(spark, idf = df, list_of_cols= ['age','sex','race','workclass','fnlwgt'])
odf.toPandas()



Unnamed: 0,attribute,fill_count,fill_pct,missing_count,missing_pct,nonzero_count,nonzero_pct
0,age,32500,0.9981,61,0.0019,32500.0,0.9981
1,fnlwgt,32546,0.9995,15,0.0005,32546.0,0.9995
2,race,32247,0.9904,314,0.0096,,
3,sex,32557,0.9999,4,0.0001,,
4,workclass,32558,0.9999,3,0.0001,,


In [15]:
# Example 4 - list_of_cols restricted to numerical columns only
odf = measures_of_counts(spark, idf = df, list_of_cols= ['age','education-num','capital-gain'])
odf.toPandas()



Unnamed: 0,attribute,fill_count,fill_pct,missing_count,missing_pct,nonzero_count,nonzero_pct
0,age,32500,0.9981,61,0.0019,32500,0.9981
1,capital-gain,32548,0.9996,13,0.0004,2710,0.0832
2,education-num,32530,0.999,31,0.001,32530,0.999


In [16]:
# Example 5 - only categorical columns (a user warning is shown because the non-zero computation
# does not happen when no numerical column is present)
odf = measures_of_counts(spark, idf = df, list_of_cols= ['sex','race','workclass'])
odf.toPandas()

Unnamed: 0,attribute,fill_count,fill_pct,missing_count,missing_pct,nonzero_count,nonzero_pct
0,race,32247,0.9904,314,0.0096,,
1,sex,32557,0.9999,4,0.0001,,
2,workclass,32558,0.9999,3,0.0001,,


# Measures of Central Tendency

- API specification of function **measures_of_centralTendency** can be found <a href="https://docs.anovos.ai/api/data_analyzer/stats_generator.html">here</a>
- Mode & Mode% calculated only for discrete columns (string + integer datatypes)

In [17]:
from anovos.data_analyzer.stats_generator import measures_of_centralTendency

In [18]:
# Example 1 - with mandatory arguments only (all other arguments keep their default values)
odf = measures_of_centralTendency(spark, df)
odf.toPandas()

                                                                                

Unnamed: 0,attribute,mean,median,mode,mode_rows,mode_pct
0,age,38.5065,37.0,36,897.0,0.0276
1,capital-gain,1077.696,0.0,0,29838.0,0.9167
2,capital-loss,87.336,0.0,0,31030.0,0.9533
3,education,,,HS-grad,10490.0,0.3274
4,education-num,10.081,10.0,9,10491.0,0.3225
5,empty,,,,,
6,fnlwgt,189781.8318,178353.0,164190,13.0,0.0004
7,hours-per-week,40.2497,40.0,40,15215.0,0.4688
8,ifa,,,99a,1.0,0.0
9,income,,,<=50K,24720.0,0.7592


In [19]:
# Example 2 - 'all' columns, excluding those listed in drop_cols (here: 'ifa')
odf = measures_of_centralTendency(spark, idf = df, list_of_cols='all', drop_cols=['ifa'])
odf.toPandas()

                                                                                

Unnamed: 0,attribute,mean,median,mode,mode_rows,mode_pct
0,age,38.5065,37.0,36,897.0,0.0276
1,capital-gain,1077.696,0.0,0,29838.0,0.9167
2,capital-loss,87.336,0.0,0,31030.0,0.9533
3,education,,,HS-grad,10490.0,0.3274
4,education-num,10.081,10.0,9,10491.0,0.3225
5,empty,,,,,
6,fnlwgt,189781.8318,178353.0,164190,13.0,0.0004
7,hours-per-week,40.2497,40.0,40,15215.0,0.4688
8,income,,,<=50K,24720.0,0.7592
9,logfnl,5.2055,5.2524,,,


In [20]:
# Example 3 - selected columns, passed explicitly through list_of_cols
odf = measures_of_centralTendency(spark, idf = df, list_of_cols= ['age','sex','race','workclass','fnlwgt'])
odf.toPandas()



Unnamed: 0,attribute,mean,median,mode,mode_rows,mode_pct
0,age,38.5065,37.0,36,897,0.0276
1,fnlwgt,189781.8318,178353.0,164190,13,0.0004
2,race,,,White,27791,0.8618
3,sex,,,Male,21783,0.6691
4,workclass,,,Private,22685,0.6968


In [21]:
# Example 4 - list_of_cols restricted to numerical columns only
odf = measures_of_centralTendency(spark, idf = df, list_of_cols= ['age','education-num','capital-gain','logfnl'])
odf.toPandas()

Unnamed: 0,attribute,mean,median,mode,mode_rows,mode_pct
0,age,38.5065,37.0,36.0,897.0,0.0276
1,capital-gain,1077.696,0.0,0.0,29838.0,0.9167
2,education-num,10.081,10.0,9.0,10491.0,0.3225
3,logfnl,5.2055,5.2524,,,


In [22]:
# Example 5 - list_of_cols restricted to categorical columns only
odf = measures_of_centralTendency(spark, idf = df, list_of_cols= ['sex','race','workclass'])
odf.toPandas()

                                                                                

Unnamed: 0,attribute,mean,median,mode,mode_rows,mode_pct
0,race,,,White,27791,0.8618
1,sex,,,Male,21783,0.6691
2,workclass,,,Private,22685,0.6968


# Measures of Cardinality

- API specification of function **measures_of_cardinality** can be found <a href="https://docs.anovos.ai/api/data_analyzer/stats_generator.html">here</a>
- Calculated only for discrete columns (string + integer datatypes)

In [23]:
from anovos.data_analyzer.stats_generator import measures_of_cardinality

In [24]:
# Example 1 - with mandatory arguments only (all other arguments keep their default values)
odf = measures_of_cardinality(spark, df)
odf.toPandas()

Unnamed: 0,attribute,unique_values,IDness
0,age,68,0.0021
1,capital-gain,120,0.0037
2,capital-loss,93,0.0029
3,education,15,0.0005
4,education-num,16,0.0005
5,empty,0,
6,fnlwgt,20074,0.6168
7,hours-per-week,88,0.0027
8,ifa,32353,0.9936
9,income,2,0.0001


In [25]:
# Example 2 - with mandatory arguments and use_approx_unique_count set to False (all other arguments keep their default values)
# Setting this flag to False ensures that exact unique value counts are used to compute IDness
odf = measures_of_cardinality(spark, df, use_approx_unique_count=False)
odf.toPandas()

Unnamed: 0,attribute,unique_values,IDness
0,age,69,0.0021
1,capital-gain,119,0.0037
2,capital-loss,92,0.0028
3,education,16,0.0005
4,education-num,16,0.0005
5,empty,0,
6,fnlwgt,21640,0.6649
7,hours-per-week,89,0.0027
8,ifa,32561,1.0
9,income,2,0.0001


In [26]:
# Example 3 - with mandatory arguments and rsd=0.02 (all other arguments keep their default values)
# rsd (relative standard deviation) is used when use_approx_unique_count is True, which is the default
# rsd is 0.05 by default; a smaller value (0.02) computes more accurate unique value counts
odf = measures_of_cardinality(spark, df, rsd=0.02)
odf.toPandas()

                                                                                

Unnamed: 0,attribute,unique_values,IDness
0,age,69,0.0021
1,capital-gain,119,0.0037
2,capital-loss,93,0.0029
3,education,16,0.0005
4,education-num,16,0.0005
5,empty,0,
6,fnlwgt,21665,0.6657
7,hours-per-week,89,0.0027
8,ifa,32410,0.9954
9,income,2,0.0001


In [27]:
# Example 4 - 'all' columns, excluding those listed in drop_cols (here: 'ifa')
odf = measures_of_cardinality(spark, idf = df, list_of_cols='all', drop_cols=['ifa'])
odf.toPandas()

Unnamed: 0,attribute,unique_values,IDness
0,age,68,0.0021
1,capital-gain,120,0.0037
2,capital-loss,93,0.0029
3,education,15,0.0005
4,education-num,16,0.0005
5,empty,0,
6,fnlwgt,20074,0.6168
7,hours-per-week,88,0.0027
8,income,2,0.0001
9,marital-status,7,0.0002


In [28]:
# Example 5 - selected columns, passed explicitly through list_of_cols
odf = measures_of_cardinality(spark, idf = df, list_of_cols= ['age','sex','race','workclass','fnlwgt'])
odf.toPandas()

Unnamed: 0,attribute,unique_values,IDness
0,age,68,0.0021
1,fnlwgt,20074,0.6168
2,race,9,0.0003
3,sex,3,0.0001
4,workclass,10,0.0003


In [29]:
# Example 6 - list_of_cols restricted to numerical columns only
odf = measures_of_cardinality(spark, idf = df, list_of_cols= ['age','education-num','capital-gain','logfnl'])
odf.toPandas()

Unnamed: 0,attribute,unique_values,IDness
0,age,68,0.0021
1,capital-gain,120,0.0037
2,education-num,16,0.0005
3,logfnl,10519,0.8645


In [30]:
# Example 7 - list_of_cols restricted to categorical columns only
odf = measures_of_cardinality(spark, idf = df, list_of_cols= ['sex','race','workclass'])
odf.toPandas()

Unnamed: 0,attribute,unique_values,IDness
0,race,9,0.0003
1,sex,3,0.0001
2,workclass,10,0.0003


# Measures of Dispersion

- API specification of function **measures_of_dispersion** can be found <a href="https://docs.anovos.ai/api/data_analyzer/stats_generator.html">here</a>
- Supports only numerical columns

In [31]:
from anovos.data_analyzer.stats_generator import measures_of_dispersion

In [32]:
# Example 1 - with mandatory arguments only (all other arguments keep their default values)
odf = measures_of_dispersion(spark, df)
odf.toPandas()

Unnamed: 0,attribute,stddev,variance,cov,IQR,range
0,age,13.5085,182.4796,0.3508,20.0,68.0
1,capital-gain,7386.6249,54562230.0,6.8541,0.0,99999.0
2,capital-loss,403.031,162434.0,4.6147,0.0,4356.0
3,education-num,2.5725,6.6178,0.2552,3.0,15.0
4,fnlwgt,105563.0645,11143560000.0,0.5562,119179.0,1472420.0
5,hours-per-week,11.9143,141.9505,0.296,5.0,93.0
6,logfnl,0.2742,0.0752,0.0527,0.3052,1.8051


In [33]:
# Example 2 - 'all' columns, excluding those listed in drop_cols (here: 'capital-loss')
odf = measures_of_dispersion(spark, idf = df, list_of_cols='all', drop_cols=['capital-loss'])
odf.toPandas()

Unnamed: 0,attribute,stddev,variance,cov,IQR,range
0,age,13.5085,182.4796,0.3508,20.0,68.0
1,capital-gain,7386.6249,54562230.0,6.8541,0.0,99999.0
2,education-num,2.5725,6.6178,0.2552,3.0,15.0
3,fnlwgt,105563.0645,11143560000.0,0.5562,119179.0,1472420.0
4,hours-per-week,11.9143,141.9505,0.296,5.0,93.0
5,logfnl,0.2742,0.0752,0.0527,0.3052,1.8051


In [34]:
# Example 3 - selected numerical columns, passed explicitly through list_of_cols
odf = measures_of_dispersion(spark, idf = df, list_of_cols= ['age','education-num','capital-gain','logfnl'])
odf.toPandas()

Unnamed: 0,attribute,stddev,variance,cov,IQR,range
0,age,13.5085,182.4796,0.3508,20.0,68.0
1,capital-gain,7386.6249,54562230.0,6.8541,0.0,99999.0
2,education-num,2.5725,6.6178,0.2552,3.0,15.0
3,logfnl,0.2742,0.0752,0.0527,0.3052,1.8051


# Measures of Percentiles

- API specification of function **measures_of_percentiles** can be found <a href="https://docs.anovos.ai/api/data_analyzer/stats_generator.html">here</a>
- Supports only numerical columns

In [35]:
from anovos.data_analyzer.stats_generator import measures_of_percentiles

In [36]:
# Example 1 - with mandatory arguments only (all other arguments keep their default values)
odf = measures_of_percentiles(spark, df)
odf.toPandas()

Unnamed: 0,attribute,min,1%,5%,10%,25%,50%,75%,90%,95%,99%,max
0,age,17.0,17.0,19.0,22.0,28.0,37.0,48.0,58.0,63.0,73.0,85.0
1,capital-gain,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5013.0,15024.0,99999.0
2,capital-loss,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1980.0,4356.0
3,education-num,1.0,3.0,5.0,7.0,9.0,10.0,12.0,13.0,14.0,16.0,16.0
4,fnlwgt,12285.0,27153.0,39460.0,65706.0,117814.0,178353.0,236993.0,329026.0,379522.0,509866.0,1484705.0
5,hours-per-week,1.0,8.0,18.0,24.0,40.0,40.0,45.0,55.0,60.0,72.0,94.0
6,logfnl,4.2836,4.4322,4.5937,4.8203,5.0729,5.2524,5.3781,5.5178,5.5768,5.7073,6.0887


In [37]:
# Example 2 - 'all' columns, excluding those listed in drop_cols (here: 'capital-gain')
odf = measures_of_percentiles(spark, idf = df, list_of_cols='all', drop_cols=['capital-gain'])
odf.toPandas()

Unnamed: 0,attribute,min,1%,5%,10%,25%,50%,75%,90%,95%,99%,max
0,age,17.0,17.0,19.0,22.0,28.0,37.0,48.0,58.0,63.0,73.0,85.0
1,capital-loss,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1980.0,4356.0
2,education-num,1.0,3.0,5.0,7.0,9.0,10.0,12.0,13.0,14.0,16.0,16.0
3,fnlwgt,12285.0,27153.0,39460.0,65706.0,117814.0,178353.0,236993.0,329026.0,379522.0,509866.0,1484705.0
4,hours-per-week,1.0,8.0,18.0,24.0,40.0,40.0,45.0,55.0,60.0,72.0,94.0
5,logfnl,4.2836,4.4322,4.5937,4.8203,5.0729,5.2524,5.3781,5.5178,5.5768,5.7073,6.0887


In [38]:
# Example 3 - selected numerical columns, passed explicitly through list_of_cols
odf = measures_of_percentiles(spark, idf = df, list_of_cols= ['age','education-num','capital-gain','logfnl'])
odf.toPandas()

Unnamed: 0,attribute,min,1%,5%,10%,25%,50%,75%,90%,95%,99%,max
0,age,17.0,17.0,19.0,22.0,28.0,37.0,48.0,58.0,63.0,73.0,85.0
1,capital-gain,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5013.0,15024.0,99999.0
2,education-num,1.0,3.0,5.0,7.0,9.0,10.0,12.0,13.0,14.0,16.0,16.0
3,logfnl,4.2836,4.4322,4.5937,4.8203,5.0729,5.2524,5.3781,5.5178,5.5768,5.7073,6.0887


# Measures of Shape

- API specification of function **measures_of_shape** can be found <a href="https://docs.anovos.ai/api/data_analyzer/stats_generator.html">here</a>
- Supports only numerical columns

In [39]:
from anovos.data_analyzer.stats_generator import measures_of_shape

In [40]:
# Example 1 - with mandatory arguments only (all other arguments keep their default values)
odf = measures_of_shape(spark, df)
odf.toPandas()

Unnamed: 0,attribute,skewness,kurtosis
0,education-num,-0.3116,0.6236
1,fnlwgt,1.447,6.217
2,hours-per-week,-0.0756,1.9953
3,logfnl,-0.854,0.8365
4,capital-gain,11.9516,154.7243
5,age,0.5128,-0.3418
6,capital-loss,4.5935,20.3642


In [41]:
# Example 2 - 'all' columns, excluding those listed in drop_cols (here: 'capital-gain')
odf = measures_of_shape(spark, idf = df, list_of_cols='all', drop_cols=['capital-gain'])
odf.toPandas()

Unnamed: 0,attribute,skewness,kurtosis
0,education-num,-0.3116,0.6236
1,fnlwgt,1.447,6.217
2,hours-per-week,-0.0756,1.9953
3,logfnl,-0.854,0.8365
4,age,0.5128,-0.3418
5,capital-loss,4.5935,20.3642


In [42]:
# Example 3 - selected numerical columns, passed explicitly through list_of_cols
odf = measures_of_shape(spark, idf = df, list_of_cols= ['age','education-num','capital-gain','logfnl'])
odf.toPandas()

Unnamed: 0,attribute,skewness,kurtosis
0,logfnl,-0.854,0.8365
1,age,0.5128,-0.3418
2,capital-gain,11.9516,154.7243
3,education-num,-0.3116,0.6236
