# ANOVOS - Association Evaluator
The following notebook shows the list of functions related to the "association evaluator" module provided under the ANOVOS package and how they can be invoked.
- [Correlation Matrix Numerical](#Correlation-Matrix-Numerical)
- [Variable Clustering](#Variable-Clustering)
- [Information Value (IV)](#Information-Value-(IV))
- [Information Gain (IG)](#Information-Gain-(IG))

**Setting Spark Session**

In [1]:
# Execution environment for this notebook. The Spark session setup further
# down branches on this value (the "ak8s" case builds its own session).
run_type = "local" # accepted values: "local", "emr", "databricks", "ak8s"

In [4]:
#For run_type Azure Kubernetes, run the following block 
import os
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

if run_type != "ak8s":
    # Every run type other than Azure Kubernetes takes its Spark objects from
    # anovos.shared.spark (presumably `spark` and `sc`, which later cells use).
    from anovos.shared.spark import *
    auth_key = "NA"
else:
    # Placeholders: fill in each value before running on Azure Kubernetes.
    fs_path = "<insert conf spark.hadoop.fs master url here> ex: spark.hadoop.fs.azure.sas.<container>.<account_name>.blob.core.windows.net"
    auth_key = "<insert value of sas_token here>"
    master_url = "<insert kubernetes master url path here> ex: k8s://"
    docker_image = "<insert name docker image here>"
    kubernetes_namespace = "<insert kubernetes namespace here>"

    # Spark configuration for the Kubernetes-based cluster manager, built as
    # a single chain of setter calls.
    sparkConf = (
        SparkConf()
        .setMaster(master_url)
        .setAppName("Anovos_pipeline")
        .set("spark.submit.deployMode", "client")
        .set("spark.kubernetes.container.image", docker_image)
        .set("spark.kubernetes.namespace", kubernetes_namespace)
        .set("spark.executor.instances", "4")
        .set("spark.executor.cores", "4")
        .set("spark.executor.memory", "16g")
        .set("spark.kubernetes.pyspark.pythonVersion", "3")
        .set("spark.kubernetes.authenticate.driver.serviceAccountName", "spark")
        # Storage credential: the config key is fs_path, the value is the token.
        .set(fs_path, auth_key)
        .set("spark.kubernetes.authenticate.serviceAccountName", "spark")
        .set(
            "spark.jars.packages",
            "org.apache.hadoop:hadoop-azure:3.2.0,com.microsoft.azure:azure-storage:8.6.3,io.github.histogrammar:histogrammar_2.12:1.0.20,io.github.histogrammar:histogrammar-sparksql_2.12:1.0.20,org.apache.spark:spark-avro_2.12:3.2.1",
        )
    )

    # Creating the session is what actually brings up the worker nodes.
    spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()
    sc = spark.sparkContext

In [5]:
import warnings

# Keep the notebook output readable: set the Spark log level to ERROR and
# silence Python warnings.
sc.setLogLevel("ERROR")
warnings.filterwarnings("ignore")

**Input/Output Path**

In [6]:
# Path the dataset is read from (passed to read_dataset as file_path).
inputPath = "../data/income_dataset/csv"
# Base path for files written by this notebook (pre-saved stats go below it).
outputPath = "../output/income_dataset/data_analyzer"

In [7]:
from anovos.data_ingest.data_ingest import read_dataset

In [8]:
# Read the income dataset from CSV: first row is the header, fields are
# comma-delimited, and column types are inferred.
df = read_dataset(
    spark,
    file_path=inputPath,
    file_type="csv",
    file_configs={"header": "True", "delimiter": ",", "inferSchema": "True"},
)
# Exclude the dt_1 and dt_2 columns from everything that follows.
df = df.drop("dt_1", "dt_2")
# Preview five rows. Limiting inside Spark before converting avoids collecting
# the entire dataset to the driver only to display its head.
df.limit(5).toPandas()

                                                                                

Unnamed: 0,ifa,age,workclass,fnlwgt,logfnl,empty,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,1a,,State-gov,77516.0,4.889391,,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,UnitedStates,<=50K
1,2a,,Self-emp-not-inc,83311.0,4.920702,,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,UnitedStates,<=50K
2,3a,38.0,Private,215646.0,5.333741,,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,UnitedStates,<=50K
3,4a,53.0,Private,234721.0,5.370552,,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,UnitedStates,<=50K
4,5a,,Private,338409.0,5.529442,,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K


# Correlation Matrix Numerical
- API specification of function **correlation_matrix** can be found <a href="https://docs.anovos.ai/api/data_analyzer/association_evaluator.html">here</a>
- Supports only numerical columns

In [9]:
from anovos.data_analyzer.association_evaluator import correlation_matrix

In [10]:
# Example 1 - list_of_cols="all" with "ifa" excluded through drop_cols.
# Only the numerical columns take part in the computation.
odf = correlation_matrix(
    spark,
    idf=df,
    list_of_cols="all",
    drop_cols=["ifa"],
)
odf.toPandas()

                                                                                

Unnamed: 0,attribute,age,capital-gain,capital-loss,education-num,fnlwgt,hours-per-week,logfnl
0,age,1.0,0.059398,0.05802,0.025746,-0.071992,0.058243,-0.055896
1,capital-gain,0.059398,1.0,-0.031916,0.121075,-6.6e-05,0.078393,0.005187
2,capital-loss,0.05802,-0.031916,1.0,0.082367,-0.012684,0.060584,-0.005488
3,education-num,0.025746,0.121075,0.082367,1.0,-0.047031,0.140318,-0.034246
4,fnlwgt,-0.071992,-6.6e-05,-0.012684,-0.047031,1.0,-0.017192,0.90571
5,hours-per-week,0.058243,0.078393,0.060584,0.140318,-0.017192,1.0,-0.02246
6,logfnl,-0.055896,0.005187,-0.005488,-0.034246,0.90571,-0.02246,1.0


In [11]:
# Example 2 - correlation restricted to an explicit list of numerical columns
odf = correlation_matrix(
    spark,
    idf=df,
    list_of_cols=["age", "logfnl", "education-num", "fnlwgt"],
)
odf.toPandas()

Unnamed: 0,attribute,age,education-num,fnlwgt,logfnl
0,age,1.0,0.0258,-0.072672,-0.056534
1,education-num,0.0258,1.0,-0.046924,-0.034896
2,fnlwgt,-0.072672,-0.046924,1.0,0.905643
3,logfnl,-0.056534,-0.034896,0.905643,1.0


In [12]:
# Example 3 - list_of_cols="all" including categorical columns ("ifa" dropped).
# Categorical columns must be turned into numerical ones first, using either
# cat_to_num_unsupervised or cat_to_num_supervised.
# cat_to_num_unsupervised accepts method_type "label_encoding" or "onehot_encoding".

from anovos.data_transformer.transformers import cat_to_num_unsupervised

# Encode the categorical columns, then correlate over the encoded frame.
idf_all_num = cat_to_num_unsupervised(spark, df, method_type="onehot_encoding")

odf = correlation_matrix(
    spark,
    idf=idf_all_num,
    list_of_cols="all",
    drop_cols=["ifa"],
)
odf.toPandas()

                                                                                

Unnamed: 0,attribute,age,capital-gain,capital-loss,education-num,education_0,education_1,education_10,education_11,education_12,...,workclass_10,workclass_11,workclass_2,workclass_3,workclass_4,workclass_5,workclass_6,workclass_7,workclass_8,workclass_9
0,age,1.000000,0.059398,0.058020,0.025746,0.031963,-0.119270,-0.064797,0.063016,0.027606,...,,,0.068201,0.030426,0.014875,0.098717,0.048277,0.003465,-0.017097,0.007184
1,capital-gain,0.059398,1.000000,-0.031916,0.121075,-0.040080,-0.035414,-0.012972,0.052292,-0.011173,...,,,-0.008508,-0.019083,-0.011074,0.105693,-0.010083,-0.001851,-0.001851,-0.001309
2,capital-loss,0.058020,-0.031916,1.000000,0.082367,-0.038491,-0.021324,-0.020288,0.033402,0.007298,...,,,0.028010,-0.025092,-0.004368,0.032944,0.016328,-0.002854,-0.002854,-0.002018
3,education-num,0.025746,0.121075,0.082367,1.000000,-0.292463,-0.014933,-0.086739,0.256739,-0.272220,...,,,0.094153,-0.076070,0.113206,0.069006,0.060320,-0.018041,-0.025623,-0.018117
4,education_0,0.031963,-0.040080,-0.038491,-0.292463,1.000000,-0.380180,-0.074256,-0.076703,-0.068223,...,,,-0.042959,-0.022358,-0.057158,-0.025512,-0.013095,0.004765,-0.008955,-0.006332
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127,workclass_5,0.098717,0.105693,0.032944,0.069006,-0.025512,-0.004361,-0.015901,0.029050,-0.004560,...,,,-0.049514,-0.047183,-0.039528,1.000000,-0.032254,-0.002435,-0.002435,-0.001722
128,workclass_6,0.048277,-0.010083,0.016328,0.060320,-0.013095,0.006702,-0.008705,0.017939,-0.016702,...,,,-0.044575,-0.042476,-0.035584,-0.032254,1.000000,-0.002192,-0.002192,-0.001550
129,workclass_7,0.003465,-0.001851,-0.002854,-0.018041,0.004765,-0.007027,-0.001373,-0.001418,-0.001261,...,,,-0.003365,-0.003207,-0.002687,-0.002435,-0.002192,1.000000,-0.000166,-0.000117
130,workclass_8,-0.017097,-0.001851,-0.002854,-0.025623,-0.008955,-0.007027,-0.001373,-0.001418,-0.001261,...,,,-0.003365,-0.003207,-0.002687,-0.002435,-0.002192,-0.000166,1.000000,-0.000117


# Variable Clustering
- API specification of function **variable_clustering** can be found <a href="https://docs.anovos.ai/api/data_analyzer/association_evaluator.html">here</a>
- Valid only on smaller datasets that can fit into a pandas dataframe. The sample size can be controlled by the sample_size argument (default value: 100,000)

In [10]:
from anovos.data_analyzer.association_evaluator import variable_clustering

In [11]:
# Example 1 - only the mandatory arguments; all others keep their defaults
odf = variable_clustering(
    spark,
    df,
)
odf.toPandas()

                                                                                

Unnamed: 0,Cluster,Attribute,RS_Ratio
0,0,relationship,0.3526
1,0,hours-per-week,0.8264
2,0,marital-status,0.4999
3,0,sex,0.3369
4,1,fnlwgt,0.2277
5,1,logfnl,0.2273
6,2,income,0.5764
7,2,occupation,0.5893
8,2,education-num,0.4163
9,2,capital-loss,0.8975


In [12]:
# Example 2 - list_of_cols="all" with "ifa" excluded through drop_cols
odf = variable_clustering(
    spark,
    idf=df,
    list_of_cols="all",
    drop_cols=["ifa"],
)
odf.toPandas()

Unnamed: 0,Cluster,Attribute,RS_Ratio
0,0,relationship,0.3526
1,0,hours-per-week,0.8264
2,0,marital-status,0.4999
3,0,sex,0.3369
4,1,fnlwgt,0.2277
5,1,logfnl,0.2271
6,2,income,0.5764
7,2,education-num,0.4163
8,2,occupation,0.5893
9,2,capital-loss,0.8975


In [13]:
# Example 3 - clustering restricted to an explicit list of columns
odf = variable_clustering(
    spark,
    idf=df,
    list_of_cols=["age", "sex", "race", "workclass", "fnlwgt"],
)
odf.toPandas()

Unnamed: 0,Cluster,Attribute,RS_Ratio
0,0,age,0.4587
1,0,fnlwgt,0.827
2,0,workclass,0.5303
3,1,race,0.4607
4,1,sex,0.4618


In [14]:
# Example 4 - numerical columns only. A user warning is shown because no
# categorical column is present, so encoding was not required.
odf = variable_clustering(
    spark,
    idf=df,
    list_of_cols=["age", "education-num", "capital-gain"],
)
odf.toPandas()

Unnamed: 0,Cluster,Attribute,RS_Ratio
0,0,education-num,0.5839
1,0,capital-gain,0.4886
2,0,age,0.764


In [15]:
# Example 5 - categorical columns only
odf = variable_clustering(
    spark,
    idf=df,
    list_of_cols=["sex", "race", "workclass"],
)
odf.toPandas()

Unnamed: 0,Cluster,Attribute,RS_Ratio
0,0,race,0.4606
1,0,sex,0.4605
2,1,workclass,0.0


In [16]:
# Example 6 - all columns, with the sample size set to 10000
odf = variable_clustering(
    spark,
    idf=df,
    list_of_cols="all",
    sample_size=10000,
)
odf.toPandas()

Unnamed: 0,Cluster,Attribute,RS_Ratio
0,0,relationship,0.3586
1,0,hours-per-week,0.8462
2,0,marital-status,0.4953
3,0,sex,0.3328
4,1,fnlwgt,0.2224
5,1,logfnl,0.2217
6,2,income,0.5328
7,2,occupation,0.6609
8,2,education-num,0.4581
9,2,capital-loss,0.8752


In [17]:
# Example 7 - selected columns, reusing statistics saved ahead of time
from anovos.data_analyzer.stats_generator import measures_of_cardinality, measures_of_centralTendency
from anovos.data_ingest.data_ingest import write_dataset

# Write the cardinality and central-tendency statistics as parquet; the same
# paths are then handed to variable_clustering via stats_unique / stats_mode.
unique = write_dataset(
    measures_of_cardinality(spark, df),
    outputPath + "/unique",
    "parquet",
    file_configs={"mode": "overwrite"},
)
mode = write_dataset(
    measures_of_centralTendency(spark, df),
    outputPath + "/mode",
    "parquet",
    file_configs={"mode": "overwrite"},
)

odf = variable_clustering(
    spark,
    idf=df,
    list_of_cols=["age", "sex", "race", "workclass", "fnlwgt"],
    stats_unique={"file_path": outputPath + "/unique", "file_type": "parquet"},
    stats_mode={"file_path": outputPath + "/mode", "file_type": "parquet"},
)
odf.toPandas()

                                                                                

Unnamed: 0,Cluster,Attribute,RS_Ratio
0,0,age,0.4587
1,0,fnlwgt,0.827
2,0,workclass,0.5303
3,1,race,0.4607
4,1,sex,0.4618


# Information Value (IV)
- API specification of function **IV_calculation** can be found <a href="https://docs.anovos.ai/api/data_analyzer/association_evaluator.html">here</a>
- Supports only binary target variable

In [18]:
from anovos.data_analyzer.association_evaluator import IV_calculation

In [19]:
# Example 1 - only the mandatory arguments; all others keep their defaults
odf = IV_calculation(
    spark,
    df,
    label_col="income",
    event_label=">50K",
)
odf.toPandas()

                                                                                

Unnamed: 0,attribute,iv
0,logfnl,0.004765
1,ifa,2.197225
2,empty,0.0
3,education,0.741737
4,education-num,0.698399
5,native-country,0.079474
6,age,1.070382
7,capital-loss,0.085989
8,relationship,1.535187
9,sex,0.303669


In [20]:
# Example 2 - list_of_cols="all" with "ifa" excluded through drop_cols
odf = IV_calculation(
    spark,
    idf=df,
    list_of_cols="all",
    drop_cols=["ifa"],
    label_col="income",
    event_label=">50K",
)
odf.toPandas()

                                                                                

Unnamed: 0,attribute,iv
0,logfnl,0.004765
1,empty,0.0
2,education,0.741737
3,education-num,0.698399
4,native-country,0.079474
5,age,1.070382
6,capital-loss,0.085989
7,relationship,1.535187
8,sex,0.303669
9,race,0.07038


In [21]:
# Example 3 - IV restricted to an explicit list of columns
odf = IV_calculation(
    spark,
    idf=df,
    list_of_cols=["age", "sex", "race", "workclass", "fnlwgt"],
    label_col="income",
    event_label=">50K",
)
odf.toPandas()

                                                                                

Unnamed: 0,attribute,iv
0,age,1.070382
1,sex,0.303669
2,race,0.07038
3,workclass,0.164161
4,fnlwgt,0.008708


In [22]:
# Example 4 - selected columns with encoding configs:
# bin_method "equal_range" replaces the default "equal_frequency"
odf = IV_calculation(
    spark,
    idf=df,
    list_of_cols=["age", "sex", "race", "workclass", "fnlwgt"],
    label_col="income",
    event_label=">50K",
    encoding_configs={
        "bin_method": "equal_range",
        "bin_size": 10,
        "monotonicity_check": 0,
    },
)
odf.toPandas()

Unnamed: 0,attribute,iv
0,age,1.043629
1,sex,0.303669
2,race,0.07038
3,workclass,0.164161
4,fnlwgt,0.001681


In [23]:
# Example 5 - selected columns with encoding configs:
# bin_size 20 replaces the default of 10
odf = IV_calculation(
    spark,
    idf=df,
    list_of_cols=["age", "sex", "race", "workclass", "fnlwgt"],
    label_col="income",
    event_label=">50K",
    encoding_configs={
        "bin_method": "equal_frequency",
        "bin_size": 20,
        "monotonicity_check": 0,
    },
)
odf.toPandas()

                                                                                

Unnamed: 0,attribute,iv
0,age,1.15924
1,sex,0.303669
2,race,0.07038
3,workclass,0.164161
4,fnlwgt,0.015972


In [24]:
# Example 6 - selected columns with encoding configs:
# monotonicity check switched on (monotonicity_check = 1)
odf = IV_calculation(
    spark,
    idf=df,
    list_of_cols=["age", "sex", "race", "workclass", "fnlwgt"],
    label_col="income",
    event_label=">50K",
    encoding_configs={
        "bin_method": "equal_frequency",
        "bin_size": 10,
        "monotonicity_check": 1,
    },
)
odf.toPandas()

Unnamed: 0,attribute,iv
0,age,0.584167
1,sex,0.303669
2,race,0.07038
3,workclass,0.164161
4,fnlwgt,0.008708


# Information Gain (IG)
- API specification of function **IG_calculation** can be found <a href="https://docs.anovos.ai/api/data_analyzer/association_evaluator.html">here</a>
- Supports only binary target variable

In [25]:
from anovos.data_analyzer.association_evaluator import IG_calculation

In [26]:
# Example 1 - only the mandatory arguments; all others keep their defaults
odf = IG_calculation(
    spark,
    df,
    label_col="income",
    event_label=">50K",
)
odf.toPandas()

                                                                                

Unnamed: 0,attribute,ig
0,logfnl,0.000623
1,ifa,
2,empty,0.0
3,education,0.093215
4,education-num,0.088261
5,native-country,0.008757
6,age,0.09352
7,capital-loss,0.011885
8,relationship,0.165397
9,sex,0.037219


In [27]:
# Example 2 - list_of_cols="all" with "ifa" excluded through drop_cols
odf = IG_calculation(
    spark,
    idf=df,
    list_of_cols="all",
    drop_cols=["ifa"],
    label_col="income",
    event_label=">50K",
)
odf.toPandas()

                                                                                

Unnamed: 0,attribute,ig
0,logfnl,0.000623
1,empty,0.0
2,education,0.093215
3,education-num,0.088261
4,native-country,0.008757
5,age,0.09352
6,capital-loss,0.011885
7,relationship,0.165397
8,sex,0.037219
9,race,0.008574


In [28]:
# Example 3 - IG restricted to an explicit list of columns
odf = IG_calculation(
    spark,
    idf=df,
    list_of_cols=["age", "sex", "race", "workclass", "fnlwgt"],
    label_col="income",
    event_label=">50K",
)
odf.toPandas()

Unnamed: 0,attribute,ig
0,age,0.09352
1,sex,0.037219
2,race,0.008574
3,workclass,0.021718
4,fnlwgt,0.001143


In [29]:
# Example 4 - selected columns with encoding configs:
# bin_method "equal_range" replaces the default "equal_frequency"
odf = IG_calculation(
    spark,
    idf=df,
    list_of_cols=["age", "sex", "race", "workclass", "fnlwgt"],
    label_col="income",
    event_label=">50K",
    encoding_configs={
        "bin_method": "equal_range",
        "bin_size": 10,
        "monotonicity_check": 0,
    },
)
odf.toPandas()

Unnamed: 0,attribute,ig
0,age,0.091797
1,sex,0.037219
2,race,0.008574
3,workclass,0.021718
4,fnlwgt,0.000245


In [30]:
# Example 5 - selected columns with encoding configs:
# bin_size 20 replaces the default of 10
odf = IG_calculation(
    spark,
    idf=df,
    list_of_cols=["age", "sex", "race", "workclass", "fnlwgt"],
    label_col="income",
    event_label=">50K",
    encoding_configs={
        "bin_method": "equal_frequency",
        "bin_size": 20,
        "monotonicity_check": 0,
    },
)
odf.toPandas()

                                                                                

Unnamed: 0,attribute,ig
0,age,0.096753
1,sex,0.037219
2,race,0.008574
3,workclass,0.021718
4,fnlwgt,0.002102


In [31]:
# Example 6 - selected columns with encoding configs:
# monotonicity check switched on (monotonicity_check = 1)
odf = IG_calculation(
    spark,
    idf=df,
    list_of_cols=["age", "sex", "race", "workclass", "fnlwgt"],
    label_col="income",
    event_label=">50K",
    encoding_configs={
        "bin_method": "equal_frequency",
        "bin_size": 10,
        "monotonicity_check": 1,
    },
)
odf.toPandas()

Unnamed: 0,attribute,ig
0,age,0.06878
1,sex,0.037219
2,race,0.008574
3,workclass,0.021718
4,fnlwgt,0.001143
