# ANOVOS - Association Evaluator
The following notebook lists the functions of the "association evaluation" module provided in the ANOVOS package and shows how each of them can be invoked.
- [Correlation Matrix](#Correlation-Matrix)
- [Variable Clustering](#Variable-Clustering)
- [Information Value (IV)](#Information-Value-(IV))
- [Information Gain (IG)](#Information-Gain-(IG))

**Setting Spark Session**

In [2]:
# The wildcard import presumably supplies the `spark` session and the `sc`
# context used in this notebook (neither is defined anywhere else here) — TODO confirm.
from anovos.shared.spark import *

# Restrict Spark logging to ERROR level.
sc.setLogLevel("ERROR")
import warnings
# Ignore Python warnings raised from this point on.
warnings.filterwarnings('ignore')

**Input/Output Path**

In [3]:
# Location of the CSV input read by the examples below.
inputPath = "../data/income_dataset/csv"
# Base folder under which intermediate statistics are written (see Example 7
# of Variable Clustering).
outputPath = "../output/income_dataset/data_analyzer"

In [4]:
from anovos.data_ingest.data_ingest import read_dataset

In [5]:
# Read the CSV input; the header, delimiter and inferSchema options are
# handed to the reader through file_configs.
df = read_dataset(
    spark,
    file_path=inputPath,
    file_type="csv",
    file_configs={"header": "True", "delimiter": ",", "inferSchema": "True"},
)
# Drop the dt_1 and dt_2 columns before running the examples.
df = df.drop("dt_1", "dt_2")
# Preview five rows. Limiting inside Spark first means only those five rows
# are collected to the driver, instead of converting the whole dataset to
# pandas and discarding everything but the head.
df.limit(5).toPandas()

                                                                                

Unnamed: 0,ifa,age,workclass,fnlwgt,logfnl,empty,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,1a,,State-gov,77516.0,4.889391,,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,UnitedStates,<=50K
1,2a,,Self-emp-not-inc,83311.0,4.920702,,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,UnitedStates,<=50K
2,3a,38.0,Private,215646.0,5.333741,,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,UnitedStates,<=50K
3,4a,53.0,Private,234721.0,5.370552,,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,UnitedStates,<=50K
4,5a,,Private,338409.0,5.529442,,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K


# Correlation Matrix
- API specification of function **correlation_matrix** can be found <a href="https://docs.anovos.ai/api/data_analyzer/association_evaluator.html">here</a>

In [6]:
from anovos.data_analyzer.association_evaluator import correlation_matrix

In [7]:
# Example 1 - every numerical column ('all'), minus the ones listed in drop_cols
odf = correlation_matrix(
    spark,
    idf=df,
    list_of_cols="all",
    drop_cols=["ifa"],
)
odf.toPandas()

                                                                                

Unnamed: 0,attribute,age,capital-gain,capital-loss,education-num,fnlwgt,hours-per-week,logfnl
0,age,1.0,0.124524,0.05785,0.067084,-0.07716,0.142408,-0.008094
1,capital-gain,0.124524,1.0,-0.059873,0.11933,-0.006382,0.093433,-0.000687
2,capital-loss,0.05785,-0.059873,1.0,0.074598,-0.007893,0.060063,-0.005958
3,education-num,0.067084,0.11933,0.074598,1.0,-0.03458,0.167042,-0.005273
4,fnlwgt,-0.07716,-0.006382,-0.007893,-0.03458,1.0,-0.021602,0.135897
5,hours-per-week,0.142408,0.093433,0.060063,0.167042,-0.021602,1.0,-0.001097
6,logfnl,-0.008094,-0.000687,-0.005958,-0.005273,0.135897,-0.001097,1.0


In [8]:
# Example 2 - correlation restricted to an explicit list of columns
odf = correlation_matrix(
    spark,
    idf=df,
    list_of_cols=["age", "capital-gain", "fnlwgt"],
)
odf.toPandas()

Unnamed: 0,attribute,age,capital-gain,fnlwgt
0,age,1.0,0.124524,-0.07716
1,capital-gain,0.124524,1.0,-0.006382
2,fnlwgt,-0.07716,-0.006382,1.0


# Variable Clustering
- API specification of function **variable_clustering** can be found <a href="https://docs.anovos.ai/api/data_analyzer/association_evaluator.html">here</a>
- Valid only on smaller datasets that can fit into a pandas DataFrame. The sample size can be controlled by the sample_size argument (default value: 100,000)

In [7]:
from anovos.data_analyzer.association_evaluator import variable_clustering

In [10]:
# Example 1 - only the mandatory arguments; every other argument keeps its default
odf = variable_clustering(spark, idf=df)
odf.toPandas()

                                                                                

Unnamed: 0,Cluster,Attribute,RS_Ratio
0,0,marital-status,0.4999
1,0,relationship,0.3522
2,0,sex,0.3373
3,0,hours-per-week,0.8264
4,1,fnlwgt,0.2277
5,1,logfnl,0.2271
6,2,income,0.5765
7,2,education-num,0.4163
8,2,occupation,0.5893
9,2,capital-loss,0.8975


In [11]:
# Example 2 - every column ('all'), minus the ones listed in drop_cols
odf = variable_clustering(
    spark,
    idf=df,
    list_of_cols="all",
    drop_cols=["ifa"],
)
odf.toPandas()

                                                                                

Unnamed: 0,Cluster,Attribute,RS_Ratio
0,0,marital-status,0.4999
1,0,relationship,0.3522
2,0,sex,0.3373
3,0,hours-per-week,0.8264
4,1,fnlwgt,0.2277
5,1,logfnl,0.2271
6,2,income,0.5765
7,2,education-num,0.4163
8,2,occupation,0.5893
9,2,capital-loss,0.8975


In [12]:
# Example 3 - clustering restricted to an explicit list of columns
odf = variable_clustering(
    spark,
    idf=df,
    list_of_cols=["age", "sex", "race", "workclass", "fnlwgt"],
)
odf.toPandas()

                                                                                

Unnamed: 0,Cluster,Attribute,RS_Ratio
0,0,age,0.4587
1,0,fnlwgt,0.827
2,0,workclass,0.5303
3,1,race,0.4607
4,1,sex,0.4618


In [10]:
# Example 4 - numerical columns only (a user warning is shown because, with
# no categorical column present, no encoding is required)
odf = variable_clustering(
    spark,
    idf=df,
    list_of_cols=["age", "education-num", "capital-gain"],
)
odf.toPandas()

Unnamed: 0,Cluster,Attribute,RS_Ratio
0,0,education-num,0.5839
1,0,capital-gain,0.4886
2,0,age,0.764


In [14]:
# Example 5 - categorical columns only
odf = variable_clustering(
    spark,
    idf=df,
    list_of_cols=["sex", "race", "workclass"],
)
odf.toPandas()

                                                                                

Unnamed: 0,Cluster,Attribute,RS_Ratio
0,0,sex,0.4605
1,0,race,0.4606
2,1,workclass,0.0


In [15]:
# Example 6 - all columns, with sample_size set to 10000
odf = variable_clustering(
    spark,
    idf=df,
    list_of_cols="all",
    sample_size=10000,
)
odf.toPandas()

                                                                                

Unnamed: 0,Cluster,Attribute,RS_Ratio
0,0,sex,0.3389
1,0,relationship,0.344
2,0,marital-status,0.5157
3,0,hours-per-week,0.8274
4,1,fnlwgt,0.225
5,1,logfnl,0.2245
6,2,income,0.5642
7,2,education-num,0.4338
8,2,occupation,0.6329
9,2,capital-loss,0.8807


In [16]:
# Example 7 - selected columns, reusing statistics saved to disk beforehand
from anovos.data_analyzer.stats_generator import measures_of_cardinality, measures_of_centralTendency
from anovos.data_ingest.data_ingest import write_dataset

# Write the cardinality and central-tendency statistics as parquet under
# outputPath, overwriting any earlier run.
# NOTE(review): the values returned by write_dataset are kept in `unique` and
# `mode`, but no visible cell reads them afterwards.
unique = write_dataset(
    measures_of_cardinality(spark, df),
    outputPath + "/unique",
    "parquet",
    file_configs={"mode": "overwrite"},
)
mode = write_dataset(
    measures_of_centralTendency(spark, df),
    outputPath + "/mode",
    "parquet",
    file_configs={"mode": "overwrite"},
)

# Point variable_clustering at the saved statistics through stats_unique and
# stats_mode.
odf = variable_clustering(
    spark,
    idf=df,
    list_of_cols=["age", "sex", "race", "workclass", "fnlwgt"],
    stats_unique={"file_path": outputPath + "/unique", "file_type": "parquet"},
    stats_mode={"file_path": outputPath + "/mode", "file_type": "parquet"},
)
odf.toPandas()

                                                                                

Unnamed: 0,Cluster,Attribute,RS_Ratio
0,0,age,0.4587
1,0,fnlwgt,0.827
2,0,workclass,0.5303
3,1,race,0.4607
4,1,sex,0.4618


# Information Value (IV)
- API specification of function **IV_calculation** can be found <a href="https://docs.anovos.ai/api/data_analyzer/association_evaluator.html">here</a>
- Supports only binary target variable

In [17]:
from anovos.data_analyzer.association_evaluator import IV_calculation

In [18]:
# Example 1 - only the mandatory arguments; every other argument keeps its default
odf = IV_calculation(
    spark,
    idf=df,
    label_col="income",
    event_label=">50K",
)
odf.toPandas()

                                                                                

Unnamed: 0,attribute,iv
0,relationship,1.5348
1,marital-status,1.339
2,age,1.0793
3,occupation,0.7772
4,education,0.7345
5,education-num,0.6984
6,hours-per-week,0.4499
7,capital-gain,0.3138
8,sex,0.3037
9,workclass,0.1625


In [19]:
# Example 2 - every column ('all'), minus the ones listed in drop_cols
odf = IV_calculation(
    spark,
    idf=df,
    list_of_cols="all",
    drop_cols=["ifa"],
    label_col="income",
    event_label=">50K",
)
odf.toPandas()

Unnamed: 0,attribute,iv
0,relationship,1.5348
1,marital-status,1.339
2,age,1.0793
3,occupation,0.7772
4,education,0.7345
5,education-num,0.6984
6,hours-per-week,0.4499
7,capital-gain,0.3138
8,sex,0.3037
9,workclass,0.1625


In [20]:
# Example 3 - IV restricted to an explicit list of columns
odf = IV_calculation(
    spark,
    idf=df,
    list_of_cols=["age", "sex", "race", "workclass", "fnlwgt"],
    label_col="income",
    event_label=">50K",
)
odf.toPandas()

Unnamed: 0,attribute,iv
0,age,1.0793
1,sex,0.3037
2,workclass,0.1625
3,race,0.0697
4,fnlwgt,0.0088


In [21]:
# Example 4 - selected columns with encoding configs: bin_method is
# equal_range rather than the default equal_frequency
odf = IV_calculation(
    spark,
    idf=df,
    list_of_cols=["age", "sex", "race", "workclass", "fnlwgt"],
    label_col="income",
    event_label=">50K",
    encoding_configs={
        "bin_method": "equal_range",
        "bin_size": 10,
        "monotonicity_check": 0,
    },
)
odf.toPandas()

                                                                                

Unnamed: 0,attribute,iv
0,age,1.0436
1,sex,0.3037
2,workclass,0.1625
3,race,0.0697
4,fnlwgt,0.0016


In [22]:
# Example 5 - selected columns with encoding configs: bin_size is 20 rather
# than the default 10
odf = IV_calculation(
    spark,
    idf=df,
    list_of_cols=["age", "sex", "race", "workclass", "fnlwgt"],
    label_col="income",
    event_label=">50K",
    encoding_configs={
        "bin_method": "equal_frequency",
        "bin_size": 20,
        "monotonicity_check": 0,
    },
)
odf.toPandas()

Unnamed: 0,attribute,iv
0,age,1.2328
1,sex,0.3037
2,workclass,0.1625
3,race,0.0697
4,fnlwgt,0.0162


In [23]:
# Example 6 - selected columns with encoding configs: monotonicity_check
# switched on (1)
odf = IV_calculation(
    spark,
    idf=df,
    list_of_cols=["age", "sex", "race", "workclass", "fnlwgt"],
    label_col="income",
    event_label=">50K",
    encoding_configs={
        "bin_method": "equal_frequency",
        "bin_size": 10,
        "monotonicity_check": 1,
    },
)
odf.toPandas()

Unnamed: 0,attribute,iv
0,age,0.5814
1,sex,0.3037
2,workclass,0.1625
3,race,0.0697
4,fnlwgt,0.0088


# Information Gain (IG)
- API specification of function **IG_calculation** can be found <a href="https://docs.anovos.ai/api/data_analyzer/association_evaluator.html">here</a>
- Supports only binary target variable

In [24]:
from anovos.data_analyzer.association_evaluator import IG_calculation

In [25]:
# Example 1 - only the mandatory arguments; every other argument keeps its default
odf = IG_calculation(
    spark,
    idf=df,
    label_col="income",
    event_label=">50K",
)
odf.toPandas()

Unnamed: 0,attribute,ig
0,relationship,0.1654
1,marital-status,0.1538
2,age,0.0939
3,occupation,0.0932
4,education-num,0.0883
5,education,0.0871
6,hours-per-week,0.0565
7,capital-gain,0.0429
8,sex,0.0372
9,workclass,0.0217


In [26]:
# Example 2 - every column ('all'), minus the ones listed in drop_cols
odf = IG_calculation(
    spark,
    idf=df,
    list_of_cols="all",
    drop_cols=["ifa"],
    label_col="income",
    event_label=">50K",
)
odf.toPandas()

Unnamed: 0,attribute,ig
0,relationship,0.1654
1,marital-status,0.1538
2,age,0.0939
3,occupation,0.0932
4,education-num,0.0883
5,education,0.0871
6,hours-per-week,0.0565
7,capital-gain,0.0429
8,sex,0.0372
9,workclass,0.0217


In [27]:
# Example 3 - IG restricted to an explicit list of columns
odf = IG_calculation(
    spark,
    idf=df,
    list_of_cols=["age", "sex", "race", "workclass", "fnlwgt"],
    label_col="income",
    event_label=">50K",
)
odf.toPandas()

Unnamed: 0,attribute,ig
0,age,0.0939
1,sex,0.0372
2,workclass,0.0217
3,race,0.0061
4,fnlwgt,0.0011


In [28]:
# Example 4 - selected columns with encoding configs: bin_method is
# equal_range rather than the default equal_frequency
odf = IG_calculation(
    spark,
    idf=df,
    list_of_cols=["age", "sex", "race", "workclass", "fnlwgt"],
    label_col="income",
    event_label=">50K",
    encoding_configs={
        "bin_method": "equal_range",
        "bin_size": 10,
        "monotonicity_check": 0,
    },
)
odf.toPandas()

Unnamed: 0,attribute,ig
0,age,0.0917
1,sex,0.0372
2,workclass,0.0217
3,race,0.0061
4,fnlwgt,0.0002


In [29]:
# Example 5 - selected columns with encoding configs: bin_size is 20 rather
# than the default 10
odf = IG_calculation(
    spark,
    idf=df,
    list_of_cols=["age", "sex", "race", "workclass", "fnlwgt"],
    label_col="income",
    event_label=">50K",
    encoding_configs={
        "bin_method": "equal_frequency",
        "bin_size": 20,
        "monotonicity_check": 0,
    },
)
odf.toPandas()

Unnamed: 0,attribute,ig
0,age,0.097
1,sex,0.0372
2,workclass,0.0217
3,race,0.0061
4,fnlwgt,0.0021


In [30]:
# Example 6 - selected columns with encoding configs: monotonicity_check
# switched on (1)
odf = IG_calculation(
    spark,
    idf=df,
    list_of_cols=["age", "sex", "race", "workclass", "fnlwgt"],
    label_col="income",
    event_label=">50K",
    encoding_configs={
        "bin_method": "equal_frequency",
        "bin_size": 10,
        "monotonicity_check": 1,
    },
)
odf.toPandas()

                                                                                

Unnamed: 0,attribute,ig
0,age,0.0682
1,sex,0.0372
2,workclass,0.0217
3,race,0.0061
4,fnlwgt,0.0011
