# Data Exploration

In [None]:
%load_ext autoreload
%autoreload 2

from assaiku.data import DataConfig, DataPipe
from assaiku.data.validation import load_and_validate
from assaiku.data.exploration import (
    visualize_categorical_dist, 
    visualize_continuous_dist, 
    visualize_correlation,
    analyze_nans, 
    analyze_label_dist,
    visualize_distance
)
from assaiku.data.processing import remove_group_duplicates
import pandas as pd
# import logging

# logging.basicConfig(format='%(asctime)s:%(levelname)-8s:%(name)s:%(message)s', 
#                     level=logging.INFO,
#                     )

# Show up to 50 columns when a DataFrame is displayed in the notebook.
pd.options.display.max_columns = 50

# Configuration object shared by every cell below.
# NOTE(review): `perform_exploration=True` presumably switches on the
# exploration-specific behaviour of the package — confirm in DataConfig.
data_config = DataConfig(perform_exploration=True)

## Loading and validating data

In [None]:
# Load the train and test splits described by `data_config`.
# NOTE(review): the validation rules applied on load live in
# assaiku.data.validation and are not visible from this notebook.
train_df, test_df = load_and_validate(data_config=data_config)

## Start of the analysis

### Duplicates and NaN

In [None]:
# Inspect missing values in the training split as loaded.
analyze_nans(train_df)

In [None]:
# De-duplicate both splits with identical settings so they stay comparable.
# NOTE(review): `remove_age=True` presumably affects how the age column is
# treated during de-duplication — confirm in remove_group_duplicates.
clean_train_df = remove_group_duplicates(
    train_df,
    weight_col=data_config.weight_col,
    remove_age=True,
)
clean_test_df = remove_group_duplicates(
    test_df,
    weight_col=data_config.weight_col,
    remove_age=True,
)

In [None]:
# Let's look at the distribution of labels
print("Train")
analyze_label_dist(data=clean_train_df, data_config=data_config)
print("Test")
analyze_label_dist(data=clean_test_df, data_config=data_config)

## Distribution depending on income

### Numerical values

We will analyze how the distributions differ between the two income groups (namely -50000 and +50000). First, we visualize the distribution of each continuous feature for the two income groups to get a qualitative feeling for which features may have an impact on income. Then we run a more quantitative study, computing the correlation coefficient between each continuous variable and the income label binarized to 0 and 1.

In [None]:
# Quantitative view: correlation computed on the de-duplicated training split.
visualize_correlation(data=clean_train_df, data_config=data_config)

In [None]:
# Qualitative view: distributions of the continuous features on the
# de-duplicated training split.
# `filter_cols=None` is passed explicitly (presumably meaning "no column
# filter" — confirm in visualize_continuous_dist). To restrict the plots,
# pass a list instead, e.g. filter_cols=["age", "wage_per_hour"].
visualize_continuous_dist(
    data=clean_train_df,
    data_config=data_config,
    folder_path="results/exploration/continuous",
    filter_cols=None,
)

### Categorical values

In [None]:
# NOTE(review): presumably visualizes a distance measure for the categorical
# features between the two income groups — confirm in visualize_distance.
visualize_distance(data=clean_train_df, data_config=data_config)

In [None]:
# Plot the categorical distributions for a hand-picked subset of columns,
# using the de-duplicated training split.
visualize_categorical_dist(
    data=clean_train_df,
    data_config=data_config,
    folder_path="results/exploration/categorical",
    filter_cols=[
        "detailed_industry_recode",
        "family_members_under_18",
        "sex",
        "veterans_benefits",
        "reason_for_unemployment",
    ],
)

In [None]:
# Build the data pipeline from the same configuration used above and run it.
# NOTE(review): what `run()` produces or returns is defined in assaiku.data
# and is not visible from this notebook.
data_pipeline = DataPipe(data_config=data_config)
data_pipeline.run()