# Exploratory Data Analysis

In [1]:
import sys
import pandas as pd
from pathlib import Path

current_dir = Path.cwd()
src_directory = current_dir.parent
sys.path.append(str(src_directory))

from income_predict_d100_d400 import preprocessing, cleaning

In [2]:
parquet_path = src_directory / "data" / "census_income.parquet"

df_raw = pd.read_parquet(parquet_path)
print(df_raw.head())


   age         workclass  fnlwgt  education  education-num  \
0   39         State-gov   77516  Bachelors             13   
1   50  Self-emp-not-inc   83311  Bachelors             13   
2   38           Private  215646    HS-grad              9   
3   53           Private  234721       11th              7   
4   28           Private  338409  Bachelors             13   

       marital-status         occupation   relationship   race     sex  \
0       Never-married       Adm-clerical  Not-in-family  White    Male   
1  Married-civ-spouse    Exec-managerial        Husband  White    Male   
2            Divorced  Handlers-cleaners  Not-in-family  White    Male   
3  Married-civ-spouse  Handlers-cleaners        Husband  Black    Male   
4  Married-civ-spouse     Prof-specialty           Wife  Black  Female   

   capital-gain  capital-loss  hours-per-week native-country income  
0          2174             0              40  United-States  <=50K  
1             0             0             

In [3]:
df_raw.info()
df_raw.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             48842 non-null  int64 
 1   workclass       47879 non-null  object
 2   fnlwgt          48842 non-null  int64 
 3   education       48842 non-null  object
 4   education-num   48842 non-null  int64 
 5   marital-status  48842 non-null  object
 6   occupation      47876 non-null  object
 7   relationship    48842 non-null  object
 8   race            48842 non-null  object
 9   sex             48842 non-null  object
 10  capital-gain    48842 non-null  int64 
 11  capital-loss    48842 non-null  int64 
 12  hours-per-week  48842 non-null  int64 
 13  native-country  48568 non-null  object
 14  income          48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0
mean,38.643585,189664.1,10.078089,1079.067626,87.502314,40.422382
std,13.71051,105604.0,2.570973,7452.019058,403.004552,12.391444
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117550.5,9.0,0.0,0.0,40.0
50%,37.0,178144.5,10.0,0.0,0.0,40.0
75%,48.0,237642.0,12.0,0.0,0.0,45.0
max,90.0,1490400.0,16.0,99999.0,4356.0,99.0


In [4]:
#1. Describe your data.
preprocessing.get_data_description(df_raw)

Saved plot to: /app/src/notebooks/data/plots/plot_001.png
Saved plot to: /app/src/notebooks/data/plots/plot_002.png
Saved plot to: /app/src/notebooks/data/plots/plot_003.png
Saved plot to: /app/src/notebooks/data/plots/plot_004.png
Saved plot to: /app/src/notebooks/data/plots/plot_005.png
Saved plot to: /app/src/notebooks/data/plots/plot_006.png
Saved plot to: /app/src/notebooks/data/plots/plot_007.png
Saved plot to: /app/src/notebooks/data/plots/plot_008.png
Saved plot to: /app/src/notebooks/data/plots/plot_009.png
Saved plot to: /app/src/notebooks/data/plots/plot_010.png
Saved plot to: /app/src/notebooks/data/plots/plot_011.png
Saved plot to: /app/src/notebooks/data/plots/plot_012.png
Saved plot to: /app/src/notebooks/data/plots/plot_013.png
Saved plot to: /app/src/notebooks/data/plots/plot_014.png
Saved plot to: /app/src/notebooks/data/plots/plot_015.png


{'dtypes': age                int64
 workclass         object
 fnlwgt             int64
 education         object
 education-num      int64
 marital-status    object
 occupation        object
 relationship      object
 race              object
 sex               object
 capital-gain       int64
 capital-loss       int64
 hours-per-week     int64
 native-country    object
 income            object
 dtype: object,
 'description':                  age workclass        fnlwgt education  education-num  \
 count   48842.000000     47879  4.884200e+04     48842   48842.000000   
 unique           NaN         9           NaN        16            NaN   
 top              NaN   Private           NaN   HS-grad            NaN   
 freq             NaN     33906           NaN     15784            NaN   
 mean       38.643585       NaN  1.896641e+05       NaN      10.078089   
 std        13.710510       NaN  1.056040e+05       NaN       2.570973   
 min        17.000000       NaN  1.228500e+04      

In [5]:
#2. What is the distribution of the target variable?
preprocessing.get_target_distribution(df_raw, 'income')


Saved plot to: /app/src/notebooks/data/plots/plot_016.png


Unnamed: 0_level_0,Count,Percent
income,Unnamed: 1_level_1,Unnamed: 2_level_1
<=50K,24720,50.612178
<=50K.,12435,25.459645
>50K,7841,16.053806
>50K.,3846,7.87437


In [6]:
#3. Do we face outliers and missing values?
preprocessing.get_outliers_summary(df_raw)

Saved plot to: /app/src/notebooks/data/plots/plot_017.png


Unnamed: 0,Column,Outlier Count,Percent,Lower Bound,Upper Bound,Missing Values
5,hours-per-week,13496.0,27.631956,32.5,52.5,0
3,capital-gain,4035.0,8.261332,0.0,0.0,0
4,capital-loss,2282.0,4.672208,0.0,0.0,0
2,education-num,1794.0,3.673068,4.5,16.5,0
1,fnlwgt,1453.0,2.974899,-62586.75,417779.25,0
0,age,216.0,0.442242,-2.0,78.0,0
6,workclass,,,,,963
7,occupation,,,,,966
8,native-country,,,,,274


In [7]:
# Cleaning pipeline
df_clean = cleaning.full_clean(df_raw)
df_clean.columns.tolist()

['unique_id',
 'age',
 'work_class',
 'education',
 'marital_status',
 'occupation',
 'relationship',
 'hours_per_week',
 'native_country',
 'high_income',
 'capital_net',
 'is_white',
 'is_black',
 'is_female',
 'age_x_education']

In [8]:
#4. How do specific features correlate with the target variable?
preprocessing.get_feature_correlations(df_clean, 'high_income')

Saved plot to: /app/src/notebooks/data/plots/plot_018.png
Saved plot to: /app/src/notebooks/data/plots/plot_019.png
Saved plot to: /app/src/notebooks/data/plots/plot_020.png
Saved plot to: /app/src/notebooks/data/plots/plot_021.png


{'numeric': age_x_education    0.389900
 education          0.332613
 age                0.230369
 hours_per_week     0.227687
 capital_net        0.214356
 is_white           0.083710
 unique_id         -0.001475
 is_black          -0.090448
 is_female         -0.214628
 Name: high_income, dtype: float64,
 'categorical': relationship      0.454381
 marital_status    0.448302
 occupation        0.345725
 is_female         0.214577
 work_class        0.164041
 native_country    0.096966
 is_black          0.090367
 is_white          0.083642
 dtype: float64}