## UCI Adult Income Dataset - Exploratory and Descriptive Analysis
This notebook focuses on the exploratory and descriptive analysis of the cleaned version of the UCI Adult Income dataset.

In [5]:
import pandas as pd 
import numpy as np 
import os 
import plotly.express as px

## Define and create paths

In [12]:
# Resolve the project root as the parent of the current working directory
# (assumes the notebook is run from a folder one level below the root).
current_dir = os.getcwd()
project_root_dir = os.path.dirname(current_dir)

# Data folders: <root>/data/raw and <root>/data/processed.
data_dir = os.path.join(project_root_dir, "data")
raw_dir, processed_dir = (
    os.path.join(data_dir, subfolder) for subfolder in ("raw", "processed")
)

# Output folders: <root>/result for results and <root>/docs for documentation.
results_dir = os.path.join(project_root_dir, "result")
docs_dir = os.path.join(project_root_dir, "docs")

# Create each folder only if it does not already exist.
for required_dir in (raw_dir, processed_dir, results_dir, docs_dir):
    os.makedirs(required_dir, exist_ok=True)

## Read in the data

In [18]:
# Read the cleaned dataset from the processed-data folder and preview
# its first ten rows.
adult_data_filename = os.path.join(processed_dir, 'adults_cleaned.csv')
adult_df = pd.read_csv(filepath_or_buffer=adult_data_filename)
adult_df.head(n=10)

Unnamed: 0,age,workclass,fnlwgt,education_num,martial_status,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,incame,education_level,occupation_grouped,age_group
0,39,government,77516,13,single,single,white,male,2174,0,40,united-states,<=50k,tertiary,white collar,36-45
1,50,self-employed,83311,13,married,male spouse,white,male,0,0,13,united-states,<=50k,tertiary,white collar,46-60
2,38,private,215646,9,divorced or separated,single,white,male,0,0,40,united-states,<=50k,highschoolgraduate,blue collar,36-45
3,53,private,234721,7,married,male spouse,black,male,0,0,40,united-states,<=50k,secondary,blue collar,46-60
4,28,private,338409,13,married,female spouse,black,female,0,0,40,cuba,<=50k,tertiary,white collar,26-35
5,37,private,284582,14,married,female spouse,white,female,0,0,40,united-states,<=50k,tertiary,white collar,36-45
6,49,private,160187,5,divorced or separated,single,black,female,0,0,16,jamaica,<=50k,secondary,service,46-60
7,52,self-employed,209642,9,married,male spouse,white,male,0,0,45,united-states,>50k,highschoolgraduate,white collar,46-60
8,31,private,45781,14,single,single,white,female,14084,0,50,united-states,>50k,tertiary,white collar,26-35
9,42,private,159449,13,married,male spouse,white,male,5178,0,40,united-states,>50k,tertiary,white collar,36-45


### Check the shape of the dataset and datatypes

In [22]:
# Show the dimensions of the loaded DataFrame as (rows, columns).
adult_df.shape

(32514, 16)

In [31]:
# Print a per-column summary: column name, non-null count, and dtype.
adult_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32514 entries, 0 to 32513
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   age                 32514 non-null  int64 
 1   workclass           32514 non-null  object
 2   fnlwgt              32514 non-null  int64 
 3   education_num       32514 non-null  int64 
 4   martial_status      32514 non-null  object
 5   relationship        32514 non-null  object
 6   race                32514 non-null  object
 7   sex                 32514 non-null  object
 8   capital_gain        32514 non-null  int64 
 9   capital_loss        32514 non-null  int64 
 10  hours_per_week      32514 non-null  int64 
 11  native_country      32514 non-null  object
 12  incame              32514 non-null  object
 13  education_level     32514 non-null  object
 14  occupation_grouped  32514 non-null  object
 15  age_group           32514 non-null  object
dtypes: int64(6), object(10

## Summary statistics

In [26]:
# Show every row that has an exact duplicate elsewhere in the frame;
# keep=False flags all copies rather than only the later repeats.
duplicate_mask = adult_df.duplicated(keep=False)
adult_df.loc[duplicate_mask]

Unnamed: 0,age,workclass,fnlwgt,education_num,martial_status,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,incame,education_level,occupation_grouped,age_group


In [28]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
adult_df.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,32514.0,32514.0,32514.0,32514.0,32514.0,32514.0
mean,38.589684,189791.6,10.081626,1079.206619,87.43003,40.440026
std,13.639112,105578.2,2.571975,7390.514416,403.237687,12.351147
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117828.5,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237049.2,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [33]:
# Summarise the object-dtype (text) columns: count, number of unique values,
# most frequent value, and that value's frequency.
adult_df.describe(include=['object'])

Unnamed: 0,workclass,martial_status,relationship,race,sex,native_country,incame,education_level,occupation_grouped,age_group
count,32514,32514,32514,32514,32514,32514,32514,32514,32514,32514
unique,6,4,5,5,2,42,2,7,5,7
top,private,married,male spouse,white,male,united-states,<=50k,highschoolgraduate,white collar,26-35
freq,22651,14984,13178,27772,21758,29131,24678,10484,16533,8501


In [37]:
# Share of records in each workclass category, expressed as proportions.
workclass_column = adult_df['workclass']
workclass_column.value_counts(normalize=True)

workclass
private          0.696654
self-employed    0.112444
government       0.069416
local-gov        0.064372
unknown          0.056468
voluntary        0.000646
Name: proportion, dtype: float64