In [1]:
# Step 0. Load libraries and custom functions
# Matrices and dataframes
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Visualizations
import matplotlib.pyplot as plt
import seaborn as sns
# Data pre-processing
from sklearn.model_selection import train_test_split

In [2]:
# Step 1. Load data and prepare
# This data was extracted from the census bureau database found at
# http://www.census.gov/ftp/pub/DES/www/welcome.html
# Donor: Ronny Kohavi and Barry Becker,
#        Data Mining and Visualization
#        Silicon Graphics.
#
#         e-mail: ronnyk@sgi.com for questions.
# Extraction was done by Barry Becker from the 1994 Census database.  A set of
#   reasonably clean records was extracted using the following conditions:
#   ((AAGE>16) && (AGI>100) && (AFNLWGT>1)&& (HRSWK>0))
#
# Prediction task is to determine whether a person makes over 50K
# a year.
# Variables
# age: continuous.
# workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, 
# Local-gov, State-gov, Without-pay, Never-worked.
# fnlwgt: continuous.
# education: Bachelors, Some-college, 11th, HS-grad, Prof-school, 
# Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, 
# Doctorate, 5th-6th, Preschool.
# education-num: continuous.
# marital-status: Married-civ-spouse, Divorced, Never-married, 
# Separated, Widowed, Married-spouse-absent, Married-AF-spouse.
# occupation: Tech-support, Craft-repair, Other-service, Sales, 
# Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, 
# Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, 
# Protective-serv, Armed-Forces.
# relationship: Wife, Own-child, Husband, Not-in-family, 
# Other-relative, Unmarried.
# race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black.
# sex: Female, Male.
# capital-gain: continuous.
# capital-loss: continuous.
# hours-per-week: continuous.
# native-country: United-States, Cambodia, England, Puerto-Rico, 
# Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, 
# South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, 
# Jamaica, Vietnam, Mexico, Portugal, Ireland, France, 
# Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, 
# Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, 
# Trinadad&Tobago, Peru, Hong, Holand-Netherlands.
# To read the adult.names description file, use the following snippet:
# import requests
# url_headers = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names'
# f = requests.get(url_headers)
# print(f.text)

# Location of the raw data file on the UCI repository.
url_base = 'https://archive.ics.uci.edu/ml/machine-learning-databases/'
url_dir = 'adult/adult.data'
url_data = url_base + url_dir

# The file is read with header=None, so the column names are supplied here,
# in file order.
column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education_num',
    'marital_status', 'occupation', 'relationship', 'race', 'sex',
    'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
    'income',
]

# skipinitialspace=True drops blanks that follow the comma delimiter, so text
# values load as 'Private' / '?' rather than ' Private' / ' ?'. Without it,
# later exact-match recoding of the '?' missing-value marker would not match.
# NOTE(review): the UCI file is believed to be ', '-separated - confirm
# against the raw file; the flag is harmless if it is not.
df_raw = pd.read_csv(
    url_data,
    header=None,
    names=column_names,
    skipinitialspace=True,
)

In [3]:
# 1.1 Get basic info: row count, column names, non-null counts, dtypes
# and approximate memory usage of the raw frame
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [4]:
# 1.2 Get a sample
# A fixed seed keeps the displayed rows identical across Restart & Run All;
# 123 matches the seed used for the train/test split.
df_raw.sample(n=10, random_state=123)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
28866,43,State-gov,28451,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,37,United-States,>50K
21904,42,Private,280410,HS-grad,9,Married-civ-spouse,Other-service,Husband,Black,Male,0,0,40,Haiti,<=50K
258,45,Private,187666,Assoc-voc,11,Widowed,Exec-managerial,Not-in-family,White,Female,0,0,45,United-States,<=50K
24569,35,Private,150042,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,>50K
12419,24,Private,182342,Bachelors,13,Never-married,Sales,Own-child,White,Male,0,0,40,United-States,<=50K
19146,53,Local-gov,99064,9th,5,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K
9039,18,Private,163332,HS-grad,9,Never-married,Sales,Own-child,White,Female,0,0,22,United-States,<=50K
8704,70,Private,102610,Some-college,10,Divorced,Other-service,Not-in-family,White,Male,0,0,80,United-States,<=50K
13125,66,Private,142624,Assoc-acdm,12,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,5556,0,40,Yugoslavia,>50K
20136,44,Private,377018,Assoc-voc,11,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,>50K


In [5]:
# 1.3 Transform the data so values are consistent
# Label-valued text columns that are stored as pandas categoricals below.
categorical_cols = [
    'workclass', 'education', 'marital_status', 'occupation',
    'relationship', 'race', 'sex', 'native_country', 'income',
]

# Remove education_num; drop() returns a new frame, so df_raw stays untouched.
df_interim = df_raw.drop(columns=['education_num'])

# Trim surrounding whitespace so the '?' placeholder (and every label) compares
# exactly; a value such as ' ?' would otherwise slip past the replace below.
# NOTE(review): presumably the raw file pads fields after each comma - confirm.
df_interim[categorical_cols] = df_interim[categorical_cols].apply(
    lambda col: col.str.strip()
)

# Recode the '?' placeholder before casting, so 'Unknown' ends up as one of
# the categories.
df_interim = df_interim.replace(['?'], 'Unknown')

# Bug fix: 'income' was previously overwritten with a copy of 'workclass',
# which destroyed the prediction target. Each column is now cast from itself.
df_interim = df_interim.astype({col: 'category' for col in categorical_cols})

In [9]:
# 2. Exploratory Data Analysis
# 2.1 Hold out a test set up front so that EDA is carried out on the
#     training rows only
df = df_interim.copy()

# Target is kept as a one-column DataFrame; features are everything else.
y = df.loc[:, ['income']]
X = df.drop(columns=['income'])

# Stratify on the target so both partitions keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=123,
)

# Re-attach the target to the training features for exploration.
df_train = pd.concat([X_train, y_train], axis='columns')