In [2]:
import numpy as np
import pandas as pd

In [3]:
# Load the dataset from a CSV file given by a relative path (resolved against the
# kernel's working directory).
# NOTE(review): the preview below shows an "Unnamed: 0" column, which looks like a
# row index that was saved into the file — confirm whether it should be read with
# index_col=0 or dropped before modelling.
data = pd.read_csv('adult.csv')

In [4]:
# Preview the first rows to check the column names and spot placeholder values
# (the "?" entries visible in the workclass and occupation columns).
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [5]:
# Frequency of each workclass label. The "?" bucket in the output appears to be
# the dataset's marker for unknown values rather than a real class.
data['workclass'].value_counts()

Private             33906
Self-emp-not-inc     3862
Local-gov            3136
?                    2799
State-gov            1981
Self-emp-inc         1695
Federal-gov          1432
Without-pay            21
Never-worked           10
Name: workclass, dtype: int64

In [6]:
# Frequency of each education level, most common first.
data['education'].value_counts()

HS-grad         15784
Some-college    10878
Bachelors        8025
Masters          2657
Assoc-voc        2061
11th             1812
Assoc-acdm       1601
10th             1389
7th-8th           955
Prof-school       834
9th               756
12th              657
Doctorate         594
5th-6th           509
1st-4th           247
Preschool          83
Name: education, dtype: int64

In [17]:
# Frequency of each native country. The output is dominated by United-States and
# also contains a "?" bucket for unknown countries.
data['native-country'].value_counts()

United-States                 43832
Mexico                          951
?                               857
Philippines                     295
Germany                         206
Puerto-Rico                     184
Canada                          182
El-Salvador                     155
India                           151
Cuba                            138
England                         127
China                           122
South                           115
Jamaica                         106
Italy                           105
Dominican-Republic              103
Japan                            92
Guatemala                        88
Poland                           87
Vietnam                          86
Columbia                         85
Haiti                            75
Portugal                         67
Taiwan                           65
Iran                             59
Nicaragua                        49
Greece                           49
Peru                        

In [13]:
# Gender split of the sample; the output shows roughly twice as many Male rows
# as Female rows, which matters when comparing income between the groups.
data['gender'].value_counts()

Male      32650
Female    16192
Name: gender, dtype: int64

In [14]:
# Race split of the sample; the output shows a strong imbalance towards White.
data['race'].value_counts()

White                 41762
Black                  4685
Asian-Pac-Islander     1519
Amer-Indian-Eskimo      470
Other                   406
Name: race, dtype: int64

In [32]:
# Frequency of each occupation. The output again contains a "?" bucket, so this
# column has unknown values as well.
data['occupation'].value_counts()

Prof-specialty       6172
Craft-repair         6112
Exec-managerial      6086
Adm-clerical         5611
Sales                5504
Other-service        4923
Machine-op-inspct    3022
?                    2809
Transport-moving     2355
Handlers-cleaners    2072
Farming-fishing      1490
Tech-support         1446
Protective-serv       983
Priv-house-serv       242
Armed-Forces           15
Name: occupation, dtype: int64

Based on an initial look at the data, the features most likely to help us are age, workclass, education and, most likely, the person's native country, as well as occupation (the latter may be tightly connected to education level; we will have to check this visually). Considering the world we live in, it will also be interesting to see whether factors that can be subject to discrimination — race and gender — are associated with lower pay. We will, however, certainly have to deal with some inaccuracy in this regard, because the proportions of the different genders and races are heavily imbalanced. A second factor that will contribute to inaccuracy in the model's predictions, and in our analysis of whether discrimination is present, is that the target is a categorical variable: income is either at most 50K or more than 50K. Therefore, I have decided to look at the probability with which a given person falls into each income class.

In [58]:
# Data preprocessing: handle the missing-value placeholders and encode the
# categorical columns as integers.
#
# Missing entries in this dataset are stored as the literal string "?" (not NaN).
# NOTE(review): 'occupation' also contains "?" rows (2809 in its value_counts
# output) and is not imputed in this cell.

MISSING_MARKER = "?"


def _fill_marker_with_mode(column, marker=MISSING_MARKER):
    """Replace every `marker` entry of `column` with the most frequent known value.

    Parameters
    ----------
    column : pd.Series
        Column that may contain `marker` as a placeholder for unknown values.
    marker : str
        Placeholder to replace.

    Returns
    -------
    pd.Series
        A new series with the placeholder replaced. The input is returned
        unchanged when it holds nothing but the placeholder (no mode exists).
    """
    known = column[column != marker]
    if known.empty:
        return column
    # Series.mode() returns its values sorted, so a tie resolves to the first label.
    return column.replace(marker, known.mode().iloc[0])


def _label_encode(frame, column):
    """Label-encode `column` of `frame` in place and return the label -> code map.

    The column is converted to the pandas 'category' dtype and its integer codes
    are stored in a new '<column>-cat' column.

    Parameters
    ----------
    frame : pd.DataFrame
        Frame to modify in place.
    column : str
        Name of the categorical column to encode.

    Returns
    -------
    dict
        Maps each label present in the column to its integer code, in order of
        first appearance in the frame.
    """
    code_column = column + "-cat"
    frame[column] = frame[column].astype('category')
    frame[code_column] = frame[column].cat.codes
    # One row per distinct (label, code) pair; a single pass over the frame
    # instead of re-filtering the whole frame once per code.
    pairs = frame[[column, code_column]].drop_duplicates()
    return {label: int(code) for label, code in zip(pairs[column], pairs[code_column])}


# Replace unknown values with the mode of the known values
# (United-States and Private for this dataset).
data['native-country'] = _fill_marker_with_mode(data['native-country'])
data['workclass'] = _fill_marker_with_mode(data['workclass'])

# We first try label encoding for the countries. If accuracy suffers we will
# switch to one-hot encoding, which would however add quite a few columns to an
# already fairly large data set. Gender, education and workclass are encoded the
# same way. The dictionaries keep the label -> code mapping for convenience.
native_country_map = _label_encode(data, "native-country")
gender_map = _label_encode(data, "gender")
education_map = _label_encode(data, "education")
workclass_map = _label_encode(data, "workclass")

native_country_map

{'United-States': 38,
 'Peru': 28,
 'Guatemala': 12,
 'Mexico': 25,
 'Dominican-Republic': 5,
 'Ireland': 20,
 'Germany': 10,
 'Philippines': 29,
 'Thailand': 36,
 'Haiti': 13,
 'El-Salvador': 7,
 'Puerto-Rico': 32,
 'Vietnam': 39,
 'South': 34,
 'Columbia': 3,
 'Japan': 23,
 'India': 18,
 'Cambodia': 0,
 'Poland': 30,
 'Laos': 24,
 'England': 8,
 'Cuba': 4,
 'Taiwan': 35,
 'Italy': 21,
 'Canada': 1,
 'Portugal': 31,
 'China': 2,
 'Nicaragua': 26,
 'Honduras': 15,
 'Iran': 19,
 'Scotland': 33,
 'Jamaica': 22,
 'Ecuador': 6,
 'Yugoslavia': 40,
 'Hungary': 17,
 'Hong': 16,
 'Greece': 11,
 'Trinadad&Tobago': 37,
 'Outlying-US(Guam-USVI-etc)': 27,
 'France': 9,
 'Holand-Netherlands': 14}