In [1]:
import numpy as np
import pandas as pd
import sklearn.preprocessing as preprocessing

### Load raw data

In [7]:
# Read the raw train and test splits with identical parser settings.
# `index_col=False` tells pandas not to use the first column as the row index.
data, data_test = (
    pd.read_csv(raw_path, sep=",", index_col=False)
    for raw_path in ("../data/raw_data/adult.data", "../data/raw_data/adult.test")
)

In [3]:
# Preview the first rows of the training frame as loaded (categoricals are still text here).
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


### Calculate auxiliary variables
`age_binary_young` (1 when age < 35) and `age_binary_old` (1 when age > 59) were created to serve as possible target attributes for targeted label-flipping (we ended up not using them). They are included in the processed dataset, but are not used by the decision-tree learner.

In [8]:
# Derive the auxiliary age flags and binarise `sex` / `income` for both splits.
# Everything is computed column-wise (no per-row loop), and each flag Series is
# built from its own frame, so the test flags have exactly len(data_test) rows.

# Age flags: 1 when age < 35 (young) / age > 59 (old), otherwise 0.
age_binary_young = (data['age'] < 35).astype("int64")
age_binary_old = (data['age'] > 59).astype("int64")
age_binary_young_test = (data_test['age'] < 35).astype("int64")
age_binary_old_test = (data_test['age'] > 59).astype("int64")

# sex: 1 for "Female", 0 for anything else. Labels are stripped first because
# the comparison must ignore surrounding whitespace.
data['sex'] = data['sex'].str.strip().eq("Female").astype("int64")
data_test['sex'] = data_test['sex'].str.strip().eq("Female").astype("int64")

# Training income: 0 for "<=50K", 1 for anything else.
data['income'] = data['income'].str.strip().ne("<=50K").astype("int64")

# Test income labels end with a period ("<=50K." / ">50K."). A label that is
# neither is reported by row index and left unchanged, so it can be inspected.
income_test_codes = {"<=50K.": 0, ">50K.": 1}
income_test_labels = data_test['income'].str.strip()
for index in data_test.index[~income_test_labels.isin(list(income_test_codes))]:
    print("index: ", index)
data_test['income'] = [
    income_test_codes.get(label, raw)
    for label, raw in zip(income_test_labels, data_test['income'])
]

# Append the flags as the last two columns (old first, then young).
data['age_binary_old'] = age_binary_old
data['age_binary_young'] = age_binary_young

data_test['age_binary_old'] = age_binary_old_test
data_test['age_binary_young'] = age_binary_young_test


### Pre-processing categorical variables
Group native country by region (note: takes ~30 seconds to run)

In [9]:
# Countries belonging to each region bucket. Spellings follow the raw labels
# exactly (e.g. 'Hong', 'South', 'Columbia', 'Trinadad&Tobago').
europe = ['England','France','Germany','Greece','Holand-Netherlands','Hungary',
               'Ireland', 'Italy','Poland','Portugal','Scotland','Yugoslavia']
n_am = ['Canada','United-States']
asia = ['Cambodia','China','Hong','India','Iran','Japan','Laos','Philippines',
        'South','Taiwan','Thailand','Vietnam']
lat_am = ['Cuba','Columbia','Dominican-Republic','El-Salvador','Ecuador','Guatemala',
          'Haiti','Honduras', 'Jamaica','Mexico','Nicaragua','Peru','Trinadad&Tobago']
other = ['?']
other_us = ['Outlying-US(Guam-USVI-etc)', 'Puerto-Rico']

# Flatten the (region label, member countries) pairs into one country -> region
# lookup so each frame can be converted with a single column-wise map.
region_members = [("europe", europe), ("us_can", n_am), ("asia", asia),
                  ("lat_am", lat_am), ("us_terr", other_us), ("unknown", other)]
region_by_country = {
    country: region
    for region, members in region_members
    for country in members
}

# Apply the same grouping to the training frame and then the test frame.
for frame in (data, data_test):
    raw_country = frame['native-country']
    # Strip whitespace before the lookup; countries missing from the table map to NaN.
    region = raw_country.str.strip().map(region_by_country)
    unmatched = region.isna()
    # Unrecognised countries are printed and keep their original value.
    for value in raw_country[unmatched]:
        print(value)
    frame['native-country'] = region.where(~unmatched, raw_country)

Convert workclass, education, marital status, race, occupation, relationship, and native country to numerical values. (Takes around 1 minute to run)

Numerical assignments for workclass, occupations, relationships, and countries are arbitrary. We tried using one-hot encodings to represent these features, but this made Antidote-P take an unreasonably large amount of time.

In [10]:
# Integer codes for each categorical column. These dict/list names are reused
# by the cell that encodes the test frame.
workclass = {'?' : 0, 'Private' : 1, 'Self-emp-inc' : 2, 'Self-emp-not-inc' : 3, 'Local-gov' : 4,
             'State-gov' : 5, 'Federal-gov' : 6, 'Without-pay' : 7, 'Never-worked': 8}
education = {'Preschool' : 0, '1st-4th' : 1, '5th-6th' : 2, '7th-8th' : 3, '9th' : 4, '10th' : 5, 
             '11th' : 6, '12th' : 7, 'HS-grad' : 8, 'Some-college' : 9, 'Assoc-voc': 10,
             'Assoc-acdm' : 11, 'Bachelors' : 12, 'Masters' : 13, 'Prof-school' : 14, 'Doctorate' : 15}
# married: switch to binary married/not
married = ['Married-civ-spouse', 'Married-spouse-absent', 'Married-AF-spouse'] 
# race: binary white/not
white = ['White']
occupations = {'?': 0, 'Adm-clerical': 1, 'Armed-Forces': 2, 'Craft-repair': 3, 'Exec-managerial': 4, 
               'Farming-fishing': 5, 'Handlers-cleaners' : 6, 'Machine-op-inspct': 7, 'Other-service': 8,
               'Priv-house-serv': 9, 'Prof-specialty': 10, 'Protective-serv' : 11, 'Sales' : 12, 
                'Tech-support' : 13, 'Transport-moving' : 14 }
# NOTE: code 4 is not assigned to any relationship; the gap is kept as-is so the
# encoded values stay the same.
relationships = {'Husband' : 0, 'Not-in-family' : 1, 'Other-relative' : 2, 'Own-child' : 3, 'Wife' : 5,
                 'Unmarried' : 6}
countries = {"us_can" : 0,"europe" : 1, "us_terr" : 2, "asia" : 3, "lat_am" : 4, "unknown" : 5}

# Encode the training frame column-wise. Every column is computed and validated
# first and only then written back, so a failure leaves `data` untouched.
encoded_columns = {}
for column, codes in [('workclass', workclass), ('education', education),
                      ('occupation', occupations), ('relationship', relationships),
                      ('native-country', countries)]:
    labels = data[column].str.strip()
    known = labels.isin(list(codes))
    if not known.all():
        # A label with no code is an error, exactly as a failed dict lookup would be.
        raise KeyError(f"unrecognised {column} value(s): {labels[~known].unique().tolist()}")
    encoded_columns[column] = labels.map(codes).astype("int64")

# marital-status: 1 if the label is one of the `married` statuses, else 0.
encoded_columns['marital-status'] = data['marital-status'].str.strip().isin(married).astype("int64")
# race: 0 if the label is in `white`, else 1.
encoded_columns['race'] = (~data['race'].str.strip().isin(white)).astype("int64")

for column, values in encoded_columns.items():
    data[column] = values

In [11]:
# Encode the test frame with the same code tables as the training frame
# (`workclass`, `education`, `married`, `white`, `occupations`, `relationships`,
# `countries` come from the previous cell). Every column is computed and
# validated first and only then written back, so a failure leaves `data_test`
# untouched.
encoded_test_columns = {}
for column, codes in [('workclass', workclass), ('education', education),
                      ('occupation', occupations), ('relationship', relationships),
                      ('native-country', countries)]:
    labels = data_test[column].str.strip()
    known = labels.isin(list(codes))
    if not known.all():
        # A label with no code is an error, exactly as a failed dict lookup would be.
        raise KeyError(f"unrecognised {column} value(s): {labels[~known].unique().tolist()}")
    encoded_test_columns[column] = labels.map(codes).astype("int64")

# marital-status: 1 if the label is one of the `married` statuses, else 0.
encoded_test_columns['marital-status'] = (
    data_test['marital-status'].str.strip().isin(married).astype("int64")
)
# race: 0 if the label is in `white`, else 1.
encoded_test_columns['race'] = (~data_test['race'].str.strip().isin(white)).astype("int64")

for column, values in encoded_test_columns.items():
    data_test[column] = values

In [18]:
# Preview the training frame after encoding; every column should now be numeric.
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income,age_binary_old,age_binary_young
0,39,5,77516,12,13,0,1,1,0,0,2174,0,40,0,0,0,0
1,50,3,83311,12,13,1,4,0,0,0,0,0,13,0,0,0,0
2,38,1,215646,8,9,0,6,1,0,0,0,0,40,0,0,0,0
3,53,1,234721,6,7,1,6,0,1,0,0,0,40,0,0,0,0
4,28,1,338409,12,13,1,10,5,1,1,0,0,40,4,0,0,1


In [19]:
# Preview the test frame after encoding; every column should now be numeric.
data_test.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income,age_binary_old,age_binary_young
0,25,1,226802,6,7,0,7,3,1,0,0,0,40,0,0,0,1
1,38,1,89814,8,9,1,5,0,0,0,0,0,50,0,0,0,0
2,28,4,336951,11,12,1,11,0,0,0,0,0,40,0,1,0,1
3,44,1,160323,9,10,1,7,0,1,0,7688,0,40,0,1,0,0
4,18,0,103497,9,10,0,0,3,0,1,0,0,30,0,0,0,1


In [34]:
# Write the processed training set without the index or a header row.
# The separator is passed by keyword: positional arguments after the path are
# deprecated in recent pandas releases.
data.to_csv("../data/adult_income.data", sep=",", index=False, header=False)

In [48]:
# Write the processed test set without the index or a header row.
# The separator is passed by keyword: positional arguments after the path are
# deprecated in recent pandas releases.
data_test.to_csv("../data/adult_income.test", sep=",", index=False, header=False)

In [20]:
# Number of rows in the training frame.
len(data)

32561

In [21]:
# Number of rows in the test frame.
len(data_test)

16281