In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import numpy as np

In [2]:
# Column names for data/adult.data, listed in the order the fields appear.
adult_columns = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'income',
]
# header=None: the first line of the file is treated as data, so the names
# are supplied explicitly.
df = pd.read_csv('data/adult.data', names=adult_columns, header=None)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
# Report the size of the raw training frame.
# NOTE(review): the label says "Length" but the value printed is the full
# (rows, columns) shape tuple, unlike the later cells that print len(df).
print("Length of dataset=", df.shape)

Length of dataset= (32561, 15)


In [4]:
# Missing entries are stored as the string ' ?' (note the leading space);
# turn them into NaN so dropna() can remove every row that has one.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
df = df.replace(' ?', np.nan)
df = df.dropna()
print("Length of dataset=", len(df))

Length of dataset= 30162


In [5]:
def encode_labels(df, encoders=None):
    """Integer-encode the categorical columns of ``df`` with ``LabelEncoder``.

    The columns 'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'sex', 'native-country' and 'income' are each
    replaced by integer codes. The classes fitted for 'workclass' are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing all of the columns listed above. It is not modified.
    encoders : dict, optional
        Mapping of column name -> fitted ``LabelEncoder``. A column already
        present in the mapping is transformed with that encoder instead of a
        newly fitted one, and encoders fitted during this call are stored in
        it. Passing the same dict for two frames therefore gives both frames
        identical codes. With the default ``None`` every column is fitted on
        ``df`` alone, so two frames holding different sets of values can
        receive different codes for the same label.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the categorical columns encoded.

    Raises
    ------
    ValueError
        Raised by ``LabelEncoder.transform`` when a reused encoder meets a
        label it was not fitted on.
    """
    categorical_columns = (
        'workclass', 'education', 'marital-status', 'occupation',
        'relationship', 'race', 'sex', 'native-country', 'income',
    )

    # Encode a copy so the frame the caller passed in keeps its original values.
    encoded = df.copy()

    for column in categorical_columns:
        if encoders is not None and column in encoders:
            le = encoders[column]
        else:
            # One encoder per column, so each can be kept and reused later.
            le = preprocessing.LabelEncoder()
            le.fit(encoded[column].unique())
            if encoders is not None:
                encoders[column] = le

        if column == 'workclass':
            print(le.classes_)

        encoded[column] = le.transform(encoded[column])

    return encoded

In [6]:
# Replace the string categories of the training frame with integer codes.
# The call also prints the classes found in the 'workclass' column.
df = encode_labels(df)
df.head()

[' Federal-gov' ' Local-gov' ' Private' ' Self-emp-inc' ' Self-emp-not-inc'
 ' State-gov' ' Without-pay']


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,5,77516,9,13,4,0,1,4,1,2174,0,40,38,0
1,50,4,83311,9,13,2,3,0,4,1,0,0,13,38,0
2,38,2,215646,11,9,0,5,1,4,1,0,0,40,38,0
3,53,2,234721,1,7,2,5,0,2,1,0,0,40,38,0
4,28,2,338409,9,13,2,9,5,2,0,0,0,40,4,0


In [7]:
# Column names for data/adult_test.txt, listed in the order the fields appear.
adult_columns = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'income',
]
# header=None: the first line of the file is treated as data, so the names
# are supplied explicitly.
df_test = pd.read_csv('data/adult_test.txt', sep=",", names=adult_columns, header=None)
df_test.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K.


In [8]:
# Missing entries are stored as the string ' ?' (note the leading space);
# turn them into NaN so dropna() can remove every row that has one.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
# NOTE(review): the income labels previewed for this file end with '.'
# ('<=50K.' / '>50K.'), unlike the training preview; confirm whether they
# should be normalised before the two frames are compared.
df_test = df_test.replace(' ?', np.nan)
df_test = df_test.dropna()
print("Length of dataset=", len(df_test))

Length of dataset= 15060


In [9]:
# Replace the string categories of the test frame with integer codes.
# NOTE(review): this call fits its encoders on the test frame alone, so codes
# need not agree with the training frame. In the displayed outputs
# 'United-States' is 38 in the training frame and 37 here.
df_test = encode_labels(df_test)
df_test.head()

[' Federal-gov' ' Local-gov' ' Private' ' Self-emp-inc' ' Self-emp-not-inc'
 ' State-gov' ' Without-pay']


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,2,226802,1,7,4,6,3,2,1,0,0,40,37,0
1,38,2,89814,11,9,2,4,0,4,1,0,0,50,37,0
2,28,1,336951,7,12,2,10,0,4,1,0,0,40,37,1
3,44,2,160323,15,10,2,6,0,2,1,7688,0,40,37,1
5,34,2,198693,0,6,4,7,1,4,1,0,0,30,37,0


In [10]:
# Learn three equal-width age bins from the training frame only, then label
# the training rows with the resulting edges.
series_age, bins_of_age = pd.cut(df['age'], bins=3, retbins=True, labels=False)
df['binned_age'] = pd.cut(df['age'], bins=bins_of_age, labels=[0, 1, 2])
print(bins_of_age)

# Reuse the training edges for the test frame, but open the two outer edges:
# with finite edges, a test value outside the training range falls in no bin
# and pd.cut returns NaN for it.
test_bins_of_age = np.concatenate(([-np.inf], bins_of_age[1:-1], [np.inf]))
df_test['binned_age'] = pd.cut(df_test['age'], bins=test_bins_of_age, labels=[0, 1, 2])
df_test.head()

[ 16.927       41.33333333  65.66666667  90.        ]


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income,binned_age
0,25,2,226802,1,7,4,6,3,2,1,0,0,40,37,0,0
1,38,2,89814,11,9,2,4,0,4,1,0,0,50,37,0,0
2,28,1,336951,7,12,2,10,0,4,1,0,0,40,37,1,0
3,44,2,160323,15,10,2,6,0,2,1,7688,0,40,37,1,1
5,34,2,198693,0,6,4,7,1,4,1,0,0,30,37,0,0


In [11]:
# Learn three equal-width fnlwgt bins from the training frame only, then label
# the training rows with the resulting edges.
series_fnlwgt, bins_of_fnlwgt = pd.cut(df['fnlwgt'], bins=3, retbins=True, labels=False)
df['binned_fnlwgt'] = pd.cut(df['fnlwgt'], bins=bins_of_fnlwgt, labels=[0, 1, 2])

# Reuse the training edges for the test frame, but open the two outer edges:
# with finite edges, a test value outside the training range falls in no bin
# and pd.cut returns NaN for it.
test_bins_of_fnlwgt = np.concatenate(([-np.inf], bins_of_fnlwgt[1:-1], [np.inf]))
df_test['binned_fnlwgt'] = pd.cut(df_test['fnlwgt'], bins=test_bins_of_fnlwgt, labels=[0, 1, 2])

In [12]:
# Learn three equal-width capital-gain bins from the training frame only, then
# label the training rows with the resulting edges.
# The codes are stored as series_gain; the original reused the name
# series_loss, which the capital-loss cell assigns as well.
series_gain, bins_of_cp = pd.cut(df['capital-gain'], bins=3, retbins=True, labels=False)
df['binned_capital-gain'] = pd.cut(df['capital-gain'], bins=bins_of_cp, labels=[0, 1, 2])

# Reuse the training edges for the test frame, but open the two outer edges:
# with finite edges, a test value outside the training range falls in no bin
# and pd.cut returns NaN for it.
test_bins_of_cp = np.concatenate(([-np.inf], bins_of_cp[1:-1], [np.inf]))
df_test['binned_capital-gain'] = pd.cut(df_test['capital-gain'], bins=test_bins_of_cp, labels=[0, 1, 2])

In [13]:
# Learn three equal-width capital-loss bins from the training frame only, then
# label the training rows with the resulting edges.
series_loss, bins_of_loss = pd.cut(df['capital-loss'], bins=3, retbins=True, labels=False)
df['binned_capital-loss'] = pd.cut(df['capital-loss'], bins=bins_of_loss, labels=[0, 1, 2])

# Reuse the training edges for the test frame, but open the two outer edges:
# with finite edges, a test value outside the training range falls in no bin
# and pd.cut returns NaN for it.
test_bins_of_loss = np.concatenate(([-np.inf], bins_of_loss[1:-1], [np.inf]))
df_test['binned_capital-loss'] = pd.cut(df_test['capital-loss'], bins=test_bins_of_loss, labels=[0, 1, 2])

In [14]:
# Learn three equal-width hours-per-week bins from the training frame only,
# then label the training rows with the resulting edges.
series_hp, bins_of_hp = pd.cut(df['hours-per-week'], bins=3, retbins=True, labels=False)
df['binned_hours-per-week'] = pd.cut(df['hours-per-week'], bins=bins_of_hp, labels=[0, 1, 2])

# Reuse the training edges for the test frame, but open the two outer edges:
# with finite edges, a test value outside the training range falls in no bin
# and pd.cut returns NaN for it.
test_bins_of_hp = np.concatenate(([-np.inf], bins_of_hp[1:-1], [np.inf]))
df_test['binned_hours-per-week'] = pd.cut(df_test['hours-per-week'], bins=test_bins_of_hp, labels=[0, 1, 2])

In [15]:
# Preview the test frame now that the five binned_* columns have been added.
df_test.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income,binned_age,binned_fnlwgt,binned_capital-gain,binned_capital-loss,binned_hours-per-week
0,25,2,226802,1,7,4,6,3,2,1,0,0,40,37,0,0,0,0,0,1
1,38,2,89814,11,9,2,4,0,4,1,0,0,50,37,0,0,0,0,0,1
2,28,1,336951,7,12,2,10,0,4,1,0,0,40,37,1,0,0,0,0,1
3,44,2,160323,15,10,2,6,0,2,1,7688,0,40,37,1,1,0,0,0,1
5,34,2,198693,0,6,4,7,1,4,1,0,0,30,37,0,0,0,0,0,0


In [20]:
# Preview the training frame now that the five binned_* columns have been added.
# NOTE(review): the execution counter jumps from In [15] to In [20] at this
# cell; confirm the notebook still reproduces under Restart & Run All.
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income,binned_age,binned_fnlwgt,binned_capital-gain,binned_capital-loss,binned_hours-per-week
0,39,5,77516,9,13,4,0,1,4,1,2174,0,40,38,0,0,0,0,0,1
1,50,4,83311,9,13,2,3,0,4,1,0,0,13,38,0,1,0,0,0,0
2,38,2,215646,11,9,0,5,1,4,1,0,0,40,38,0,0,0,0,0,1
3,53,2,234721,1,7,2,5,0,2,1,0,0,40,38,0,1,0,0,0,1
4,28,2,338409,9,13,2,9,5,2,0,0,0,40,4,0,0,0,0,0,1
