# Income Prediction Analysis

### Import statements

In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import feature_column
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Dropout

Using TensorFlow backend.


### Clean data and create dataframe:

In [2]:
adult_cols = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", 
              "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", 
              "hours-per-week", "native-country", "income"]

adult_df = pd.read_csv("input/adult.data", names=adult_cols)

In [3]:
# Convert income to numerical (0 : <=50K, 1 : >50K)

income_map = {' <=50K': 0, ' >50K': 1}
adult_df["income"] = adult_df["income"].map(income_map)

In [4]:
# Trim white spaces

adult_df = adult_df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
adult_df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,0
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,1
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,0
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,0


### Split the dataframe into train, validation, and test sets:

In [5]:
train, test = train_test_split(adult_df, test_size=0.2)
train, val = train_test_split(train, test_size=0.2)
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

20838 train examples
5210 validation examples
6513 test examples


### Create an input pipeline using tf.data

In [6]:
# A utility method to create a tf.data dataset from a Pandas Dataframe
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    dataframe = dataframe.copy()
    labels = dataframe.pop('income')
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    return ds

In [7]:
batch_size = 10 # A small batch sized is used for demonstration purposes
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

### Understand the input pipeline

In [8]:
for feature_batch, label_batch in train_ds.take(1):
    print('Every feature:', list(feature_batch.keys()))
    print('A batch of ages:', feature_batch['age'])
    print('A batch of targets:', label_batch )

Every feature: ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']
A batch of ages: tf.Tensor([42 26 37 23 52 38 22 38 70 52], shape=(10,), dtype=int32)
A batch of targets: tf.Tensor([1 0 0 0 1 0 0 0 0 1], shape=(10,), dtype=int32)


### Feature selection:

In [9]:
# Check column types
adult_df.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
income             int64
dtype: object

In [10]:
adult_df["native-country"].value_counts()

United-States                 29170
Mexico                          643
?                               583
Philippines                     198
Germany                         137
Canada                          121
Puerto-Rico                     114
El-Salvador                     106
India                           100
Cuba                             95
England                          90
Jamaica                          81
South                            80
China                            75
Italy                            73
Dominican-Republic               70
Vietnam                          67
Guatemala                        64
Japan                            62
Poland                           60
Columbia                         59
Taiwan                           51
Haiti                            44
Iran                             43
Portugal                         37
Nicaragua                        34
Peru                             31
Greece                      

In [11]:
feature_columns = []

**Numeric columns:**

In [12]:
for header in ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']:
    feature_columns.append(feature_column.numeric_column(header))

**Categorical columns:**

In [13]:
# Workclass
workclass_values = ['Private','Self-emp-not-inc','Local-gov','?','State-gov','Self-emp-inc',
                    'Federal-gov','Without-pay','Never-worked']
workclass = feature_column.categorical_column_with_vocabulary_list(
      'workclass', workclass_values)
workclass_one_hot = feature_column.indicator_column(workclass)

feature_columns.append(workclass_one_hot)

# Education
education_values = ['HS-grad','Some-college','Bachelors','Masters','Assoc-voc','11th','Assoc-acdm',
                     '10th','7th-8th','Prof-school','9th','12th','Doctorate','5th-6th','1st-4th','Preschool']
education = feature_column.categorical_column_with_vocabulary_list(
      'education', education_values)
education_one_hot = feature_column.indicator_column(education)

feature_columns.append(education_one_hot)

# Marital Status
marital_status_values = ['Married-civ-spouse','Never-married','Divorced','Separated','Widowed',
                         'Married-spouse-absent','Married-AF-spouse']
marital_status = feature_column.categorical_column_with_vocabulary_list(
      'marital-status', marital_status_values)
marital_status_one_hot = feature_column.indicator_column(marital_status)

feature_columns.append(marital_status_one_hot)

# Occupation
occupation_values = ['Prof-specialty', 'Craft-repair', 'Exec-managerial', 'Adm-clerical', 'Sales', 'Other-service',
                'Machine-op-inspct', '?', 'Transport-moving', 'Handlers-cleaners', 'Farming-fishing', 'Tech-support',
                'Protective-serv', 'Priv-house-serv', 'Armed-Forces']
occupation = feature_column.categorical_column_with_vocabulary_list(
      'occupation', occupation_values)
occupation_one_hot = feature_column.indicator_column(occupation)

feature_columns.append(occupation_one_hot)

# Relationship
relationship_values = ['Husband','Not-in-family','Own-child','Unmarried','Wife','Other-relative']
relationship = feature_column.categorical_column_with_vocabulary_list(
      'relationship', relationship_values)
relationship_one_hot = feature_column.indicator_column(relationship)

feature_columns.append(relationship_one_hot)

# Race
race_values = ['White','Black','Asian-Pac-Islander','Amer-Indian-Eskimo','Other']
race = feature_column.categorical_column_with_vocabulary_list(
      'race', race_values)
race_one_hot = feature_column.indicator_column(race)

feature_columns.append(race_one_hot)

# Sex
sex_values = ['Male','Female']
sex = feature_column.categorical_column_with_vocabulary_list(
      'sex', sex_values)
sex_one_hot = feature_column.indicator_column(sex)

feature_columns.append(sex_one_hot)

**Bucketized columns:**

In [14]:
age = feature_column.numeric_column("age")
age_buckets = feature_column.bucketized_column(age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
feature_columns.append(age_buckets)

**Embedded columns:**

In [15]:
occupation_embedding = feature_column.embedding_column(occupation, dimension=8)
feature_columns.append(occupation_embedding)

**Crossed columns:**

In [16]:
crossed_feature = feature_column.crossed_column([age_buckets, occupation], hash_bucket_size=1000)
crossed_feature = feature_column.indicator_column(crossed_feature)
feature_columns.append(crossed_feature)

### Create the feature layer

In [17]:
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

In [18]:
batch_size = 32
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

### Create and train the model:

In [19]:
model = tf.keras.Sequential([
    feature_layer,
    layers.Dense(128, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(train_ds,
          validation_data=val_ds,
          epochs=20)

Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x14572cc50>

In [20]:
loss, accuracy = model.evaluate(test_ds)
print("Accuracy", accuracy)

Accuracy 0.79824966


In [21]:
d = {'age': [40], 'workclass': ['Private'], 'fnlwgt': [154374], 'education': ['HS-grad'], 'education-num': [9], 
     'marital-status': ['Married-civ-spouse'], 'occupation': ['Machine-op-inspct'], 'relationship': ['Husband'], 
     'race': ['White'], 'sex': ['Male'], 'capital-gain': [0], 'capital-loss': [0], 'hours-per-week': [40], 
     'native-country': ['United-States'], 'income': [0]}
df_input = pd.DataFrame(data=d)
df_input

input_ds = df_to_dataset(df_input, shuffle=False, batch_size=batch_size)

In [22]:
print(model.predict(input_ds))

[[0.02274946]]
