# Income Classification using a Linear Classifier

Modified from original code here: https://www.tensorflow.org/tutorials/wide

### Make the notebook compatible with both Python 2 and 3

http://python-future.org/compatible_idioms.html

In [52]:
from __future__ import absolute_import, division, print_function

In [53]:
import pandas as pd
from six.moves import urllib
import shutil
import tensorflow as tf
tf.compat.v1.disable_eager_execution()

In [54]:
print(tf.__version__)
print(pd.__version__)

2.3.0
1.0.5


### Set up the file names where the training data and the test data are to be stored

Note that you'll have to manually create the "Datasets" directory under the current working directory before running the download cells below.

In [55]:
TRAIN_FILE_NAME = "Datasets/adult.data"
TEST_FILE_NAME = "Datasets/adult.test"

### Download and store the training and test data from the UCI Machine Learning Repository

There are a whole host of interesting datasets here: https://archive.ics.uci.edu/ml/index.php

In [56]:
# Fetch the training split and save it to TRAIN_FILE_NAME. This performs a
# fresh download every time the cell runs, and the target directory must
# already exist because urlretrieve does not create it.
urllib.request.urlretrieve(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
        TRAIN_FILE_NAME)

('Datasets/adult.data', <http.client.HTTPMessage at 0x1b7213e8cd0>)

In [57]:
# Fetch the test split and save it to TEST_FILE_NAME. This performs a fresh
# download every time the cell runs, and the target directory must already
# exist because urlretrieve does not create it.
urllib.request.urlretrieve(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test",
        TEST_FILE_NAME)

('Datasets/adult.test', <http.client.HTTPMessage at 0x1b7213e8eb0>)

### The columns in the census data

In [58]:
# Column names handed to pd.read_csv(names=...). They are applied to the
# fields of each row positionally, so the order here must match the files.
CSV_COLUMNS = [
    "age", "workclass", "fnlwgt", "education", "education_num",
    "marital_status", "occupation", "relationship", "race", "gender",
    "capital_gain", "capital_loss", "hours_per_week", "native_country",
    "income_bracket"
]

### Read training data into a dataframe

Sample and explore the data to understand what information is available. This will also be used to set up feature columns which will serve as an input to our linear classifier.

In [59]:
# Load the training file for exploration. Column names come from CSV_COLUMNS
# and whitespace following each delimiter is stripped.
# NOTE(review): skiprows=1 discards the first line of the file. If adult.data
# has no header line, this silently drops one real record -- confirm.
df = pd.read_csv(
      TRAIN_FILE_NAME,
      names=CSV_COLUMNS,
      skipinitialspace=True,
      skiprows=1)

In [60]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


### Choose only those columns which seem relevant to predicting income

* Removed the "fnlwgt" column, the number of people the census takers believe that observation represents (sample weight)
* Removed "capital_gain" and "capital_loss": these are continuous, dense columns, which usually work well with neural networks, so they are left out of this linear model

In [61]:
TRIMMED_REORDERED_COLUMNS = [
    "age", "workclass", "education", "education_num",
    "marital_status", "relationship", "race", "gender", "occupation", 
    "hours_per_week", "native_country", "income_bracket"
]

In [62]:
df = df[TRIMMED_REORDERED_COLUMNS]

In [63]:
df.head()

Unnamed: 0,age,workclass,education,education_num,marital_status,relationship,race,gender,occupation,hours_per_week,native_country,income_bracket
0,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Husband,White,Male,Exec-managerial,13,United-States,<=50K
1,38,Private,HS-grad,9,Divorced,Not-in-family,White,Male,Handlers-cleaners,40,United-States,<=50K
2,53,Private,11th,7,Married-civ-spouse,Husband,Black,Male,Handlers-cleaners,40,United-States,<=50K
3,28,Private,Bachelors,13,Married-civ-spouse,Wife,Black,Female,Prof-specialty,40,Cuba,<=50K
4,37,Private,Masters,14,Married-civ-spouse,Wife,White,Female,Exec-managerial,40,United-States,<=50K


### Feature columns with categorical values

Find the unique values in a column and set up a categorical feature for those columns

In [64]:
df['gender'].unique()

array(['Male', 'Female'], dtype=object)

In [65]:
df['race'].unique()

array(['White', 'Black', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo',
       'Other'], dtype=object)

In [66]:
df['education'].unique()

array(['Bachelors', 'HS-grad', '11th', 'Masters', '9th', 'Some-college',
       'Assoc-acdm', 'Assoc-voc', '7th-8th', 'Doctorate', 'Prof-school',
       '5th-6th', '10th', '1st-4th', 'Preschool', '12th'], dtype=object)

In [67]:
df['marital_status'].unique()

array(['Married-civ-spouse', 'Divorced', 'Married-spouse-absent',
       'Never-married', 'Separated', 'Married-AF-spouse', 'Widowed'],
      dtype=object)

In [68]:
df['relationship'].unique()

array(['Husband', 'Not-in-family', 'Wife', 'Own-child', 'Unmarried',
       'Other-relative'], dtype=object)

In [69]:
df['workclass'].unique()

array(['Self-emp-not-inc', 'Private', 'State-gov', 'Federal-gov',
       'Local-gov', '?', 'Self-emp-inc', 'Without-pay', 'Never-worked'],
      dtype=object)

### Set up categorical feature columns

Use *tf.feature_column.categorical_column_with_vocabulary_list* if the categorical columns have a finite set of values that we know in advance

In [70]:
# Categorical features whose complete vocabulary is known up front. Each
# vocabulary contains the values reported by the matching df[...].unique()
# call earlier in the notebook.
gender = tf.feature_column.categorical_column_with_vocabulary_list(
    key="gender",
    vocabulary_list=["Female", "Male"],
)

race = tf.feature_column.categorical_column_with_vocabulary_list(
    key="race",
    vocabulary_list=[
        "White",
        "Black",
        "Asian-Pac-Islander",
        "Amer-Indian-Eskimo",
        "Other",
    ],
)

education = tf.feature_column.categorical_column_with_vocabulary_list(
    key="education",
    vocabulary_list=[
        "Bachelors",
        "HS-grad",
        "11th",
        "Masters",
        "9th",
        "Some-college",
        "Assoc-acdm",
        "Assoc-voc",
        "7th-8th",
        "Doctorate",
        "Prof-school",
        "5th-6th",
        "10th",
        "1st-4th",
        "Preschool",
        "12th",
    ],
)

marital_status = tf.feature_column.categorical_column_with_vocabulary_list(
    key="marital_status",
    vocabulary_list=[
        "Married-civ-spouse",
        "Divorced",
        "Married-spouse-absent",
        "Never-married",
        "Separated",
        "Married-AF-spouse",
        "Widowed",
    ],
)

relationship = tf.feature_column.categorical_column_with_vocabulary_list(
    key="relationship",
    vocabulary_list=[
        "Husband",
        "Not-in-family",
        "Wife",
        "Own-child",
        "Unmarried",
        "Other-relative",
    ],
)

workclass = tf.feature_column.categorical_column_with_vocabulary_list(
    key="workclass",
    vocabulary_list=[
        "Self-emp-not-inc",
        "Private",
        "State-gov",
        "Federal-gov",
        "Local-gov",
        "?",
        "Self-emp-inc",
        "Without-pay",
        "Never-worked",
    ],
)

### Columns with continuous values

Use *tf.feature_column.numeric_column* to set up columns which have values in a numeric range

In [71]:
age = tf.feature_column.numeric_column("age")

education_num = tf.feature_column.numeric_column("education_num")

hours_per_week = tf.feature_column.numeric_column("hours_per_week")

### Bucketed columns

Sometimes the relationship between a continuous feature and the label is not linear. A person's income may grow with age in the early stage of one's career, then the growth may slow at some point, and finally the income decreases after retirement. In this scenario, using the **raw age as a real-valued feature column** might not be a good choice because the model can only learn one of the three cases:

* Income always increases at some rate as age grows (positive correlation),
* Income always decreases at some rate as age grows (negative correlation), or
* Income stays the same no matter at what age (no correlation)

If we want to **learn the fine-grained correlation** between income and each age group separately, we can leverage bucketization. 

Bucketization is a process of dividing the entire range of a continuous feature into a set of consecutive bins/buckets, and then converting the original numerical feature into a bucket ID (as a categorical feature) depending on which bucket that value falls into. 

In [72]:
# Convert the numeric age column into a categorical bucket ID. The ten
# boundaries split ages into eleven ranges: below 18, [18, 25), [25, 30), ...,
# [60, 65), and 65 or older.
age_buckets = tf.feature_column.bucketized_column(
    age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])

### Categorical column values might change over time

In [73]:
df['occupation'].unique()

array(['Exec-managerial', 'Handlers-cleaners', 'Prof-specialty',
       'Other-service', 'Adm-clerical', 'Sales', 'Craft-repair',
       'Transport-moving', 'Farming-fishing', 'Machine-op-inspct',
       'Tech-support', '?', 'Protective-serv', 'Armed-Forces',
       'Priv-house-serv'], dtype=object)

In [74]:
df['native_country'].unique()

array(['United-States', 'Cuba', 'Jamaica', 'India', '?', 'Mexico',
       'South', 'Puerto-Rico', 'Honduras', 'England', 'Canada', 'Germany',
       'Iran', 'Philippines', 'Italy', 'Poland', 'Columbia', 'Cambodia',
       'Thailand', 'Ecuador', 'Laos', 'Taiwan', 'Haiti', 'Portugal',
       'Dominican-Republic', 'El-Salvador', 'France', 'Guatemala',
       'China', 'Japan', 'Yugoslavia', 'Peru',
       'Outlying-US(Guam-USVI-etc)', 'Scotland', 'Trinadad&Tobago',
       'Greece', 'Nicaragua', 'Vietnam', 'Hong', 'Ireland', 'Hungary',
       'Holand-Netherlands'], dtype=object)

### Categorical columns with unknown values

If you don't know the full set of values a categorical column can take in advance, use *tf.feature_column.categorical_column_with_hash_bucket*, where every column value is hashed to an integer bucket ID in the range [0, hash_bucket_size).

The chances of collisions are usually low.

In [28]:
# Hash each string value into one of 1000 buckets instead of enumerating a
# vocabulary, so values that were not seen in the explored sample still map
# to a bucket ID.
occupation = tf.feature_column.categorical_column_with_hash_bucket(
    "occupation", hash_bucket_size=1000)

native_country = tf.feature_column.categorical_column_with_hash_bucket(
    "native_country", hash_bucket_size=1000)

### Base columns which use the raw values from the dataset

In [29]:
# Feature columns fed to the linear model.
# NOTE(review): the `relationship` column is defined earlier in the notebook
# but is not included in this list -- confirm whether that is intentional.
base_columns = [
    gender, race, marital_status, workclass, occupation,
    native_country, age_buckets, education
]

### Crossed columns express more complex relationships between data

Some relationships between individual features and the output may be hard to define. Two or more features considered together might have a more direct impact on the output. Feature crosses are **engineered features** that allow you to specify these more complex relationships.

Education and occupation when considered together will be a better predictor of income than either of them alone.

In [45]:
# Feature crosses, each hashed into 1000 buckets.
# NOTE(review): this list is not passed to the LinearClassifier built later in
# the notebook (only base_columns is) -- confirm whether that is intended.
crossed_columns = [
    tf.feature_column.crossed_column(
        ["education", "occupation"], hash_bucket_size=1000),
    tf.feature_column.crossed_column(
        [age_buckets, "education", "occupation"], hash_bucket_size=1000),
    tf.feature_column.crossed_column(
        ["native_country", "occupation"], hash_bucket_size=1000)
]

### Continuous valued columns

If these columns are dense they are more suitable for deep neural networks

In [46]:
# Dense numeric feature columns.
# NOTE(review): this list is not referenced by the LinearClassifier built
# later in the notebook -- presumably kept for a deep/wide-and-deep variant;
# confirm.
deep_columns = [
    education_num,
    hours_per_week
]

### The input function in an estimator maps the features and the corresponding labels

The TensorFlow helper *tf.compat.v1.estimator.inputs.pandas_input_fn* allows us to specify the feature data as a pandas DataFrame and the labels as a pandas Series.

The input function specifies the features and labels for training
    

In [47]:
def input_fn(file_name, num_epochs, shuffle):
  """Build an Estimator input function from one of the census CSV files.

  The file is read with the CSV_COLUMNS names, reduced to
  TRIMMED_REORDERED_COLUMNS, and rows containing NaN are dropped. The
  "income_bracket" column is converted into a 0/1 label and removed from the
  features so the label never travels alongside the model inputs.

  Args:
    file_name: Path of the CSV file to read.
    num_epochs: Number of passes over the data, forwarded to
      pandas_input_fn; None repeats the data indefinitely.
    shuffle: Whether pandas_input_fn should shuffle the rows.

  Returns:
    The callable produced by tf.compat.v1.estimator.inputs.pandas_input_fn,
    which yields batches of 100 rows.
  """
  # NOTE(review): skiprows=1 discards the first line of every file passed in.
  # If a file has no header/banner line, this drops one real record -- confirm
  # for each file this function is called with.
  df = pd.read_csv(
      file_name,
      names=CSV_COLUMNS,
      skipinitialspace=True,
      skiprows=1)
  df = df[TRIMMED_REORDERED_COLUMNS]

  # Remove rows that have NaN in any column
  df = df.dropna(how="any", axis=0)

  # Use numeric labels to represent incomes below and above 50K:
  # 1 when the bracket string contains ">50K", otherwise 0
  labels = df["income_bracket"].apply(lambda x: ">50K" in x).astype(int)

  # Keep the label out of the feature frame; only the labels Series carries it
  features = df.drop(columns=["income_bracket"])

  # The pandas_input_fn documentation recommends a single reader thread when
  # a predictable, repeatable order is needed (evaluation / prediction), so
  # multiple threads are only used when the data is being shuffled anyway.
  num_threads = 5 if shuffle else 1

  return tf.compat.v1.estimator.inputs.pandas_input_fn(
      x=features,
      y=labels,
      batch_size=100,
      num_epochs=num_epochs,
      shuffle=shuffle,
      num_threads=num_threads)

In [77]:
MODEL_DIR = "linear_classifier"

### Remove the old saved model so we generate entirely new parameters

In [78]:
# ignore_errors=True lets this cell succeed on a fresh run, when the model
# directory has not been created yet; without it rmtree raises
# FileNotFoundError. Note that other removal errors are suppressed as well.
shutil.rmtree(MODEL_DIR, ignore_errors=True)

### Pass in the base columns

For this model, the base columns are the ones which really affect the output

In [79]:
# Build the linear classifier from base_columns only; checkpoints and
# summaries are written under MODEL_DIR.
linear_estimator = tf.compat.v1.estimator.LinearClassifier(
        model_dir=MODEL_DIR, feature_columns=base_columns)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'linear_classifier', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [80]:
# num_epochs=None makes the input function repeat the shuffled training data
# indefinitely, so the length of training is bounded only by steps=1000.
linear_estimator.train(
      input_fn=input_fn(TRAIN_FILE_NAME, num_epochs=None, shuffle=True),
      steps=1000)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 0...
INFO:tensorflow:Saving checkpoints for 0 into linear_classifier\model.ckpt.
INFO:tensorflow:Calling checkpoint listeners after saving checkpoint 0...
INFO:tensorflow:loss = 69.31472, step = 0
INFO:tensorflow:global_step/sec: 246.358
INFO:tensorflow:loss = 41.09048, step = 100 (0.407 sec)
INFO:tensorflow:global_step/sec: 328.944
INFO:tensorflow:loss = 24.36106, step = 200 (0.305 sec)
INFO:tensorflow:global_step/sec: 266.309
INFO:tensorflow:loss = 35.680206, step = 300 (0.378 sec)
INFO:tensorflow:global_step/sec: 354.214
INFO:tensorflow:loss = 31.796843, step = 400 (0.278 sec)
INFO:tensorflow:global_step/sec: 352.421
INFO:tensorflow:loss = 32.704784, step = 500 (0.284 sec)
INFO:ten

<tensorflow_estimator.python.estimator.canned.linear.LinearClassifier at 0x1b71eb21c40>

### Evaluate the test data and predict the income levels of the adults

In [81]:
# Evaluate on a single, unshuffled pass over the test file; steps=None runs
# until the input function is exhausted. `results` is a dict of metric values.
results = linear_estimator.evaluate(
      input_fn=input_fn(TEST_FILE_NAME, num_epochs=1, shuffle=False),
      steps=None)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2020-09-25T12:33:21Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from linear_classifier\model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Inference Time : 4.40401s
INFO:tensorflow:Finished evaluation at 2020-09-25-12:33:26
INFO:tensorflow:Saving dict for global step 1000: accuracy = 0.83717215, accuracy_baseline = 0.76377374, auc = 0.8855766, auc_precision_recall = 0.69465894, average_loss = 0.34973925, global_step = 1000, label/mean = 0.23622628, loss = 34.93316, precision = 0.6909556, prediction/mean = 0.24558088, recall = 0.5621425
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 1000: linear_classifier\model.ckpt-1000


In [82]:
# Report every evaluation metric, one per line, in alphabetical key order.
for metric_name, metric_value in sorted(results.items()):
    print("%s: %s" % (metric_name, metric_value))

accuracy: 0.83717215
accuracy_baseline: 0.76377374
auc: 0.8855766
auc_precision_recall: 0.69465894
average_loss: 0.34973925
global_step: 1000
label/mean: 0.23622628
loss: 34.93316
precision: 0.6909556
prediction/mean: 0.24558088
recall: 0.5621425
