# Adult Census Income Binary Classification dataset

A subset of the 1994 Census database, using working adults over the age of 16 with an adjusted income index of > 100.

<b>Usage:</b> Classify people using demographics to predict whether a person earns over 50K a year.

Related Research: Kohavi, R., Becker, B., (1996). UCI Machine Learning Repository http://archive.ics.uci.edu/ml. Irvine, CA: University of California, School of Information and Computer Science

# Step 0: Importing our initial libraries

In [1]:
%pylab inline
import pandas as pd

Populating the interactive namespace from numpy and matplotlib


# Step 1 & 2: Loading and cleaning the data

In [2]:
#from sklearn.preprocessing import OneHotEncoder

# Continuous (numeric) columns of the census data; every other feature is categorical.
exc_col = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']

def read_and_clean(path, exc_col=exc_col):
    """Load the census CSV at ``path`` and return it one-hot encoded.

    Fields are split on a comma together with any spaces around it, so a
    value written as ``, Private`` is read as ``'Private'``.

    The ``'?'`` placeholder is replaced by ``0`` and therefore ends up as a
    category of its own (for example the ``workclass_0`` column); rows that
    still contain NaN afterwards are dropped. All string-valued feature
    columns are expanded with ``pd.get_dummies`` and the ``income`` label is
    encoded as ``0`` for ``'<=50K'`` and ``1`` for ``'>50K'``.

    Parameters
    ----------
    path : str
        Location of the CSV file; it must contain an ``income`` column.
    exc_col : list of str, optional
        Names of the continuous columns. Not used by the current
        implementation; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        The encoded features followed by the numeric ``income`` column.
    """
    data = pd.read_csv(path, delimiter=' *, *', engine='python')
    data = data.replace({'?': 0}).dropna()

    # Use the keyword form: the positional ``axis`` argument of ``drop``
    # was removed in pandas 2.0.
    dummied_data = pd.get_dummies(data.drop(columns='income'))
    # Only the label column still holds these strings, so encode just it.
    dummied_data['income'] = data['income'].replace({'<=50K': 0, '>50K': 1})

    return dummied_data

# Load and encode the census file, then preview the first five rows.
data = read_and_clean(path='Adult Census Income Binary Classification dataset.csv')
data.head(n=5)

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_0,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,...,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia,income
0,39,77516,13,2174,0,40,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,38,215646,9,0,0,40,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,53,234721,7,0,0,40,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,28,338409,13,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Step 3: Feature Scaling

## Standardization (z-score normalization)

In [3]:
def do_feature_scaling(col, data):
    """Standardize one column of ``data`` in place (z-score).

    The column is replaced by ``(x - mean) / std``, using ``np.mean`` and
    ``np.std`` of the column. A column without any spread (``std == 0``)
    is only centered, so it becomes all zeros instead of NaN.

    Parameters
    ----------
    col : str
        Name of the column to rescale.
    data : pandas.DataFrame
        Frame holding the column; it is modified in place.

    Returns
    -------
    None
    """
    column = data[col]
    centered = column - np.mean(column)
    _std = np.std(column)

    # Dividing by a zero standard deviation would turn the whole column
    # into NaN, so a constant column is left centered.
    data[col] = centered / _std if _std != 0 else centered

# Standardize each continuous column; the 0/1 indicator columns are left as they are.
numeric_columns = ['age', 'fnlwgt', 'education-num',
                   'capital-gain', 'capital-loss', 'hours-per-week']
for column in numeric_columns:
    do_feature_scaling(column, data)


In [4]:
from sklearn.decomposition import PCA

SPLIT = 7500  # number of rows sampled from each income class

# Reduce the encoded features to 20 principal components. random_state keeps
# the projection repeatable if scikit-learn selects its randomized SVD solver.
pca = PCA(n_components=20, random_state=0)
# Keyword form: the positional ``axis`` argument of ``drop`` was removed in pandas 2.0.
pca_out = pca.fit_transform(data.drop(columns='income'))
# Reuse the original index so the labels line up row by row.
data2 = pd.DataFrame(pca_out, index=data.index)
data2['income'] = data['income']

# Build a balanced set: SPLIT rows of each class, positives first.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
data2 = pd.concat([
    data2[data2['income'] == 1].sample(SPLIT, random_state=0),
    data2[data2['income'] == 0].sample(SPLIT, random_state=0),
])

In [7]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA

# Shared settings for the SVM grid search.
SEED, RESULTS = 0, {}
# Candidate values for the SVC penalty C: 100, 200, ..., 1000.
c = 100 * np.arange(1, 11)
# Candidate values for the kernel coefficient gamma.
g = [0.1, 0.01, 0.001, 0.0001, 0.00001]
# Note from an earlier run: best C=1000, gamma=0.001
# (the output recorded in this notebook reports a different optimum).
def run_svm(label, data, random_state):
    """Grid-search an SVM classifier and report its held-out performance.

    The frame is split 80/20 (stratified on the label). A 5-fold
    ``GridSearchCV`` over the poly, sigmoid, rbf and linear kernels is fitted
    on the training part, using the module-level grids ``c`` (C values) and
    ``g`` (gamma values). The best parameters and a classification report
    for the test part are printed.

    Parameters
    ----------
    label : str
        Name of the target column in ``data``.
    data : pandas.DataFrame
        Feature columns plus the target column.
    random_state : int
        Seed used for both the train/test split and the SVC estimator.

    Returns
    -------
    sklearn.model_selection.GridSearchCV
        The fitted search object.
    """
    tuned_parameters = [
        {'kernel': ['poly'], 'degree': [2, 3, 4], 'gamma': g, 'C': c},
        {'kernel': ['sigmoid'], 'gamma': g, 'C': c},
        {'kernel': ['rbf'], 'gamma': g, 'C': c},
        # The linear kernel is by far the slowest to fit, hence the tiny grid.
        {'kernel': ['linear'], 'C': [1, 100]},
    ]

    # Keyword form: the positional ``axis`` argument of ``drop`` was removed in pandas 2.0.
    X = data.drop(columns=label)
    y = data[label]
    (X_train, X_test,
     y_train, y_test) = train_test_split(X, y, random_state=random_state,
                                         test_size=.20, stratify=y)

    clf = GridSearchCV(svm.SVC(random_state=random_state), tuned_parameters, cv=5)
    clf.fit(X_train, y_train)
    print(clf.best_params_)

    # Score the refitted best estimator on the held-out 20 %.
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))

    return clf

# Fit the grid search on the balanced, PCA-reduced data.
clf = run_svm(label='income', data=data2, random_state=SEED)

{'kernel': 'rbf', 'C': 900, 'gamma': 0.01}
             precision    recall  f1-score   support

          0       0.87      0.76      0.81      1500
          1       0.78      0.88      0.83      1500

avg / total       0.83      0.82      0.82      3000



 * Training was very slow, even with just 500 samples, when PCA and feature scaling were not applied.
 * PCA: the choice of `n_components` affects the final score.
 * Categorical data should not be encoded as integers; it should be one-hot encoded into 0/1 indicator columns.
 * The linear kernel took a very long time to fit.
 * The highest F1 score we found was 0.84.

In [7]:
# Show every hyper-parameter combination the grid search evaluated.
# (A truncated leftover expression, `clf.cv_re`, was removed: it raised
# AttributeError and prevented the cell from displaying its result.)
clf.cv_results_['params']

({'C': 1, 'degree': 2, 'gamma': 0.001, 'kernel': 'poly'},
 {'C': 1, 'degree': 2, 'gamma': 0.0001, 'kernel': 'poly'},
 {'C': 1, 'degree': 2, 'gamma': 1e-05, 'kernel': 'poly'},
 {'C': 1, 'degree': 3, 'gamma': 0.001, 'kernel': 'poly'},
 {'C': 1, 'degree': 3, 'gamma': 0.0001, 'kernel': 'poly'},
 {'C': 1, 'degree': 3, 'gamma': 1e-05, 'kernel': 'poly'},
 {'C': 1, 'degree': 4, 'gamma': 0.001, 'kernel': 'poly'},
 {'C': 1, 'degree': 4, 'gamma': 0.0001, 'kernel': 'poly'},
 {'C': 1, 'degree': 4, 'gamma': 1e-05, 'kernel': 'poly'},
 {'C': 10, 'degree': 2, 'gamma': 0.001, 'kernel': 'poly'},
 {'C': 10, 'degree': 2, 'gamma': 0.0001, 'kernel': 'poly'},
 {'C': 10, 'degree': 2, 'gamma': 1e-05, 'kernel': 'poly'},
 {'C': 10, 'degree': 3, 'gamma': 0.001, 'kernel': 'poly'},
 {'C': 10, 'degree': 3, 'gamma': 0.0001, 'kernel': 'poly'},
 {'C': 10, 'degree': 3, 'gamma': 1e-05, 'kernel': 'poly'},
 {'C': 10, 'degree': 4, 'gamma': 0.001, 'kernel': 'poly'},
 {'C': 10, 'degree': 4, 'gamma': 0.0001, 'kernel': 'poly'},


In [8]:
import pickle

def save_obj(obj, name ):
    """Pickle ``obj`` into the file ``name + '.pkl'``.

    The file is written in binary mode with the highest pickle protocol
    available and overwrites any existing file of that name.
    """
    target = name + '.pkl'
    with open(target, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    """Return the object pickled in the file ``name + '.pkl'``.

    Only load files you trust: unpickling can execute arbitrary code.
    """
    source = name + '.pkl'
    with open(source, 'rb') as handle:
        restored = pickle.load(handle)
    return restored

# Persist the fitted grid search to 'data.pkl'.
save_obj(obj=clf, name='data')