### Getting and preprocessing data

In [1]:
# IPython magic: opens a Qt console attached to this notebook's kernel.
# NOTE(review): interactive debugging aid only -- nothing below depends on it,
# and it needs the qtconsole package plus a display to run.
%qtconsole

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Download the train and test splits of the UCI "adult" data set.
# Neither file is read with a header row, so columns are numbered 0..14 here.
train_set = pd.read_csv(
    'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
    header=None,
)
# The test file needs its first row skipped before parsing.
test_set = pd.read_csv(
    'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test',
    header=None,
    skiprows=1,
)


In [8]:
# Column names for the 15 fields of each record: 14 predictors followed by
# the label column ('wage_class').
col_labels = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education_num',
    'marital_status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital_gain',
    'capital_loss',
    'hours_per_week',
    'native_country',
    'wage_class',
]


In [9]:
# Give both frames the same readable column names.
train_set.columns = test_set.columns = col_labels

#### Check whether there are missing values

In [12]:
# Summarise the training frame: row count, per-column non-null counts and dtypes.
train_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age               32561 non-null int64
workclass         32561 non-null object
fnlwgt            32561 non-null int64
education         32561 non-null object
education_num     32561 non-null int64
marital_status    32561 non-null object
occupation        32561 non-null object
relationship      32561 non-null object
race              32561 non-null object
sex               32561 non-null object
capital_gain      32561 non-null int64
capital_loss      32561 non-null int64
hours_per_week    32561 non-null int64
native_country    32561 non-null object
wage_class        32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [13]:
# Same summary for the test frame: row count, per-column non-null counts and dtypes.
test_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16281 entries, 0 to 16280
Data columns (total 15 columns):
age               16281 non-null int64
workclass         16281 non-null object
fnlwgt            16281 non-null int64
education         16281 non-null object
education_num     16281 non-null int64
marital_status    16281 non-null object
occupation        16281 non-null object
relationship      16281 non-null object
race              16281 non-null object
sex               16281 non-null object
capital_gain      16281 non-null int64
capital_loss      16281 non-null int64
hours_per_week    16281 non-null int64
native_country    16281 non-null object
wage_class        16281 non-null object
dtypes: int64(6), object(9)
memory usage: 1.9+ MB


- this only means there are no `np.nan` values
- according to the documentation, unknown values are represented as ' ?'
- simple test: check the size after dropping all rows that contain ' ?'

In [24]:
# Count the rows that remain once the ' ?' placeholder is treated as missing
# and every row with a missing value is dropped.
# print() is called as a function so the cell runs on Python 3 as well as
# Python 2; with a single parenthesised argument the output is the same on both.
print(train_set.replace(' ?', np.nan).dropna().shape)
print(test_set.replace(' ?', np.nan).dropna().shape)

(30162, 15)
(15060, 15)


In [25]:
# Build the cleaned frames: turn the ' ?' placeholder into NaN, then keep only
# the rows without any missing value. The raw frames are left untouched.
train_nomissing, test_nomissing = (
    raw_frame.replace(' ?', np.nan).dropna()
    for raw_frame in (train_set, test_set)
)

### Remove the trailing '.' from ' <=50K.' and ' >50K.' in test_nomissing.wage_class

In [35]:
# Map the test labels that end in '.' onto the spelling without the period;
# any other value is left unchanged by replace().
label_fixes = {' <=50K.': ' <=50K', ' >50K.': ' >50K'}
test_nomissing['wage_class'] = test_nomissing['wage_class'].replace(label_fixes)

### How to treat categorical variables?
- theoretically, one-hot encoding should be used
- in practice, for tree algorithms ordinal encoding is often sufficient (**use `pd.Categorical().codes`**)

In [37]:
# Stack the cleaned train rows on top of the cleaned test rows (row-wise concat,
# the default axis) so both parts can be encoded together.
combined_set = pd.concat([train_nomissing, test_nomissing])

In [38]:
# Ordinal-encode every object-dtype column: each distinct value is replaced by
# the integer code pd.Categorical assigns to it. Numeric columns are untouched.
object_columns = [name for name in combined_set.columns
                  if combined_set[name].dtype == 'object']
for name in object_columns:
    combined_set[name] = pd.Categorical(combined_set[name]).codes

In [42]:
# Split the encoded frame back into its train and test parts by row position
# (the train rows were concatenated first, so they occupy the first n_train rows).
# .iloc makes the positional slicing explicit, and .copy() makes each part
# independent of combined_set, avoiding pandas' view-versus-copy ambiguity
# (SettingWithCopyWarning) when later cells modify these frames in place.
n_train = train_nomissing.shape[0]
final_train = combined_set.iloc[:n_train].copy()
final_test = combined_set.iloc[n_train:].copy()

### Initial Model Set up and Grid Search

In [43]:
# Detach the label column from each frame. pop() removes the column in place,
# so afterwards final_train / final_test contain only the predictor columns.
# NOTE: because of that in-place removal this cell is not meant to be run twice.
target_column = 'wage_class'
y_train = final_train.pop(target_column)
y_test = final_test.pop(target_column)

In [46]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV 

In [48]:
# Hyper-parameters explored by the grid search (3 x 3 = 9 combinations).
cv_params = {
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],
}

# Hyper-parameters held fixed for every candidate model.
ind_params = {
    'learning_rate': 0.1,
    'n_estimators': 1000,
    'seed': 0,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'binary:logistic',
}

In [49]:
# Wrap an XGBoost classifier (fixed settings taken from ind_params) in a grid
# search over cv_params, scored by accuracy with 5-fold cross-validation.
# n_jobs=-1 asks scikit-learn to run the fits in parallel.
base_classifier = xgb.XGBClassifier(**ind_params)
optimized_GBM = GridSearchCV(estimator=base_classifier,
                             param_grid=cv_params,
                             scoring='accuracy',
                             cv=5,
                             n_jobs=-1)

#### Run grid search with 5-fold cross-validation

In [50]:
# Run the grid search on the training predictors and labels.
# With the 3 x 3 grid in cv_params and cv=5 this is at least 45 model fits,
# each with n_estimators=1000, so the cell is slow; the recorded output below
# shows the run ending in a KeyboardInterrupt.
optimized_GBM.fit(final_train, y_train)

KeyboardInterrupt: 