In [187]:
import pandas as pd
import numpy as np

from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2
from scipy.stats import pearsonr
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

In [162]:
# Path of the CSV file to load, relative to the notebook's working directory
input_file = 'adult.data.txt'

In [163]:
# Column labels in file order; the file is read with header=None,
# so the labels have to be supplied here
names = [
    'Age',
    'Work-Class',
    'fnlwgt',
    'Education',
    'Education-Num',
    'Marital-Status',
    'Occupation',
    'Relationship',
    'Race',
    'Sex',
    'Capital-Gain',
    'Capital-Loss',
    'Hours-Per-Week',
    'Native-Country',
    'Earnings-Raw',
]
# Read the raw file into a DataFrame labelled with the names above
adult = pd.read_csv(input_file, names=names, header=None)

In [164]:
# Preview the first five rows
adult.head()

Unnamed: 0,Age,Work-Class,fnlwgt,Education,Education-Num,Marital-Status,Occupation,Relationship,Race,Sex,Capital-Gain,Capital-Loss,Hours-Per-Week,Native-Country,Earnings-Raw
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [165]:
# Drop only the rows in which every column is missing (how='all');
# rows that still hold at least one value are kept
adult.dropna(how='all', inplace=True) # inplace modifies `adult` itself instead of returning a new DataFrame

In [166]:
# Row and column counts of the DataFrame, reported in a sentence
rows = len(adult.index)
columns = len(adult.columns)
print('There are {} rows and {} columns'.format(rows, columns))
# Also display the raw shape tuple as the cell output
adult.shape

There are 32561 rows and 15 columns


(32561, 15)

In [167]:
# List the column labels of the DataFrame
adult.columns

Index(['Age', 'Work-Class', 'fnlwgt', 'Education', 'Education-Num',
       'Marital-Status', 'Occupation', 'Relationship', 'Race', 'Sex',
       'Capital-Gain', 'Capital-Loss', 'Hours-Per-Week', 'Native-Country',
       'Earnings-Raw'],
      dtype='object')

In [168]:
# Summary statistics (count, mean, std, min, quartiles, max) of the Hours-Per-Week column
adult['Hours-Per-Week'].describe()

count    32561.000000
mean        40.437456
std         12.347429
min          1.000000
25%         40.000000
50%         40.000000
75%         45.000000
max         99.000000
Name: Hours-Per-Week, dtype: float64

In [169]:
# Median of the Education-Num column
adult['Education-Num'].median()

10.0

In [170]:
# List the distinct values of the Work-Class column
# (as the output shows, the raw values carry a leading space and use ' ?' for unknown)
adult['Work-Class'].unique()

array([' State-gov', ' Self-emp-not-inc', ' Private', ' Federal-gov',
       ' Local-gov', ' ?', ' Self-emp-inc', ' Without-pay', ' Never-worked'], dtype=object)

In [171]:
# Convert numerical features to categorical features through a process called discretization

In [172]:
# Discretize weekly hours into a boolean flag: True when strictly above 40
adult['LongHours'] = adult['Hours-Per-Week'].gt(40)
# Show the first rows, now including the new LongHours column
adult.head()

Unnamed: 0,Age,Work-Class,fnlwgt,Education,Education-Num,Marital-Status,Occupation,Relationship,Race,Sex,Capital-Gain,Capital-Loss,Hours-Per-Week,Native-Country,Earnings-Raw,LongHours
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,False
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,False
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,False
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,False
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,False


In [173]:
# Numeric columns used as the candidate feature set
selected_columns = [
    'Age',
    'Education-Num',
    'Capital-Gain',
    'Capital-Loss',
    'Hours-Per-Week',
]
# Take those columns as a plain NumPy array (one row per sample)
X = adult.loc[:, selected_columns].values
X

array([[   39,    13,  2174,     0,    40],
       [   50,    13,     0,     0,    13],
       [   38,     9,     0,     0,    40],
       ..., 
       [   58,     9,     0,     0,    40],
       [   22,     9,     0,     0,    20],
       [   52,     9, 15024,     0,    40]])

In [176]:
# Boolean target: True where the raw earnings label equals ' >50K'
# (the leading space inside the literal is part of the comparison)
y = adult['Earnings-Raw'].eq(' >50K').values
y

array([False, False, False, ..., False, False,  True], dtype=bool)

In [180]:
# Keep the three columns of X with the highest chi-squared score against y
transformer = SelectKBest(chi2, k=3)
Xt_chi2 = transformer.fit(X, y).transform(X)
# Per-column chi-squared scores, in the order of selected_columns
transformer.scores_

array([  8.60061182e+03,   2.40142178e+03,   8.21924671e+07,
         1.37214589e+06,   6.47640900e+03])

In [181]:
# Best features are the first, third and fourth columns
# These correspond to `Age`, `Capital-Gain` and `Capital-Loss`

In [182]:
def multivariante_pearsonr(X, y):
    """Score every column of ``X`` by its absolute Pearson correlation with ``y``.

    The ``(scores, pvalues)`` return value is the form scikit-learn's
    ``SelectKBest`` accepts from a ``score_func``.

    Parameters
    ----------
    X : array-like, 2-D
        One row per sample, one column per feature.
    y : array-like, 1-D
        Target values, one per row of ``X``.

    Returns
    -------
    scores : numpy.ndarray
        Absolute Pearson correlation coefficient of each column with ``y``.
    pvalues : numpy.ndarray
        The p-value reported by ``pearsonr`` for each column.

    Notes
    -----
    Pearson's r is undefined when either input is constant, and ``pearsonr``
    then returns NaN together with a warning. Such columns are given a score
    of 0.0 and a p-value of 1.0 instead, so the returned arrays stay finite
    and a constant feature simply ranks last.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    n_samples, n_features = X.shape[0], X.shape[1]

    # Defaults describe "no measurable correlation"; they are overwritten
    # for every column on which pearsonr is actually evaluated.
    scores = np.zeros(n_features)
    pvalues = np.ones(n_features)

    # Only short-circuit well-formed input; anything else (too few samples,
    # mismatched lengths) is passed to pearsonr so it raises as before.
    well_formed = n_samples >= 2 and y.ndim == 1 and y.shape[0] == n_samples
    y_is_constant = well_formed and bool(np.all(y == y[0]))

    for column in range(n_features):
        feature = X[:, column]
        feature_is_constant = well_formed and bool(np.all(feature == feature[0]))
        if y_is_constant or feature_is_constant:
            continue
        cur_score, cur_p = pearsonr(feature, y)
        scores[column] = abs(cur_score)
        pvalues[column] = cur_p
    return (scores, pvalues)

In [185]:
# Keep the three columns of X with the highest absolute Pearson correlation with y
transformer = SelectKBest(multivariante_pearsonr, k=3)
Xt_pearson = transformer.fit(X, y).transform(X)
# Per-column scores, in the order of selected_columns
print(transformer.scores_)

[ 0.2340371   0.33515395  0.22332882  0.15052631  0.22968907]


In [188]:
# The Pearson-based scoring function selects a different set of features
# Best features are the first, second and fifth columns
# These correspond to `Age`, `Education-Num` and `Hours-Per-Week`

In [189]:
# Evaluate both selected feature sets with the same seeded decision tree.
# cv is pinned to 3 folds: the default fold count differs between scikit-learn
# releases (3 before 0.22, 5 from 0.22 on), and the recorded outputs of this
# notebook contain three scores per feature set.
clf = DecisionTreeClassifier(random_state=14)
scores_chi2 = cross_val_score(clf, Xt_chi2, y, scoring='accuracy', cv=3)
scores_pearson = cross_val_score(clf, Xt_pearson, y, scoring='accuracy', cv=3)

In [191]:
# Per-fold accuracy using the features chosen by the Pearson scorer
scores_pearson

array([ 0.76930164,  0.7694859 ,  0.77315028])

In [192]:
# Per-fold accuracy using the features chosen by the chi-squared scorer
scores_chi2

array([ 0.82577851,  0.82992445,  0.83009306])