In [1]:
import pandas as pd
import numpy as np
from __future__ import print_function

In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [3]:
df = pd.read_csv('data/adult.csv')
df

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
48838,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
48839,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [4]:
df.income.value_counts()

<=50K    37155
>50K     11687
Name: income, dtype: int64

In [5]:
df["income_label"] = (df["income"].apply(lambda x: ">50K" in x)).astype(int)

#### Define the "Wide" and "Deep" side of columns
Wide Side: wide columns & cross columns
Deep Side: embedding columns & continuous columns

In [6]:
# Wide Side
wide_cols = ['age', 'hours-per-week', 'education', 'relationship', 'workclass', 'occupation', 'native-country', 'gender']
crossed_cols = (['education', 'occupation'], ['native-country', 'occupation'])

# Deep Side
# embeddings_cols is a list of tuple which contains the name of ftr and the dimensions of embedding.
# You need give every embedding ftr col a dimension or None of them such that they will be given a default dimension.
embeddings_cols = [('education', 10), ('relationship', 8), ('workclass', 10), ('occupation', 10), ('native-country', 12)]
continuous_cols = ['age', 'hours-per-week']

target = 'income_label'
method = 'logistic'

#### 演示 utils/data_utils.py 中prepare_data() 的流程

将embedding_col中的dimension和name分离，并将其与连续性特征名称合并，组成deep_cols.

In [7]:
if type(embeddings_cols[0]) is tuple:
    emb_dim = dict(embeddings_cols)
    embeddings_cols = [emb[0] for emb in embeddings_cols]
else:
    emb_dim = {e:888 for e in embeddings_cols}
deep_cols = embeddings_cols+continuous_cols

#### Cross-product for binary ftrs

In [8]:
Y = np.array(df[target])
# We copy the original dataset so we do not mutate it
df_tmp = df.copy()[list(set(wide_cols + deep_cols))]

# Build the crossed columns
crossed_columns = []
for cols in crossed_cols:
    colname = '_'.join(cols)
    df_tmp[colname] = df_tmp[cols].apply(lambda x: '-'.join(x), axis=1)
    crossed_columns.append(colname)

# Extract the categorical column names that can be one hot encoded later
categorical_columns = list(df_tmp.select_dtypes(include=['object']).columns)

In [9]:
categorical_columns

['occupation',
 'native-country',
 'gender',
 'workclass',
 'education',
 'relationship',
 'education_occupation',
 'native-country_occupation']

In [10]:
crossed_columns

['education_occupation', 'native-country_occupation']

In [11]:
df_tmp['education_occupation'].head()

0            11th-Machine-op-inspct
1           HS-grad-Farming-fishing
2        Assoc-acdm-Protective-serv
3    Some-college-Machine-op-inspct
4                    Some-college-?
Name: education_occupation, dtype: object

In [12]:
from utils.data_utils import label_encode

# Encode the dataframe and get the encoding Dictionary only for the
# deep_cols (for the wide_cols is uneccessary)
encoding_dict,df_tmp = label_encode(df_tmp)
encoding_dict = {k:encoding_dict[k] for k in encoding_dict if k in deep_cols}
embeddings_input = []
for k,v in encoding_dict.items():
    embeddings_input.append((k, len(v), emb_dim[k]))

In [13]:
encoding_dict

{'occupation': {'Machine-op-inspct': 0,
  'Farming-fishing': 1,
  'Protective-serv': 2,
  '?': 3,
  'Other-service': 4,
  'Prof-specialty': 5,
  'Craft-repair': 6,
  'Adm-clerical': 7,
  'Exec-managerial': 8,
  'Tech-support': 9,
  'Sales': 10,
  'Priv-house-serv': 11,
  'Transport-moving': 12,
  'Handlers-cleaners': 13,
  'Armed-Forces': 14},
 'native-country': {'United-States': 0,
  '?': 1,
  'Peru': 2,
  'Guatemala': 3,
  'Mexico': 4,
  'Dominican-Republic': 5,
  'Ireland': 6,
  'Germany': 7,
  'Philippines': 8,
  'Thailand': 9,
  'Haiti': 10,
  'El-Salvador': 11,
  'Puerto-Rico': 12,
  'Vietnam': 13,
  'South': 14,
  'Columbia': 15,
  'Japan': 16,
  'India': 17,
  'Cambodia': 18,
  'Poland': 19,
  'Laos': 20,
  'England': 21,
  'Cuba': 22,
  'Taiwan': 23,
  'Italy': 24,
  'Canada': 25,
  'Portugal': 26,
  'China': 27,
  'Nicaragua': 28,
  'Honduras': 29,
  'Iran': 30,
  'Scotland': 31,
  'Jamaica': 32,
  'Ecuador': 33,
  'Yugoslavia': 34,
  'Hungary': 35,
  'Hong': 36,
  'Greece': 

In [14]:
embeddings_input

[('occupation', 15, 10),
 ('native-country', 42, 12),
 ('workclass', 9, 10),
 ('education', 16, 10),
 ('relationship', 6, 8)]

In [None]:
# select the deep_cols and get the column index that will be use later
# to slice the tensors
df_deep = df_tmp[deep_cols]
deep_column_idx = {k:v for v,k in enumerate(df_deep.columns)}

# The continous columns will be concatenated with the embeddings, so you
# probably want to normalize them first
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
for cc in continuous_cols:
    df_deep[cc]  = scaler.fit_transform(df_deep[cc].values.reshape(-1,1))

# select the wide_cols and one-hot encode those that are categorical
df_wide = df_tmp[wide_cols+crossed_columns]
del(df_tmp)
dummy_cols = [c for c in wide_cols+crossed_columns if c in categorical_columns]
df_wide = pd.get_dummies(df_wide, columns=dummy_cols)

In [29]:
df_deep
df_wide

Unnamed: 0,education,relationship,workclass,occupation,native-country,age,hours-per-week
0,0,0,0,0,0,-0.995129,-0.034087
1,1,1,0,1,0,-0.046942,0.772930
2,2,1,1,2,0,-0.776316,-0.034087
3,3,1,0,0,0,0.390683,-0.034087
4,3,0,2,3,0,-1.505691,-0.841104
...,...,...,...,...,...,...,...
48837,2,4,0,9,0,-0.849254,-0.195490
48838,1,1,0,0,0,0.098933,-0.034087
48839,1,3,0,7,0,1.411808,-0.034087
48840,1,0,0,7,0,-1.213941,-1.648120


Unnamed: 0,age,hours-per-week,education_0,education_1,education_2,education_3,education_4,education_5,education_6,education_7,...,native-country_occupation_471,native-country_occupation_472,native-country_occupation_473,native-country_occupation_474,native-country_occupation_475,native-country_occupation_476,native-country_occupation_477,native-country_occupation_478,native-country_occupation_479,native-country_occupation_480
0,25,40,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,38,50,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,28,40,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,44,40,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,18,30,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,38,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
48838,40,40,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
48839,58,40,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
48840,22,20,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Train/Test split and build the output dict

In [30]:
from sklearn.model_selection import train_test_split
from collections import namedtuple

seed = 1981
X_train_deep, X_test_deep = train_test_split(df_deep.values, test_size=0.3, random_state=seed)
X_train_wide, X_test_wide = train_test_split(df_wide.values, test_size=0.3, random_state=seed)
y_train, y_test = train_test_split(Y, test_size=0.3, random_state=seed)

# Building the output dictionary
wd_dataset = dict()
train_dataset = namedtuple('train_dataset', 'wide, deep, labels')
test_dataset  = namedtuple('test_dataset' , 'wide, deep, labels')
wd_dataset['train_dataset'] = train_dataset(X_train_wide, X_train_deep, y_train)
wd_dataset['test_dataset']  = test_dataset(X_test_wide, X_test_deep, y_test)
wd_dataset['embeddings_input']  = embeddings_input
wd_dataset['deep_column_idx'] = deep_column_idx
wd_dataset['encoding_dict'] = encoding_dict