In [1]:
import numpy as np
import pandas as pd
from data_reader import FeatureDictionary, DataParser

In [2]:
train_file = '../Dataset/adult.data'
test_file = '../Dataset/adult.test'

CATEGORICAL_FEATURE_KEYS = [
    'workclass',
    'education',
    'education_num',
    'marital_status',
    'occupation',
    'relationship',
    'race',
    'gender',
    'native_country',
]
NUMERIC_FEATURE_KEYS = [
    'age',
    'capital_gain',
    'capital_loss',
    'hours_per_week',
    'fnlwgt',
]

LABEL_KEY = 'label'

feature_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education_num',
    'marital_status', 'occupation', 'relationship', 'race', 'gender',
    'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
    'label'
]

def read_data(data_file, sep='\t'):
    df = pd.read_csv(data_file, sep=sep, header=None, encoding='utf-8')
    return df

df = read_data(train_file, sep=',')
df.columns = feature_names
label_map = {'<=50K': 0, '>50K': 1}
df['label'] = df['label'].map(label_map)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [3]:
fd = FeatureDictionary(dfTrain=df, dfTest=df, numeric_cols=NUMERIC_FEATURE_KEYS, ignore_cols=[LABEL_KEY])

In [4]:
fd.feat_dim

123

In [5]:
len(set(df[CATEGORICAL_FEATURE_KEYS].values.flatten()))

116

In [6]:
parser = DataParser(fd)
Xi, Xv, y = parser.parse(df=df, has_label=True)

In [15]:
len(Xi), len(y)
Xi[:10]

[[0, 1, 10, 11, 27, 43, 50, 65, 71, 76, 78, 79, 80, 81],
 [0, 2, 10, 11, 27, 44, 51, 66, 71, 76, 78, 79, 80, 81],
 [0, 3, 10, 12, 28, 45, 52, 65, 71, 76, 78, 79, 80, 81],
 [0, 3, 10, 13, 29, 44, 52, 66, 72, 76, 78, 79, 80, 81],
 [0, 3, 10, 11, 27, 44, 53, 67, 72, 77, 78, 79, 80, 82],
 [0, 3, 10, 14, 30, 44, 51, 67, 71, 77, 78, 79, 80, 81],
 [0, 3, 10, 15, 31, 46, 54, 65, 72, 77, 78, 79, 80, 83],
 [0, 2, 10, 12, 28, 44, 51, 66, 71, 76, 78, 79, 80, 81],
 [0, 3, 10, 14, 30, 43, 53, 65, 71, 77, 78, 79, 80, 81],
 [0, 3, 10, 11, 27, 44, 51, 66, 71, 76, 78, 79, 80, 81]]

In [9]:
import tensorflow as tf

  return f(*args, **kwds)


In [11]:
dataset = tf.data.Dataset.from_tensor_slices((Xi, Xv, y))
dataset = dataset.batch(3).repeat(1)
next_element = dataset.make_one_shot_iterator().get_next()

In [12]:
a, b, c = next_element

In [13]:
a

<tf.Tensor 'IteratorGetNext:0' shape=(?, 14) dtype=int32>

In [14]:
with tf.Session() as sess:
    print(sess.run(a))

[[ 0  1 10 11 27 43 50 65 71 76 78 79 80 81]
 [ 0  2 10 11 27 44 51 66 71 76 78 79 80 81]
 [ 0  3 10 12 28 45 52 65 71 76 78 79 80 81]]


In [19]:
with tf.variable_scope('emb', reuse=tf.AUTO_REUSE):
    embeddings = tf.get_variable('embeddings',
                                 [fd.feat_dim, 5],
                                 initializer=tf.random_normal_initializer(0.0, 0.1),
                                 dtype=tf.float32,)

tf.nn.embedding_lookup(embeddings, a)

<tf.Tensor 'embedding_lookup:0' shape=(?, 14, 5) dtype=float32>