In [118]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import os

In [119]:
# The Adult census file is expected at <parent of the working directory>/data/adult.data.
path = os.path.abspath('..')
data_path = os.path.join(path, 'data', 'adult.data')
# Column labels are supplied explicitly through `names=`.
names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation',
         'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'target']
# read_csv is the comma-separated reader; it replaces read_table(sep=',') and parses identically.
# Fields keep their leading space (the target cell compares against ' <=50K'),
# so skipinitialspace is deliberately left off.
data = pd.read_csv(data_path, names=names)

  """


In [120]:
# Preview the first rows of the freshly loaded frame.
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,target
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [121]:
# Feature engineering
# target: 0 for ' <=50K' (leading space included), 1 for every other value
data['target'] = (data['target'] != ' <=50K').astype('int64')

# Bin edges per numeric column; pd.cut replaces each value by the ordinal label
# (0 .. number_of_bins - 1) of the bucket it falls into.
bin_edges = {
    'age': [-np.inf, 18, 25, 35, 45, 50, np.inf],
    'education-num': [-np.inf, 5, 10, 20, 40, np.inf],
    'hours-per-week': [-np.inf, 10, 30, 40, 70, np.inf],
}
for col_name, bins in bin_edges.items():
    labels = list(range(len(bins) - 1))
    data[col_name] = pd.cut(data[col_name], bins=bins, labels=labels)

In [122]:
continuous_cols = ['fnlwgt', 'capital-gain', 'capital-loss']
# Columns that the feature-engineering cell already turned into ordinal bucket labels.
binned_cols = ['age', 'hours-per-week', 'education-num']
# Everything else (target included) is treated as categorical.
cat_columns = [col for col in data.columns if col not in continuous_cols + binned_cols]

from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Replace each categorical column by integer codes, one encoder per column.
for cat_col in cat_columns:
    le = LabelEncoder().fit(data[cat_col])
    data[cat_col] = le.transform(data[cat_col])

# Rescale each continuous column on its own with a fresh MinMaxScaler.
for num_col in continuous_cols:
    mms = MinMaxScaler()
    column_2d = data[[num_col]].values  # shape (n_rows, 1), as the scaler expects
    data[num_col] = mms.fit_transform(column_2d).ravel()



In [123]:
# Preview the frame after binning, label encoding and min-max scaling.
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,target
0,3,7,0.044302,9,2,4,1,1,4,1,0.02174,0.0,2,39,0
1,4,6,0.048238,9,2,2,4,0,4,1,0.0,0.0,1,39,0
2,3,4,0.138113,11,1,0,6,1,4,1,0.0,0.0,2,39,0
3,5,4,0.151068,1,1,2,6,0,2,1,0.0,0.0,2,39,0
4,2,4,0.221488,9,2,2,10,5,2,0,0.0,0.0,2,5,0


In [124]:
# List the column labels of the processed frame.
data.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'target'],
      dtype='object')

In [125]:
# sparse and cross feature
from sklearn.preprocessing import PolynomialFeatures
wide_columns = ['age','workclass', 'education', 'education-num','occupation', 'relationship', 
                'hours-per-week','native-country', 'marital-status', 'sex']
cross_columns = [['occupation', 'sex'], ['occupation', 'education'], ['education', 'native-country'],
                 ['age', 'occupation'], ['age', 'hours-per-week'], ['sex', 'education']]
data_wide = data[wide_columns]

# For every column pair, expand the two columns with PolynomialFeatures (default settings)
# and name the resulting terms "<first>_<second>_<term index>".
cross_frames = []
for first, second in cross_columns:
    terms = PolynomialFeatures().fit_transform(data_wide[[first, second]])
    term_names = ['{}_{}_{}'.format(first, second, i) for i in range(terms.shape[1])]
    cross_frames.append(pd.DataFrame(terms, columns=term_names))
# Append all cross-term frames to the right of the base wide columns in one go.
data_wide = pd.concat([data_wide] + cross_frames, axis=1)

# onehot: cast the base wide columns to strings so get_dummies expands them
data_wide = data_wide.astype({col: 'str' for col in wide_columns})
data_wide = pd.get_dummies(data_wide)
data_target = data['target']

In [126]:
# Build the embedding dict
deep_columns = ['workclass', 'occupation', 'native-country', 'race', 'fnlwgt', 'capital-gain', 'capital-loss']
data_deep = data[deep_columns]
embedding_columns = ['workclass', 'occupation', 'native-country', 'race']
# column name -> (number of distinct values in that column, embedding dimension 8)
embedding_columns_dict = {
    name: (len(data_deep[name].unique()), 8)
    for name in deep_columns
    if name in embedding_columns
}
# column name -> position of that column inside data_deep
deep_columns_idx = {name: position for position, name in enumerate(data_deep.columns)}

In [204]:
# train test split
from sklearn.model_selection import train_test_split
# One call splits all three inputs with the same shuffle, so wide rows, deep rows and
# targets stay aligned by construction instead of relying on three separate calls
# that happen to share a seed. 40% of the rows go to the test side.
(train_wide, test_wide,
 train_deep, test_deep,
 train_target, test_target) = train_test_split(
    data_wide, data_deep, data_target, test_size=0.4, random_state=999)

In [129]:
import torch
import torch.nn as nn
import  torch.nn.functional as F

def linear(inp, out, dropout):
    """Build a Linear -> LeakyReLU -> Dropout stack.

    :param inp: int, number of input features of the linear layer
    :param out: int, number of output features of the linear layer
    :param dropout: float, probability handed to nn.Dropout
    :return: nn.Sequential holding the three layers in that order
    """
    stages = [nn.Linear(inp, out), nn.LeakyReLU(), nn.Dropout(dropout)]
    return nn.Sequential(*stages)

class DeepModel(nn.Module):
    """Deep tower: embeds the categorical columns, appends the remaining columns as-is
    and passes the concatenation through a stack of hidden layers plus a final linear layer."""

    def __init__(self, deep_columns_idx, embedding_columns_dict, hidden_layers, dropouts, output_dim):
        """
        :param deep_columns_idx: dict of column name -> column index in the input tensor
        :param embedding_columns_dict: dict of categorical column name -> (number of unique
                                values, embedding dimension)  e.g. {'age':(10, 32)}
        :param hidden_layers: list of int, output width of each hidden layer
        :param dropouts: list of float, dropout of every hidden layer after the first (the first
                         hidden layer always gets 0.0); it needs at least len(hidden_layers) - 1
                         entries, otherwise indexing it raises IndexError
        :param output_dim: int, width of the final linear layer
        :raises ValueError: if an embedding column has no entry in deep_columns_idx
        """
        super(DeepModel, self).__init__()
        # Fail at construction time rather than on the first forward pass.
        missing = [col for col in embedding_columns_dict if col not in deep_columns_idx]
        if missing:
            raise ValueError("embedding_columns_dict does not match deep_columns_idx: "
                             "columns {} have no index".format(missing))
        self.embedding_columns_dict = embedding_columns_dict
        self.deep_columns_idx = deep_columns_idx
        # One embedding table per categorical column, registered as 'dense_col_<name>'.
        for key, val in embedding_columns_dict.items():
            setattr(self, 'dense_col_'+key, nn.Embedding(val[0], val[1]))
        # Width of the concatenated input: embedding dimension for embedded columns, 1 otherwise.
        embedding_layer = sum(
            embedding_columns_dict[col][1] if col in embedding_columns_dict else 1
            for col in self.deep_columns_idx
        )
        self.layers = nn.Sequential()
        hidden_layers = [embedding_layer] + hidden_layers
        dropouts = [0.0] + dropouts
        for i in range(1, len(hidden_layers)):
            self.layers.add_module(
                'hidden_layer_{}'.format(i-1),
                linear(hidden_layers[i-1], hidden_layers[i], dropouts[i-1])
            )
        self.layers.add_module('last_linear', nn.Linear(hidden_layers[-1], output_dim))

    def forward(self, x):
        """
        :param x: 2-D tensor whose columns are laid out as described by deep_columns_idx
        :return: output of the final linear layer
        """
        pieces = []
        # Embedded columns first, in embedding_columns_dict order; values are cast to long indices.
        for col in self.embedding_columns_dict:
            idx = self.deep_columns_idx[col]
            pieces.append(getattr(self, 'dense_col_'+col)(x[:, idx].long()))
        # Then every non-embedded column as a single-feature column, in deep_columns_idx order.
        for col, idx in self.deep_columns_idx.items():
            if col not in self.embedding_columns_dict:
                pieces.append(x[:, idx].reshape(-1, 1))
        return self.layers(torch.cat(pieces, dim=1))

In [130]:
class WideModel(nn.Module):
    """Wide tower: a single block built by the ``linear`` helper."""

    def __init__(self, input_dim, output_dim, dropout=0):
        """
        :param input_dim: int the dimension of wide model input
        :param output_dim: int the dimension of wide model output
        :param dropout: dropout probability forwarded to the ``linear`` helper
        """
        super().__init__()
        self.linear = linear(input_dim, output_dim, dropout)

    def forward(self, x):
        """Apply the block to ``x`` and return its output."""
        return self.linear(x)

In [131]:
class WideDeep(nn.Module):
    """Wide & Deep model: adds the outputs of a WideModel and a DeepModel and
    optionally applies an output activation."""

    def __init__(self, wide_model_params,
                 deep_model_params, activation):
        """
        :param wide_model_params: dict with keys 'wide_input_dim' and 'wide_output_dim'
        :param deep_model_params: dict with keys 'deep_columns_idx', 'embedding_columns_dict',
                                  'hidden_layers', 'dropouts' and 'deep_output_dim'
        :param activation: None, 'sigmoid' or 'softmax'
        """
        super(WideDeep, self).__init__()
        self.activation = self.set_activation(activation)

        # wide model
        self.wide = WideModel(wide_model_params['wide_input_dim'],
                              wide_model_params['wide_output_dim'])

        # deep model
        self.deep = DeepModel(deep_columns_idx=deep_model_params['deep_columns_idx'],
                              embedding_columns_dict=deep_model_params['embedding_columns_dict'],
                              hidden_layers=deep_model_params['hidden_layers'],
                              dropouts=deep_model_params['dropouts'],
                              output_dim=deep_model_params['deep_output_dim'])

    def set_activation(self, activation):
        """
        Map an activation name to the callable applied to the summed output.
        :param activation: None, 'sigmoid' or 'softmax' (anything else fails the assertion)
        :return: torch.sigmoid, F.softmax or None
        """
        assert activation in [None, 'sigmoid', 'softmax']
        if activation == 'sigmoid':     # for binary classification
            # torch.sigmoid is the same function as the legacy alias F.sigmoid.
            return torch.sigmoid
        elif activation == 'softmax':   # for multiple classification
            return F.softmax
        else:
            return None

    def forward(self, x):
        """
        input and forward
        :param x: tuple(wide_model_data, deep_model_data)
        :return: activated sum of both towers, or the raw sum when no activation is set
        """
        wide_out = self.wide(x[0])
        deep_out = self.deep(x[1])

        # Both towers must produce the same shape before they are added.
        assert wide_out.size() == deep_out.size()
        wide_deep = wide_out.add(deep_out)
        if self.activation is None:
            return wide_deep
        if self.activation is F.softmax:
            # softmax needs the class dimension spelled out
            return self.activation(wide_deep, dim=1)
        return self.activation(wide_deep)

In [225]:
from sklearn.metrics import accuracy_score
def valid_epoch(model, valid_loader, epoch):
    """Evaluate the model on a validation loader.

    :param model: module called with the tuple (data_wide, data_deep)
    :param valid_loader: iterable yielding (data_wide, data_deep, target) batches
    :param epoch: current epoch index (not used by the computation)
    :return: (accuracy over all samples, mean of the per-batch losses)
    """
    model.eval()
    losses = []
    targets = []
    outs = []
    # Evaluation needs no gradients; skipping autograd bookkeeping saves memory and time.
    with torch.no_grad():
        for data_wide, data_deep, target in valid_loader:
            out = model((data_wide, data_deep))
            losses.append(criterion(target, out).item())
            targets.extend(target.numpy())
            # Threshold the model output at 0.5 to obtain hard 0/1 predictions.
            scores = out.view(-1).detach().numpy()
            outs.extend((scores > 0.5).astype(np.int64))
    met = accuracy_score(targets, outs)
    return met, sum(losses)/len(losses)


def train_epoch(model, train_loader, test_loader,optimizer, epoch, validation=True, log_every=400):
    """Run one training pass over train_loader, optionally followed by validation.

    :param model: module called with the tuple (data_wide, data_deep)
    :param train_loader: iterable yielding (data_wide, data_deep, target) batches
    :param test_loader: loader handed to valid_epoch when validation is enabled
    :param optimizer: object exposing zero_grad() and step()
    :param epoch: 0-based epoch index; printed 1-based
    :param validation: whether to evaluate on test_loader after the pass
    :param log_every: print the batch loss every this many iterations (0/None disables)
    """
    model.train()
    for idx, (data_wide, data_deep, target) in enumerate(train_loader):
        optimizer.zero_grad()
        out = model((data_wide, data_deep))
        loss = criterion(target, out)
        loss.backward()
        optimizer.step()

        if log_every and (idx+1) % log_every == 0:
            print("Epoch %d iteration %d loss is %.4f" %(epoch+1, idx+1, loss.item()))

    if validation:
        met, valid_loss = valid_epoch(model, test_loader, epoch)
        # Report the epoch 1-based, matching the training log line above.
        print("Epoch %d validation loss is %.4f and validation metrics is %.4f" %(epoch+1, valid_loss, met))


def train(model, train_loader, test_loader, epochs, optimzers, validation):
    """Call train_epoch once per epoch, passing epoch indices 0 .. epochs - 1.

    :param model: model to train
    :param train_loader: loader forwarded to train_epoch for training
    :param test_loader: loader forwarded to train_epoch for validation
    :param epochs: number of epochs to run
    :param optimzers: optimizer forwarded to train_epoch
    :param validation: validation flag forwarded to train_epoch
    """
    for epoch_idx in range(epochs):
        train_epoch(model, train_loader, test_loader, optimzers, epoch_idx, validation)

        
# On saving models: saving the whole model together with its parameters makes loading slow when the
# model is large and is unsuitable for online prediction; for online prediction one usually saves
# only the parameters, i.e. model.state_dict.
# To resume training later, the optimizer and the current epoch must be saved as well; without the
# epoch every run still starts again from epoch=0.
# For online prediction, define the network structure first, then load the parameters and predict.
def save_model(model, path):
    """Write only the model's ``state_dict`` to ``path`` with ``torch.save``."""
    weights = model.state_dict()
    torch.save(weights, path)

# The `model` argument is a network whose structure has already been defined.
def load_model(model, path, map_location=None):
    """Load a saved ``state_dict`` from ``path`` into ``model`` in place.

    :param model: already-constructed network matching the saved parameters
    :param path: file written by ``save_model``
    :param map_location: forwarded to ``torch.load``; pass e.g. 'cpu' to load a checkpoint
                         saved on another device. None keeps torch.load's default behaviour.
    """
    state = torch.load(path, map_location=map_location)
    model.load_state_dict(state)

In [226]:
from torch.utils.data import Dataset, DataLoader
class trainset(Dataset):
    """Dataset over three parallel, indexable containers: wide inputs, deep inputs and targets."""

    def __init__(self, data):
        """
        :param data: indexable whose items 0, 1 and 2 are the wide data, deep data and target
        """
        self.wide_data = data[0]
        self.deep_data = data[1]
        self.target = data[2]

    def __getitem__(self, index):
        """Return the (wide, deep, target) triple stored at ``index``."""
        parts = (self.wide_data, self.deep_data, self.target)
        return tuple(part[index] for part in parts)

    def __len__(self):
        """Number of samples, taken from the target container."""
        return len(self.target)

In [227]:
class MultipleOptimizer():
    """Drive several optimizers as if they were one."""

    def __init__(self, opts):
        """
        :param opts: iterable of objects exposing zero_grad() and step()
        """
        self.optimizers = opts

    def zero_grad(self):
        """Call zero_grad() on every wrapped optimizer, in order."""
        self._broadcast('zero_grad')

    def step(self):
        """Call step() on every wrapped optimizer, in order."""
        self._broadcast('step')

    def _broadcast(self, method_name):
        # Invoke the named no-argument method on each wrapped optimizer.
        for wrapped in self.optimizers:
            getattr(wrapped, method_name)()

In [228]:
def _as_float_tensor(frame):
    """Convert a DataFrame/Series to a float32 tensor.

    The explicit cast matters: when get_dummies produces bool indicator columns next to
    float columns, ``.values`` is an object array that torch.Tensor() cannot convert.
    """
    return torch.from_numpy(frame.values.astype(np.float32))

# (wide inputs, deep inputs, targets) for the training and the test side.
x = (_as_float_tensor(train_wide), _as_float_tensor(train_deep), _as_float_tensor(train_target))
train_data = trainset(x)
x1 = (_as_float_tensor(test_wide), _as_float_tensor(test_deep), _as_float_tensor(test_target))
test_data = trainset(x1)
# Shuffle only the training batches; keep the test order fixed.
trainloader = DataLoader(train_data, batch_size=32, shuffle=True)
testloader = DataLoader(test_data, batch_size=32, shuffle=False)

In [229]:
# Settings consumed by WideDeep for its deep tower: column layout, embedding sizes,
# hidden layer widths, dropouts for the hidden layers after the first, and output width.
deep_model_params = dict(
    deep_columns_idx=deep_columns_idx,
    embedding_columns_dict=embedding_columns_dict,
    hidden_layers=[64, 32, 16],
    dropouts=[0.5, 0.5],
    deep_output_dim=1,
)
# Settings for the wide tower: one input per column of the one-hot/cross frame.
wide_model_params = dict(
    wide_input_dim=data_wide.shape[1],
    wide_output_dim=1,
)
# Output activation name understood by WideDeep.set_activation.
activation = 'sigmoid'

In [230]:
# Instantiate the combined model from the parameter dicts and activation name defined above.
widedeep = WideDeep(wide_model_params, deep_model_params, activation)

In [231]:
def criterion(target, out):
    """Binary cross-entropy between model outputs and targets.

    Note the argument order: target first, model output second.

    :param target: tensor of labels; reshaped to a single column before the comparison
    :param out: tensor of model outputs, passed to F.binary_cross_entropy as the input
    :return: the loss tensor returned by F.binary_cross_entropy
    """
    target_column = target.view(-1, 1)
    return F.binary_cross_entropy(input=out, target=target_column)

In [234]:
# A single Adam optimizer (learning rate 0.01) over every parameter of the combined model.
optimizer = torch.optim.Adam(params=widedeep.parameters(), lr=0.01)
# Train for 20 epochs, validating on the test loader after each one.
train(model=widedeep, train_loader=trainloader, test_loader=testloader,
      epochs=20, optimzers=optimizer, validation=True)

Epoch 1 iteration 400 loss is 0.5278
Epoch 0 validation loss is 0.3735 and validation metrics is 0.8210
Epoch 2 iteration 400 loss is 0.4419
Epoch 1 validation loss is 0.3707 and validation metrics is 0.8369
Epoch 3 iteration 400 loss is 0.5276
Epoch 2 validation loss is 0.3675 and validation metrics is 0.8383
Epoch 4 iteration 400 loss is 0.3358
Epoch 3 validation loss is 0.3595 and validation metrics is 0.8364
Epoch 5 iteration 400 loss is 0.3180
Epoch 4 validation loss is 0.3474 and validation metrics is 0.8447
Epoch 6 iteration 400 loss is 0.3447
Epoch 5 validation loss is 0.3497 and validation metrics is 0.8428
Epoch 7 iteration 400 loss is 0.2919
Epoch 6 validation loss is 0.3510 and validation metrics is 0.8385
Epoch 8 iteration 400 loss is 0.3935
Epoch 7 validation loss is 0.3489 and validation metrics is 0.8431
Epoch 9 iteration 400 loss is 0.2405
Epoch 8 validation loss is 0.3497 and validation metrics is 0.8479
Epoch 10 iteration 400 loss is 0.2913
Epoch 9 validation loss is

In [233]:
# Row positions whose encoded target equals 1 (i.e. the original value was not ' <=50K').
np.where(data['target'].values==1)

(array([    7,     8,     9, ..., 32554, 32557, 32560]),)

In [19]:
# race
# NOTE(review): the recorded output lists the raw string labels, so this cell appears to have
# been run before the label-encoding cell — confirm the intended execution order.
data['race'].unique()

array([' White', ' Black', ' Asian-Pac-Islander', ' Amer-Indian-Eskimo',
       ' Other'], dtype=object)

In [21]:
# relationship
# NOTE(review): the recorded output lists the raw string labels, so this cell appears to have
# been run before the label-encoding cell — confirm the intended execution order.
data['relationship'].unique()

array([' Not-in-family', ' Husband', ' Wife', ' Own-child', ' Unmarried',
       ' Other-relative'], dtype=object)

In [20]:
# sex
# NOTE(review): the recorded output lists the raw string labels, so this cell appears to have
# been run before the label-encoding cell — confirm the intended execution order.
data['sex'].unique()

array([' Male', ' Female'], dtype=object)