# Deep and Cross Network
## Pytorch implementation based on the paper arxiv.org/abs/1708.05123

In [1]:
import yaml
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import torch.optim as optim
from torch import distributions
torch.set_num_threads(4)
import torch.onnx
import numpy as np
from sklearn.metrics import roc_auc_score
import os
import time
from torchvision.transforms import *
from tqdm import tqdm, tqdm_notebook, tnrange
import matplotlib.pyplot as plt

SEED = 123
np.random.seed(SEED)
torch.manual_seed(SEED)

%matplotlib inline

In [2]:
torch.cuda.is_available()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("using %s"%device)
print("number of cpu threads: %d"%torch.get_num_threads())

using cpu
number of cpu threads: 4


In [3]:
with open("config.yml", 'r') as ymlfile:
    cfg = yaml.load(ymlfile)
    filename = cfg['filename']
    x_col_cat = cfg['features']['categorical']
    x_col_bin = cfg['features']['binary']
    x_col_num = cfg['features']['numerical']
    x_col_cyc = cfg['features']['cyclical']
    y_col = cfg['features']['label']
    w_col = cfg['features']['weight']

In [4]:
nrows=1e7
df = pd.read_csv(filename, nrows=nrows)
print("dataset size: %s"%str(df.shape))
print("ctr = %.5f"%((df[y_col]*df[w_col]).sum()*1./df[w_col].sum()))

dataset size: (602733, 30)
ctr = 0.00952


In [5]:
def merge_columns(arr):
    arr = arr.astype(str)
    vals = (arr[:,0].astype(object)+'_'+arr[:,1].astype(object))
    for i in range(2, arr.shape[1]):
        vals = (vals+'_'+arr[:,i].astype(object))
    return vals

In [6]:
cat_features = []
for f in x_col_cat:
    if(isinstance(f, str)==False):
        df[list(f.values())[0]] = df[list(f.values())[0]].fillna('unknown')
        df[list(f.keys())[0]] = merge_columns(df[list(f.values())[0]].values)
        cat_features.append(list(f.keys())[0])
    else:
        df[f] = df[f].fillna('unknown')
        cat_features.append(f)
print(cat_features)

['geo_state', 'geo_country', 'geo_city', 'zip_code', 'category_1', 'domain', 'app_bundle', 'publisher_id', 'device_type', 'device_os', 'device_os_version', 'device_browser', 'device_browser_version', 'carrier', 'language', 'exchange_id', 'banner_type', 'account_campaign', 'account_banner', 'account_advertiser', 'account_offer']


In [7]:
assert df[cat_features].isnull().sum().sum()==0

In [8]:
binary_features = []
for f in x_col_bin:
    uniques = df[f].unique()
    d = {uniques[0]:0, uniques[1]:1}
    df[f] = df[f].map(d)
    binary_features.append(f)
    print(f, d)
print(binary_features)

is_interstitial {'No': 0, 'Yes': 1}
is_mobile_web_optimized {'No': 0, 'Yes': 1}
is_app {'Yes': 0, 'No': 1}
['is_interstitial', 'is_mobile_web_optimized', 'is_app']


In [9]:
cyc_features = []
for f in x_col_cyc:
    nuniques = df[f].nunique()
    df[f+'_sin'] = np.sin(df[f]*(2.*np.pi/nuniques))
    df[f+'_cos'] = np.cos(df[f]*(2.*np.pi/nuniques))
    cyc_features.append(f+'_sin')
    cyc_features.append(f+'_cos')
    print(f, nuniques)
print(cyc_features)

user_day_of_week 7
user_hour 24
['user_day_of_week_sin', 'user_day_of_week_cos', 'user_hour_sin', 'user_hour_cos']


### Encoding

In [10]:
def binary_encode(s, modulo, required_bits=8):
    num = hash(s) % modulo
    return np.array(list(np.binary_repr(num).zfill(required_bits))).astype(np.int8)

def hashing_trick(s, modulo):
    num = hash(s) % modulo
    arr = np.zeros((modulo)).astype(np.int8)
    arr[num] = 1
    return arr

def get_sparse_index(s, modulo):
    return hash(s) % modulo

#### Combine columns

In [11]:
df = df[cat_features + binary_features + cyc_features + [y_col] + [w_col]]

Here we represent categorical features as sparse index. In production system this encoding can be done by maintaining a map of unique values per feature. A Bloom filter can be used to track each value on the fly by checking whether a specific value exists and update the counters accordingly

In [12]:
cat_cols = {}
column_instance_dict = {}
modulos = []
for col in tqdm(cat_features):
    modulo = df[col].nunique()
    cat_cols[col] = modulo
    modulos.append(modulo)
    column_instance_dict[col] = dict(zip(df[col].unique(), range(len(df[col].unique()))))
    df[col] = df[col].map(column_instance_dict[col])
for c in cat_features:
    print(c, cat_cols[c])

100%|██████████| 21/21 [00:02<00:00,  7.84it/s]

geo_state 642
geo_country 13
geo_city 5412
zip_code 9629
category_1 58
domain 4990
app_bundle 3570
publisher_id 2614
device_type 6
device_os 11
device_os_version 140
device_browser 27
device_browser_version 893
carrier 664
language 78
exchange_id 7
banner_type 3
account_campaign 20
account_banner 60
account_advertiser 8
account_offer 11





In [13]:
embed_dict = {}
for c in cat_cols:
    if(cat_cols[c]<=10):
        embed_dict[c] = 3
    elif(cat_cols[c]<=100):
        embed_dict[c] = 5
    else:
        embed_dict[c] = 10
embed_dict

{'geo_state': 10,
 'geo_country': 5,
 'geo_city': 10,
 'zip_code': 10,
 'category_1': 5,
 'domain': 10,
 'app_bundle': 10,
 'publisher_id': 10,
 'device_type': 3,
 'device_os': 5,
 'device_os_version': 10,
 'device_browser': 5,
 'device_browser_version': 10,
 'carrier': 10,
 'language': 5,
 'exchange_id': 3,
 'banner_type': 3,
 'account_campaign': 5,
 'account_banner': 5,
 'account_advertiser': 3,
 'account_offer': 5}

In [14]:
x_cols = [c for c in df.columns if (c != 'clicks') and (c != 'impressions')]
print("number of x columns: %d"%(len(x_cols)))

number of x columns: 28


In [15]:
x_sparse = df[x_cols[:len(cat_features)]].values
x_dense = df[x_cols[len(cat_features):]].values
x = np.concatenate([x_sparse, x_dense], axis=1)
y, w = np.array(df[y_col]), np.array(df[w_col])
print("x_dense: %s x_sparse: %s y: %s w: %s"%(x_dense.shape, x_sparse.shape, y.shape, w.shape))
del df

x_dense: (602733, 7) x_sparse: (602733, 21) y: (602733,) w: (602733,)


### Prepare data for pytorch and split

In [16]:
x = torch.FloatTensor(x)
y = torch.FloatTensor(y)
w = torch.FloatTensor(w)

In [17]:
n, m = x.shape
randvec = torch.randperm(n)
x_train, x_val = torch.split(x[randvec],(n*3)//5, 0)
y_train, y_val = torch.split(y[randvec],(n*3)//5, 0)
w_train, w_val = torch.split(w[randvec],(n*3)//5, 0)

n_val = x_val.shape[0]
randvec_val = torch.randperm(n_val)
x_val, x_test = torch.split(x_val[randvec_val],n_val//2, 0)
y_val, y_test = torch.split(y_val[randvec_val],n_val//2, 0)
w_val, w_test = torch.split(w_val[randvec_val],n_val//2, 0)

print("num obs train %d\nnum obs val %d\nnum obs test %d"%(x_train.shape[0], x_val.shape[0], x_test.shape[0]))

num obs train 361639
num obs val 120547
num obs test 120547


## Build Model

### Define Hyperparameters

In [18]:
m_embed = 10
units = 100
sparse_col = torch.LongTensor(list(range(0, len(cat_features))))
dense_col = torch.LongTensor(list(range(len(cat_features), m)))
m_sparse = sparse_col.size()[0]
m_dense = dense_col.size()[0]
print("num of sparse cols: %d\nnum of dense cols: %d"%(m_sparse, m_dense))

num of sparse cols: 21
num of dense cols: 7


### Define Network layers

In [72]:
class LambdaLayer(nn.Module):
    def __init__(self, lambd):
        super(LambdaLayer, self).__init__()
        self.lambd = lambd
    def forward(self, x, y):
        return self.lambd(x, y)
    
class fullDCN(nn.Module):
    def __init__(self, embedding_sizes, num_deep_layers, num_cross_layers):
        super().__init__()
        self.embedding_layers = nn.ModuleList()
        self.num_dls = num_deep_layers
        self.num_cls = num_cross_layers
        for embed_size in embedding_sizes:
            self.embedding_layers.append(nn.Embedding(num_embeddings=embed_size, embedding_dim=m_embed))
        cross_layer_size = m_dense+(m_embed*m_sparse)
        
         #deep network layers
        self.fc1 = nn.Linear(cross_layer_size, units, bias=True)
        self.fc_bn = nn.BatchNorm1d(units)
        self.fc2 = nn.Linear(units, units, bias=True)
        self.dropout = nn.Dropout(p=0.5)
        
        #cross network layers
        self.cross_layer = nn.Linear(cross_layer_size, cross_layer_size, bias=True)
        self.cross_layer_bn = nn.BatchNorm1d(cross_layer_size)
        
        #combination layer
        self.CombLayer = nn.Linear(units+cross_layer_size, 2, bias=False) 
        self.relu1 = nn.ReLU()
        self.relu0 = nn.ReLU()
        
        #output
        self.output = LambdaLayer(lambda x, y: x / (x + y))

    def forward(self, x):
        x_sparse = x[:,sparse_col].long()
        x_dense = x[:,dense_col]
        for sparse_feature in range(m_sparse):
            new_embed = self.embedding_layers[sparse_feature](x_sparse[:, sparse_feature])
            if(sparse_feature==0):
                x_embed = new_embed
            else:
                x_embed = torch.cat([x_embed, new_embed], dim=1)
        x0 = torch.cat([x_embed, x_dense], dim=1)
        
        x_dl = F.relu(self.dropout(self.fc_bn(self.fc1(x0))))
        for _ in range(self.num_dls):
            x_dl = self.dropout(self.fc_bn(self.fc2(x_dl)))
        
        x_cl = x0 * self.cross_layer(x0) + x0
        for _ in range(1, self.num_cls):
            x_cl = x0 * self.cross_layer(x_cl) + x_cl
        
        x_concat = torch.cat([x_dl, x_cl], dim=1)
        x_comb = self.CombLayer(x_concat)
        consentration1 = self.relu1(x_comb[:, 0]) + 1.0
        consentration0 = self.relu0(x_comb[:, 1]) + 1.0
        beta = torch.distributions.Beta(consentration1, consentration0)
        print(list(zip(consentration1.data.numpy()[inds], consentration0.data.numpy()[inds])))
        return beta.mean, beta.variance

### Define Model parameters

In [73]:
DCNnet = fullDCN(embedding_sizes=modulos, num_deep_layers=10, num_cross_layers=10).to(device)
print(DCNnet)

fullDCN(
  (embedding_layers): ModuleList(
    (0): Embedding(642, 10)
    (1): Embedding(13, 10)
    (2): Embedding(5412, 10)
    (3): Embedding(9629, 10)
    (4): Embedding(58, 10)
    (5): Embedding(4990, 10)
    (6): Embedding(3570, 10)
    (7): Embedding(2614, 10)
    (8): Embedding(6, 10)
    (9): Embedding(11, 10)
    (10): Embedding(140, 10)
    (11): Embedding(27, 10)
    (12): Embedding(893, 10)
    (13): Embedding(664, 10)
    (14): Embedding(78, 10)
    (15): Embedding(7, 10)
    (16): Embedding(3, 10)
    (17): Embedding(20, 10)
    (18): Embedding(60, 10)
    (19): Embedding(8, 10)
    (20): Embedding(11, 10)
  )
  (fc1): Linear(in_features=217, out_features=100, bias=True)
  (fc_bn): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc2): Linear(in_features=100, out_features=100, bias=True)
  (dropout): Dropout(p=0.5)
  (cross_layer): Linear(in_features=217, out_features=217, bias=True)
  (cross_layer_bn): BatchNorm1d(217, eps=1e-05, mom

In [74]:
criterion = nn.BCELoss()
optimizer = optim.Adam(DCNnet.parameters(), lr=.01, weight_decay=0.01)
# optimizer = torch.optim.SGD(DCNnet.parameters(), lr=0.01, momentum=1.2)
epochs = 100
batch_size = 2**17
num_batches = int(len(y_train)//batch_size)
print("Model parameters:")
print("criterion:\n%s"%str(criterion))
print("optimizer:\n%s"%str(optimizer))
print("epochs: %d"%epochs)
print("batch_size: %d"%batch_size)
print("num_batches: %d"%num_batches)

Model parameters:
criterion:
BCELoss()
optimizer:
Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.01
    weight_decay: 0.01
)
epochs: 100
batch_size: 131072
num_batches: 2


## Run the model

In [75]:
train_loss = 0
val_loss = 0
scale = 10e5
mean_val_loss, val_auc, val_cal = [], [], []
print("start training")
for t in range(epochs):
    epochrandvec=torch.randperm(len(y_train))
    x_train = x_train[epochrandvec,:]
    y_train = y_train[epochrandvec]
    w_train = w_train[epochrandvec]
    for i in range(num_batches):
        start_time = time.time()
        indvec = torch.LongTensor(list(range((i)*batch_size, (i)*batch_size+batch_size)))
        x_batch = x_train[indvec]
        y_batch = y_train[indvec]
        criterion.weight = w_train[indvec]
        
        x_sparse = x_batch[:,sparse_col]
        x_dense = x_batch[:,dense_col]

        #initialize
        optimizer.zero_grad()
        
        # forward + backward + optimize
        train_means, trains_variances = DCNnet(x_batch)
        loss = criterion(train_means.squeeze(), y_batch)
        train_loss = scale*loss.item()
#         print(w_train[indvec].data.numpy()[1000:1005])
        
        #Update Network
        loss.backward()
        optimizer.step()

        #evaluation metrics
        with torch.no_grad():
            val_means, val_variances = DCNnet(x_val)
            criterion.weight = w_val
            print(w_val.data.numpy()[inds], '\n')
            val_loss += scale*criterion(val_means.squeeze(), y_val).item()
            mean_val_loss.append(val_loss/((i+1+t*num_batches)*x_val.shape[0]))
            val_auc.append(roc_auc_score(y_val.data.numpy(), val_means.data.numpy(), sample_weight=w_val))
            val_cal.append((val_means.squeeze().mul(w_val).sum() / y_val.mul(w_val).sum()).item())
        
        end_time = time.time()
        elapsed = end_time - start_time
        print("e: %2d/%2d, b: %2d/%2d, tr_loss=%.3f, val_loss=%.3f, auc=%.3f, cal=%.3f, %.2fs"%
              (t+1, epochs, i+1, num_batches, 
              train_loss/(batch_size*(i+1+t*num_batches)), mean_val_loss[-1], 
              val_auc[-1], val_cal[-1], elapsed))
    
print('Finished Training')

start training
[(7.384731, 1.0), (1.0, 11.224273), (1.0, 41.083767), (15.246966, 16.133774), (1.0, 35.50566), (1.0, 10.479702), (1.0, 14.179443), (1.0, 1.0), (33.121113, 61.21363), (31.378407, 10.507473)]
[(1.0, 124.084595), (1.0, 37.319965), (1.0, 157.60117), (1.0, 91.51368), (1.0, 88.88562), (1.0, 570.7906), (1.0, 156.11742), (1.0, 342.62518), (1.0, 79.03148), (1.0, 62.66011)]
[11. 14. 14. 22. 13. 14. 11. 28. 12. 15.] 

e:  1/100, b:  1/ 2, tr_loss=8.640, val_loss=1.072, auc=0.565, cal=5.074, 18.17s
[(1.0, 81.09891), (1.0, 55.5766), (1.0, 1.0), (1.0, 50.647568), (1.0, 31.902142), (1.0, 88.77649), (1.0, 197.05617), (1.0, 69.01075), (1.0, 58.184498), (1.0, 38.35004)]
[(1.0, 751.497), (1.0, 241.83987), (1.0, 512.72015), (1.0, 197.31166), (1.0, 612.2565), (1.0, 1538.6105), (1.0, 1015.39874), (1.0, 926.97076), (1.0, 257.8253), (1.0, 410.7271)]
[11. 14. 14. 22. 13. 14. 11. 28. 12. 15.] 

e:  1/100, b:  2/ 2, tr_loss=0.480, val_loss=0.846, auc=0.611, cal=0.588, 20.38s
[(1.0, 217.323), (1.0,

KeyboardInterrupt: 

In [70]:
inds = list(np.where(w_val>10))[0][:10]
inds

array([ 141,  352,  802, 1025, 1066, 1173, 1433, 1554, 1588, 1946])

## Calculate AUC and calibration metrics

In [None]:
for data in [[x_train, y_train, w_train, 'train'], [x_val, y_val, w_val, 'val'], [x_test, y_test, w_test, 'test']]:
    probs = DCNnet(data[0]).squeeze()
    sample_weight = data[2].data.numpy()
    print(data[3]+':')
    print("number of observations = %d"%(len(data[1].data.numpy())))
    print("\tAUC = %.4f"%(roc_auc_score(data[1].data.numpy(), probs.data.numpy(), sample_weight=sample_weight)))
    print("\tCalibration = %.4f"%((probs.mul(data[2]).sum()/data[1].mul(data[2]).sum()).item()))
    print("\tctrpm = %.2f"%(1000*data[1].mul(data[2]).sum()/data[1].mul(data[2]).shape[0]))

In [None]:
plt.figure(figsize=(12,12))
plt.subplot(2,2,1)
plt.plot(mean_val_loss, label='validation loss', marker='o')
plt.legend()
plt.subplot(2,2,2)
# plt.plot(probs.data.numpy(), y_val.data.numpy(), 'o')
v = torch.cat([probs.reshape((probs.shape[0],1)),y_test.reshape((probs.shape[0],1))], dim=1)
v, _ = torch.sort(v, dim=0, descending=True)
q = probs.shape[0]//3
p_mean = [v[l:(l+q)].mean() for l in range(0, probs.shape[0], q)]
y_test_mean = [v[l:(l+q),1].mean() for l in range(0, probs.shape[0], q)]
plt.plot(p_mean, y_test_mean, 'og', label='qq: label vs p');

plt.subplot(2,2,3)
plt.plot(val_auc, 'o-', label='auc')
plt.legend()
plt.subplot(2,2,4)
plt.plot(val_cal, 'o-', label='calibration')
plt.legend();

In [None]:
from torchviz import make_dot, make_dot_from_trace

In [None]:
with torch.onnx.set_training(model, False):
    trace, _ = torch.jit.get_trace_graph(model, args=(x_test,))
make_dot_from_trace(trace)

## Export model with onnx

In [143]:
from clipper_admin import ClipperConnection, DockerContainerManager
from clipper_admin.deployers.pytorch import deploy_pytorch_model

In [147]:
clipper_conn = ClipperConnection(DockerContainerManager())

# Connect to an already-running Clipper cluster
clipper_conn.connect()
model1 = nn.Linear(1, 1)

# Define a shift function to normalize prediction inputs
def predict(model, inputs):
    pred = model(shift(inputs))
    pred = pred.data.numpy()
    return [str(x) for x in pred]

deploy_pytorch_model(
    clipper_conn,
    name="example",
    version=1,
    input_type="doubles",
    func=predict,
    pytorch_model=model1)

18-08-25:15:26:58 INFO     [clipper_admin.py:138] Successfully connected to Clipper cluster at localhost:1337
18-08-25:15:26:58 INFO     [deployer_utils.py:44] Saving function to /tmp/clipper/tmp7hyrx534
18-08-25:15:26:58 INFO     [deployer_utils.py:54] Serialized and supplied predict function
18-08-25:15:26:58 INFO     [pytorch.py:204] Torch model saved
18-08-25:15:26:58 INFO     [pytorch.py:217] Using Python 3.6 base image
18-08-25:15:26:58 INFO     [clipper_admin.py:452] Building model Docker image with model data from /tmp/clipper/tmp7hyrx534


ClipperException: Error saving torch model: ('Connection aborted.', PermissionError(13, 'Permission denied'))

In [151]:
from tensorboardX import SummaryWriter
from torch.onnx import export, _export

# writer = SummaryWriter(comment='testing')
# writer.add_graph(DCNnet, , verbose=False)
_export(DCNnet, x_test, 'dcnet.onnx', verbose=False,)

RuntimeError: /pytorch/torch/csrc/jit/tracer.h:143: getTracingState: Assertion `var_state == state` failed.

In [149]:
print(DCNnet)

fullDCN(
  (embedding_layers): ModuleList(
    (0): Embedding(642, 10)
    (1): Embedding(13, 10)
    (2): Embedding(5412, 10)
    (3): Embedding(9629, 10)
    (4): Embedding(58, 10)
    (5): Embedding(4990, 10)
    (6): Embedding(3570, 10)
    (7): Embedding(2614, 10)
    (8): Embedding(6, 10)
    (9): Embedding(11, 10)
    (10): Embedding(140, 10)
    (11): Embedding(27, 10)
    (12): Embedding(893, 10)
    (13): Embedding(664, 10)
    (14): Embedding(78, 10)
    (15): Embedding(7, 10)
    (16): Embedding(3, 10)
    (17): Embedding(20, 10)
    (18): Embedding(60, 10)
    (19): Embedding(8, 10)
    (20): Embedding(11, 10)
  )
  (fc1): Linear(in_features=217, out_features=100, bias=True)
  (fc_bn): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc2): Linear(in_features=100, out_features=100, bias=True)
  (dropout): Dropout(p=0.5)
  (cross_layer): Linear(in_features=217, out_features=217, bias=True)
  (cross_layer_bn): BatchNorm1d(217, eps=1e-05, mom