### Using the built model

#### 0.Prepare the data

In [1]:
from __future__ import print_function
import pandas as pd
import numpy as np

df = pd.read_csv('data/adult.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


#### 1.Logistic regression with varying embedding dimensions, no dropout and Adam optimizer.

In [2]:
# define a target for logistic regression:
df['income_label'] = (df['income'].apply(lambda x: ">50k" in x)).astype(int)

# experiment set up
wide_cols = ['age','hours-per-week','education', 'relationship','workclass',
             'occupation','native-country','gender']
crossed_cols = (['education', 'occupation'], ['native-country', 'occupation'])
embeddings_cols = [('education',10), ('relationship',8), ('workclass',10),
                   ('occupation',10),('native-country',10)]
continuous_cols = ["age","hours-per-week"]
target = 'income_label'
method = 'logistic'

In [3]:
from utils.data_utils import prepare_data

wd_dataset = prepare_data(df, wide_cols, crossed_cols, embeddings_cols, continuous_cols, target, scale=True)

In [4]:
# build model
# Network set up
wide_dim = wd_dataset['train_dataset'].wide.shape[1]
n_class=1 # for logistic and regression
deep_column_idx = wd_dataset['deep_column_idx']
embeddings_input= wd_dataset['embeddings_input']
encoding_dict   = wd_dataset['encoding_dict']
hidden_layers = [100,50]
dropout = None

# Build the model. Again you just need to call WideDeep
from utils.torch_model import WideDeep
model = WideDeep(wide_dim,embeddings_input,continuous_cols,deep_column_idx,hidden_layers, dropout, encoding_dict,n_class)

# I have included a compile method if you want to change the fitting method or the optimizer
model.compile(method=method, optimizer="Adam")

In [5]:
print(model)

WideDeep(
  (emb_layer_relationship): Embedding(6, 8)
  (emb_layer_occupation): Embedding(15, 10)
  (emb_layer_native-country): Embedding(42, 10)
  (emb_layer_workclass): Embedding(9, 10)
  (emb_layer_education): Embedding(16, 10)
  (linear_1): Linear(in_features=50, out_features=100, bias=True)
  (linear_2): Linear(in_features=100, out_features=50, bias=True)
  (output): Linear(in_features=848, out_features=1, bias=True)
)


In [6]:
# fit and embedding
train_dataset = wd_dataset['train_dataset']
test_dataset  = wd_dataset['test_dataset']

# As your usual Sklearn model, simply call fit/predict
model.fit(dataset=train_dataset, n_epochs=10, batch_size=64)
pred = model.predict(dataset=test_dataset)

from sklearn.metrics import accuracy_score
print(accuracy_score(pred, test_dataset.labels))

Epoch 1 of 10, Loss: 0.0, accuracy: 1.0
Epoch 2 of 10, Loss: 0.0, accuracy: 1.0
Epoch 3 of 10, Loss: 0.0, accuracy: 1.0
Epoch 4 of 10, Loss: 0.0, accuracy: 1.0
Epoch 5 of 10, Loss: 0.0, accuracy: 1.0
Epoch 6 of 10, Loss: 0.0, accuracy: 1.0
Epoch 7 of 10, Loss: 0.0, accuracy: 1.0
Epoch 8 of 10, Loss: 0.0, accuracy: 1.0
Epoch 9 of 10, Loss: 0.0, accuracy: 1.0
Epoch 10 of 10, Loss: 0.0, accuracy: 1.0
1.0


In [7]:
model.get_embeddings('education')

{'11th': array([ 0.54169065,  0.49417815, -0.05153178, -0.62309986, -1.1947852 ,
         0.18700363, -0.04341362, -0.33709738,  1.3387694 , -0.1855408 ],
       dtype=float32),
 'HS-grad': array([-0.24023673,  0.35751298,  0.5571414 , -0.8167463 ,  0.06322116,
         0.8080706 ,  0.30061978, -0.43353292,  0.4794336 ,  0.61343956],
       dtype=float32),
 'Assoc-acdm': array([ 0.17981398, -0.50020593,  1.0978745 , -0.18372947, -1.0036201 ,
        -0.14446326,  0.5564964 ,  0.34890598,  1.4261824 , -2.2461843 ],
       dtype=float32),
 'Some-college': array([-0.6781258 ,  0.19341134, -0.7988352 ,  0.6347257 , -2.088572  ,
         1.8912214 , -3.1393185 , -0.5383918 , -0.1377146 ,  1.4895263 ],
       dtype=float32),
 '10th': array([-0.36800134, -0.4938993 ,  1.024859  ,  0.09277789, -0.9246003 ,
        -0.85090435, -1.4173918 ,  0.5473981 , -0.6436034 , -0.6557604 ],
       dtype=float32),
 'Prof-school': array([ 0.55081147,  0.9792929 ,  0.6659547 , -0.87960064,  0.7151308 ,
     

### 2.Multiclass classification with fixed embedding dimensions(10), varying dropout and RMSProp

In [8]:
# Let's define age groups
age_groups = [0, 25, 50, 90]
age_labels = range(len(age_groups) - 1)
df['age_group'] = pd.cut(df['age'], age_groups, labels=age_labels)

# Set the experiment
wide_cols = ['hours-per-week','education', 'relationship','workclass',
             'occupation','native-country','gender']
crossed_cols = (['education', 'occupation'], ['native-country', 'occupation'])
embeddings_cols = ['education', 'relationship','workclass','occupation','native-country']
continuous_cols = ["hours-per-week"]
target = 'age_group'
method = 'multiclass'

wd_dataset = prepare_data(df,wide_cols,crossed_cols,embeddings_cols,continuous_cols,target,scale=True,def_dim=10)

wide_dim = wd_dataset['train_dataset'].wide.shape[1]
n_class=3
deep_column_idx = wd_dataset['deep_column_idx']
embeddings_input= wd_dataset['embeddings_input']
encoding_dict   = wd_dataset['encoding_dict']
hidden_layers = [100,50]
dropout = [0.5, 0.2]

model = WideDeep(wide_dim,embeddings_input,continuous_cols,deep_column_idx,hidden_layers,dropout,encoding_dict,n_class)
model.compile(method=method, optimizer="RMSprop")

# Let's have a look to the model
print(model)

WideDeep(
  (emb_layer_relationship): Embedding(6, 10)
  (emb_layer_workclass): Embedding(9, 10)
  (emb_layer_native-country): Embedding(42, 10)
  (emb_layer_occupation): Embedding(15, 10)
  (emb_layer_education): Embedding(16, 10)
  (linear_1): Linear(in_features=51, out_features=100, bias=True)
  (linear_1_drop): Dropout(p=0.5, inplace=False)
  (linear_2): Linear(in_features=100, out_features=50, bias=True)
  (linear_2_drop): Dropout(p=0.2, inplace=False)
  (output): Linear(in_features=847, out_features=3, bias=True)
)


In [9]:
train_dataset = wd_dataset['train_dataset']
model.fit(dataset=train_dataset, n_epochs=10, batch_size=64)
test_dataset  = wd_dataset['test_dataset']

# The model object also has a predict_proba method in case you want probabilities instead of class
pred = model.predict_proba(test_dataset)
print('\n {}'.format(pred))

  out = self.activation(self.output(wide_deep_input))


Epoch 1 of 10, Loss: 0.887, accuracy: 0.6785
Epoch 2 of 10, Loss: 0.91, accuracy: 0.6875
Epoch 3 of 10, Loss: 0.848, accuracy: 0.6903
Epoch 4 of 10, Loss: 0.797, accuracy: 0.6927
Epoch 5 of 10, Loss: 1.133, accuracy: 0.6944
Epoch 6 of 10, Loss: 1.072, accuracy: 0.6974
Epoch 7 of 10, Loss: 0.972, accuracy: 0.6976
Epoch 8 of 10, Loss: 0.721, accuracy: 0.7002
Epoch 9 of 10, Loss: 0.718, accuracy: 0.6993
Epoch 10 of 10, Loss: 0.633, accuracy: 0.6998

 [[2.9131240e-01 7.0733297e-01 1.3546958e-03]
 [4.1371289e-01 5.8624840e-01 3.8669281e-05]
 [9.9740285e-01 2.5710945e-03 2.6083995e-05]
 ...
 [7.5831962e-01 2.1754092e-01 2.4139451e-02]
 [6.7187249e-11 9.9999678e-01 3.1666616e-06]
 [1.3452404e-07 9.9978524e-01 2.1469337e-04]]


### 3. Linear regression with varying embedding dimensions and varying dropout

In [10]:
# Set the experiment
wide_cols = ['hours-per-week','education', 'relationship','workclass',
             'occupation','native-country','gender']
crossed_cols  = (['education', 'occupation'], ['native-country', 'occupation'])
embeddings_cols  = [('education',10), ('relationship',8), ('workclass',10),
                    ('occupation',10),('native-country',10)]
continuous_cols = ["hours-per-week"]
target = 'age'
method = 'regression'

# Prepare the dataset
wd_dataset = prepare_data(df, wide_cols,crossed_cols,embeddings_cols,continuous_cols,target)

wide_dim = wd_dataset['train_dataset'].wide.shape[1]
n_class=1
deep_column_idx = wd_dataset['deep_column_idx']
embeddings_input= wd_dataset['embeddings_input']
encoding_dict   = wd_dataset['encoding_dict']
hidden_layers = [100,50]
dropout = [0.5, 0.2]
model = WideDeep(wide_dim,embeddings_input,continuous_cols,deep_column_idx,hidden_layers,dropout,encoding_dict,n_class)
model.compile(method=method)
print(model)

WideDeep(
  (emb_layer_relationship): Embedding(6, 8)
  (emb_layer_workclass): Embedding(9, 10)
  (emb_layer_native-country): Embedding(42, 10)
  (emb_layer_occupation): Embedding(15, 10)
  (emb_layer_education): Embedding(16, 10)
  (linear_1): Linear(in_features=49, out_features=100, bias=True)
  (linear_1_drop): Dropout(p=0.5, inplace=False)
  (linear_2): Linear(in_features=100, out_features=50, bias=True)
  (linear_2_drop): Dropout(p=0.2, inplace=False)
  (output): Linear(in_features=847, out_features=1, bias=True)
)


In [11]:
train_dataset = wd_dataset['train_dataset']
model.fit(dataset=train_dataset, n_epochs=10, batch_size=64)

test_dataset  = wd_dataset['test_dataset']
pred = model.predict(test_dataset)

from sklearn.metrics import mean_squared_error
print("\n RMSE: {}".format(np.sqrt(mean_squared_error(pred, test_dataset.labels))))

Epoch 1 of 10, Loss: 361.227
Epoch 2 of 10, Loss: 120.188
Epoch 3 of 10, Loss: 185.024
Epoch 4 of 10, Loss: 169.754
Epoch 5 of 10, Loss: 139.843
Epoch 6 of 10, Loss: 167.409
Epoch 7 of 10, Loss: 147.47
Epoch 8 of 10, Loss: 53.157
Epoch 9 of 10, Loss: 177.697
Epoch 10 of 10, Loss: 138.325

 RMSE: 11.311551226299784
