In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score, confusion_matrix

import torch
import torch.nn as nn

from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
%%time
df = pd.read_excel('./indeed_results_pp_2020-04-27.xlsx')

Wall time: 21.5 s


In [3]:
d = df[(df['Title_New'].notnull()) & ((df['Education_Tenth'] == 1) | (df['Education_Twelvth'] == 1) | (df['Education_Bachelors'] == 1) | (df['Education_Masters'] == 1) | (df['Education_Doctorate'] == 1))].reset_index()[['Title_New', 'Education_Tenth', 'Education_Twelvth', 'Education_Bachelors', 'Education_Masters', 'Education_Doctorate']]
d['Title_New'] = d['Title_New'].astype('category')
d['Categorical_Title'] = d['Title_New'].cat.codes
d['Categorical_Title'].nunique()

16

In [4]:
cat_cols = ['Education_Tenth', 'Education_Twelvth', 'Education_Bachelors', 'Education_Masters', 'Education_Doctorate']
cont_cols = []
y_col = ['Categorical_Title']

In [5]:
cats = np.stack([d[col] for col in cat_cols], axis=1)

In [6]:
cats = torch.tensor(cats, dtype=torch.int64)

In [7]:
cats

tensor([[0, 0, 1, 0, 0],
        [0, 1, 1, 0, 0],
        [0, 0, 1, 0, 0],
        ...,
        [0, 0, 1, 0, 0],
        [0, 0, 1, 0, 0],
        [0, 0, 1, 0, 0]])

In [8]:
y = torch.tensor(d[y_col].values, dtype=torch.int64).flatten()

In [9]:
y.shape

torch.Size([5340])

In [10]:
cats.shape

torch.Size([5340, 5])

In [11]:
cat_szs = [len(df[col]) for col in cat_cols]
emb_szs = [(size, min(50, (size+1)//2)) for size in cat_szs]
emb_szs

[(90700, 50), (90700, 50), (90700, 50), (90700, 50), (90700, 50)]

In [12]:
catz = cats[:4]
catz

tensor([[0, 0, 1, 0, 0],
        [0, 1, 1, 0, 0],
        [0, 0, 1, 0, 0],
        [0, 0, 1, 0, 0]])

In [13]:
selfembeds = nn.ModuleList([nn.Embedding(ni, nf) for ni,nf in emb_szs])
selfembeds

ModuleList(
  (0): Embedding(90700, 50)
  (1): Embedding(90700, 50)
  (2): Embedding(90700, 50)
  (3): Embedding(90700, 50)
  (4): Embedding(90700, 50)
)

In [14]:
list(enumerate(selfembeds))

[(0, Embedding(90700, 50)),
 (1, Embedding(90700, 50)),
 (2, Embedding(90700, 50)),
 (3, Embedding(90700, 50)),
 (4, Embedding(90700, 50))]

In [15]:
embeddingz = []
for i,e in enumerate(selfembeds):
    embeddingz.append(e(catz[:,i]))
embeddingz

[tensor([[-0.8185,  0.3877,  0.7050, -2.0907, -0.2656, -0.1864,  0.6985,  1.9327,
          -0.3256,  1.7668, -0.9019, -0.6609,  1.0036,  0.0885, -0.3997, -0.8624,
          -0.2323, -0.6436, -0.4823,  0.7508, -0.0664,  0.0973,  0.9103, -0.1448,
          -0.4140, -0.0724,  0.0546, -1.0172, -0.0495,  1.1582,  1.5203, -0.6304,
          -0.9390,  1.3486, -1.4471,  0.9253,  0.5573,  0.1720,  0.6397, -0.3907,
           2.1210,  0.3112,  2.5571, -1.1631,  0.6824,  0.3214,  0.7971,  0.2041,
          -1.7472,  0.2057],
         [-0.8185,  0.3877,  0.7050, -2.0907, -0.2656, -0.1864,  0.6985,  1.9327,
          -0.3256,  1.7668, -0.9019, -0.6609,  1.0036,  0.0885, -0.3997, -0.8624,
          -0.2323, -0.6436, -0.4823,  0.7508, -0.0664,  0.0973,  0.9103, -0.1448,
          -0.4140, -0.0724,  0.0546, -1.0172, -0.0495,  1.1582,  1.5203, -0.6304,
          -0.9390,  1.3486, -1.4471,  0.9253,  0.5573,  0.1720,  0.6397, -0.3907,
           2.1210,  0.3112,  2.5571, -1.1631,  0.6824,  0.3214,  0.79

In [16]:
z = torch.cat(embeddingz, 1)
z

tensor([[-8.1852e-01,  3.8768e-01,  7.0505e-01, -2.0907e+00, -2.6558e-01,
         -1.8641e-01,  6.9851e-01,  1.9327e+00, -3.2558e-01,  1.7668e+00,
         -9.0193e-01, -6.6094e-01,  1.0036e+00,  8.8460e-02, -3.9966e-01,
         -8.6242e-01, -2.3233e-01, -6.4360e-01, -4.8235e-01,  7.5081e-01,
         -6.6388e-02,  9.7272e-02,  9.1030e-01, -1.4477e-01, -4.1400e-01,
         -7.2437e-02,  5.4559e-02, -1.0172e+00, -4.9457e-02,  1.1582e+00,
          1.5203e+00, -6.3041e-01, -9.3903e-01,  1.3486e+00, -1.4471e+00,
          9.2530e-01,  5.5727e-01,  1.7202e-01,  6.3970e-01, -3.9074e-01,
          2.1210e+00,  3.1124e-01,  2.5571e+00, -1.1631e+00,  6.8237e-01,
          3.2136e-01,  7.9714e-01,  2.0407e-01, -1.7472e+00,  2.0573e-01,
         -7.0586e-01,  3.8026e-01, -6.7050e-01,  1.4750e-01, -3.8700e-01,
          4.2402e-02,  1.1857e+00, -1.1056e+00,  2.1377e-01, -1.7548e-01,
         -1.3172e-01,  1.2053e+00, -5.8079e-01, -4.8451e-01, -3.2232e-01,
         -3.2061e-01,  1.9132e-01,  1.

In [17]:
selfembdrop = nn.Dropout(.4)

In [18]:
z = selfembdrop(z)
z

tensor([[-0.0000e+00,  6.4613e-01,  0.0000e+00, -3.4845e+00, -4.4264e-01,
         -0.0000e+00,  0.0000e+00,  3.2212e+00, -0.0000e+00,  2.9447e+00,
         -1.5032e+00, -1.1016e+00,  1.6727e+00,  0.0000e+00, -6.6611e-01,
         -1.4374e+00, -0.0000e+00, -1.0727e+00, -0.0000e+00,  0.0000e+00,
         -1.1065e-01,  0.0000e+00,  1.5172e+00, -2.4128e-01, -0.0000e+00,
         -1.2073e-01,  9.0932e-02, -1.6953e+00, -0.0000e+00,  1.9303e+00,
          0.0000e+00, -1.0507e+00, -0.0000e+00,  2.2476e+00, -2.4118e+00,
          0.0000e+00,  9.2878e-01,  0.0000e+00,  1.0662e+00, -0.0000e+00,
          0.0000e+00,  0.0000e+00,  4.2619e+00, -0.0000e+00,  0.0000e+00,
          0.0000e+00,  1.3286e+00,  0.0000e+00, -2.9120e+00,  3.4288e-01,
         -1.1764e+00,  6.3376e-01, -0.0000e+00,  2.4584e-01, -6.4500e-01,
          7.0671e-02,  1.9761e+00, -1.8427e+00,  3.5628e-01, -0.0000e+00,
         -2.1953e-01,  0.0000e+00, -9.6798e-01, -8.0751e-01, -5.3720e-01,
         -5.3434e-01,  0.0000e+00,  2.

In [19]:
class TabularModel(nn.Module):

    def __init__(self, emb_szs, out_sz, layers, p=0.5):
        super().__init__()
        self.embeds = nn.ModuleList([nn.Embedding(ni, nf) for ni,nf in emb_szs])
        self.emb_drop = nn.Dropout(p)
        
        layerlist = []
        n_emb = sum((nf for ni,nf in emb_szs))
        n_in = n_emb
        
        for i in layers:
            layerlist.append(nn.Linear(n_in,i)) 
            layerlist.append(nn.ReLU(inplace=True))
            layerlist.append(nn.BatchNorm1d(i))
            layerlist.append(nn.Dropout(p))
            n_in = i
        layerlist.append(nn.Linear(layers[-1],out_sz))
            
        self.layers = nn.Sequential(*layerlist)
    
    def forward(self, x_cat):
        embeddings = []
        for i,e in enumerate(self.embeds):
            embeddings.append(e(x_cat[:,i]))
        x = torch.cat(embeddings, 1)
        x = self.emb_drop(x)
        
        x = torch.cat([x], 1)
        x = self.layers(x)
        return x

In [20]:
torch.manual_seed(33)
model = TabularModel(emb_szs, 16, [200, 100, 50, 25, 50, 100, 200], p=0.4)

In [21]:
model

TabularModel(
  (embeds): ModuleList(
    (0): Embedding(90700, 50)
    (1): Embedding(90700, 50)
    (2): Embedding(90700, 50)
    (3): Embedding(90700, 50)
    (4): Embedding(90700, 50)
  )
  (emb_drop): Dropout(p=0.4, inplace=False)
  (layers): Sequential(
    (0): Linear(in_features=250, out_features=400, bias=True)
    (1): ReLU(inplace=True)
    (2): BatchNorm1d(400, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.4, inplace=False)
    (4): Linear(in_features=400, out_features=200, bias=True)
    (5): ReLU(inplace=True)
    (6): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.4, inplace=False)
    (8): Linear(in_features=200, out_features=100, bias=True)
    (9): ReLU(inplace=True)
    (10): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (11): Dropout(p=0.4, inplace=False)
    (12): Linear(in_features=100, out_features=200, bias=True)
    (13): ReLU(inp

In [22]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [None]:
%%time
epochs = 5000
losses = []

for i in range(epochs):
    y_pred = model(cats)
    loss = criterion(y_pred, y)
    losses.append(loss)
    
    if (i+1)%500 == 0:
        print(f'epoch: {(i+1):5}  loss: {loss.item():10.8f}')

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

epoch:   200  loss: 2.17805433


In [None]:
plt.plot(range(epochs), losses)
plt.ylabel('Cross Entropy Loss')
plt.xlabel('epoch');

In [None]:
with torch.no_grad():
    y_val = model(cats)
    loss = criterion(y_val, y)
print(f'CE Loss: {loss:.8f}')

In [None]:
rows = d.shape[0]
correct = 0
print(f'{"MODEL OUTPUT":26} ARGMAX  Y_TEST')
for i in range(rows):
    print(f'{str(y_val[i]):26} {y_val[i].argmax():^7}{y[i]:^7}')
    if y_val[i].argmax().item() == y[i]:
        correct += 1
print(f'\n{correct} out of {rows} = {100*correct/rows:.2f}% correct')