In [19]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [34]:
df = pd.read_csv("../Data/income.csv")
df.head()

Unnamed: 0,age,sex,education,education-num,marital-status,workclass,occupation,hours-per-week,income,label
0,27,Male,HS-grad,9,Never-married,Private,Craft-repair,40,<=50K,0
1,47,Male,Masters,14,Married,Local-gov,Exec-managerial,50,>50K,1
2,59,Male,HS-grad,9,Divorced,Self-emp,Prof-specialty,20,<=50K,0
3,38,Female,Prof-school,15,Never-married,Federal-gov,Prof-specialty,57,>50K,1
4,64,Female,11th,7,Widowed,Private,Farming-fishing,40,<=50K,0


In [24]:
df.describe()

Unnamed: 0,age,education-num,hours-per-week,label
count,30000.0,30000.0,30000.0,30000.0
mean,37.1441,10.280133,41.384033,0.276667
std,12.856173,2.479556,9.684555,0.447358
min,18.0,3.0,20.0,0.0
25%,26.0,9.0,40.0,0.0
50%,36.0,10.0,40.0,0.0
75%,46.0,13.0,45.0,1.0
max,90.0,16.0,90.0,1.0


In [25]:
df.columns

Index(['age', 'sex', 'education', 'education-num', 'marital-status',
       'workclass', 'occupation', 'hours-per-week', 'income', 'label'],
      dtype='object')

In [39]:

cat_cols = ['sex', 'education', 'marital-status', 'workclass', 'occupation']
cont_cols = ['age', 'education-num', 'hours-per-week']
y_col = 'label'

for cat in cat_cols:
    df[cat]= df[cat].astype('category')

df.dtypes

age                  int64
sex               category
education         category
education-num        int64
marital-status    category
workclass         category
occupation        category
hours-per-week       int64
label                int64
dtype: object

In [17]:
df['education'].cat.categories

Index(['10th', '11th', '12th', '5th-6th', '7th-8th', '9th', 'Assoc-acdm',
       'Assoc-voc', 'Bachelors', 'Doctorate', 'HS-grad', 'Masters',
       'Prof-school', 'Some-college'],
      dtype='object')

In [18]:
df['education'].cat.codes

0        10
1        11
2        10
3        12
4         1
         ..
29995    11
29996    10
29997    12
29998    13
29999     6
Length: 30000, dtype: int8

In [63]:
cats = np.stack([df[cat].cat.codes.values for cat in cat_cols], 1)
cats = torch.tensor(cats, dtype=torch.int32)
cats

tensor([[ 1, 10,  3,  2,  1],
        [ 1, 11,  1,  1,  2],
        [ 1, 10,  0,  3,  7],
        ...,
        [ 1, 12,  1,  2,  7],
        [ 0, 13,  3,  2,  0],
        [ 1,  6,  1,  3,  2]], dtype=torch.int32)

In [64]:
conts = np.stack([df[cont].values for cont in cont_cols], 1)
conts = torch.tensor(conts, dtype=torch.float)
conts

tensor([[27.,  9., 40.],
        [47., 14., 50.],
        [59.,  9., 20.],
        ...,
        [47., 15., 55.],
        [32., 10., 40.],
        [33., 12., 60.]])

In [74]:
y = torch.tensor(df[y_col].values).flatten()
y

tensor([0, 1, 0,  ..., 1, 0, 1])

In [45]:
cat_szs = [len(df[cat].cat.categories) for cat in cat_cols]
emb_szs = [(size, min(50, (size+1)//2)) for size in cat_szs]
emb_szs 

[(2, 1), (14, 7), (6, 3), (5, 3), (12, 6)]

In [None]:
# emb_szs = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in emb_szs])
# emb_szs 

ModuleList(
  (0): Embedding(2, 1)
  (1): Embedding(14, 7)
  (2): Embedding(6, 3)
  (3): Embedding(5, 3)
  (4): Embedding(12, 6)
)

In [66]:
class TabularModel(nn.Module):

    def __init__(self, emb_szs, n_cont, out_sz, layers, p=0.5):
        # Call the parent __init__
        super().__init__()
        
        # Set up the embedding, dropout, and batch normalization layer attributes
        self.embeds = nn.ModuleList([nn.Embedding(ni, nf) for ni,nf in emb_szs])
        self.emb_drop = nn.Dropout(p)
        self.bn_cont = nn.BatchNorm1d(n_cont)
        
        # Assign a variable to hold a list of layers
        layerlist = []
        
        # Assign a variable to store the number of embedding and continuous layers
        n_emb = sum((nf for ni,nf in emb_szs))
        n_in = n_emb + n_cont
        
        # Iterate through the passed-in "layers" parameter (ie, [200,100]) to build a list of layers
        for i in layers:
            layerlist.append(nn.Linear(n_in,i)) 
            layerlist.append(nn.ReLU(inplace=True))
            layerlist.append(nn.BatchNorm1d(i))
            layerlist.append(nn.Dropout(p))
            n_in = i
        layerlist.append(nn.Linear(layers[-1],out_sz))
        
        # Convert the list of layers into an attribute
        self.layers = nn.Sequential(*layerlist)
    
    def forward(self, x_cat, x_cont):
        # Extract embedding values from the incoming categorical data
        embeddings = []
        for i,e in enumerate(self.embeds):
            embeddings.append(e(x_cat[:,i]))
        x = torch.cat(embeddings, 1)
        # Perform an initial dropout on the embeddings
        x = self.emb_drop(x)
        
        # Normalize the incoming continuous data
        x_cont = self.bn_cont(x_cont)
        x = torch.cat([x, x_cont], 1)
        
        # Set up model layers
        x = self.layers(x)
        return x





In [67]:
torch.manual_seed(33)

model = TabularModel(emb_szs, conts.shape[1], 2, [100, 50], 0.4)

In [68]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr= 0.01)

In [75]:

batch_size = 30000
test_size = 5000

cats_train = cats[:batch_size - test_size]
cats_test = cats[batch_size - test_size : batch_size]

conts_train = conts[:batch_size - test_size]
conts_test = conts[batch_size - test_size : batch_size]

y_train = y[:batch_size - test_size]
y_test = y[batch_size - test_size : batch_size]


# Print shapes of your input data
print("cats_train shape:", cats_train.shape)
print("conts_train shape:", conts_train.shape)
print("conts_train shape:", conts.shape[1])

# Print emb_szs
print("emb_szs:", emb_szs)

# Calculate total embedding size
total_emb_size = sum(nf for _, nf in emb_szs)
print("Total embedding size:", total_emb_size)


cats_train shape: torch.Size([25000, 5])
conts_train shape: torch.Size([25000, 3])
conts_train shape: 3
emb_szs: [(2, 1), (14, 7), (6, 3), (5, 3), (12, 6)]
Total embedding size: 20


In [77]:
import time

start_time = time.time()
epochs = 200
losses = []

for i in range(epochs):
    y_pred = model.forward(cats_train, conts_train)
    loss = criterion(y_pred, y_train)
    losses.append(loss)
    if i % 25 == 1:
        print(f'epoch: {i:3} loss: {loss.item(): 10.8f}')
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

print(f'epoch: {i:3} loss: { loss.item(): 10.8f}')
print(f'\n Duration: {time.time() - start_time: .0f} seconds')

epoch:   1 loss:  0.26043811
epoch:  26 loss:  0.26058835
epoch:  51 loss:  0.25956106
epoch:  76 loss:  0.25862697
epoch: 101 loss:  0.25732994
epoch: 126 loss:  0.25642684
epoch: 151 loss:  0.25624490
epoch: 176 loss:  0.25535360
epoch: 199 loss:  0.25425327

 Duration:  10 seconds


In [80]:
with torch.no_grad():
    y_val = model.forward(cats_test, conts_test)
    loss = criterion(y_val, y_test)
print(f'CE Loss: {loss:.8f}')

CE Loss: 0.26949391


In [None]:
rows = len(y_test)
correct = 0


for i in range(rows):

    if y_val[i].argmax().item() == y_test[i]:
        correct += 1

print(f'\n{correct} out of {rows} = {100*correct/rows:.2f}% correct')


4384 out of 5000 = 87.68% correct
