In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
import torchvision.utils
from torch import optim
from torch.utils.data import DataLoader,Dataset
from torchvision.models import *
from torchvision.datasets import ImageFolder
from torch.autograd import Variable

import random
import pandas as pd
import numpy as np

In [3]:
groupDF = pd.read_csv('Data/groupDF_clean.csv', index_col=0)
groupDF.drop('rem_company_id_dummy', axis=1, inplace=True)

In [4]:
Xcols = groupDF.columns.tolist()
Xcols.remove('payroll_ind')

In [5]:
X_pair = groupDF.drop('payroll_ind', axis=1)
Y_pair = groupDF.drop(Xcols, axis=1)

In [6]:
len(X_pair[Y_pair['payroll_ind']==1])/(len(X_pair[Y_pair['payroll_ind']==1])+len(X_pair[Y_pair['payroll_ind']==0]))

0.05009019009296517

In [7]:
class create_train_dataset(Dataset):
    '''
    Makes dataset with pairs of examples

    '''
    
    def __init__(self, X, Y):
        self.X_train = X
        self.Y_train = Y 
        self.indices = self.X_train.index
        self.indices_01 = (self.X_train[self.Y_train['payroll_ind']==0].index, self.X_train[self.Y_train['payroll_ind']==1].index)
        
    def __getitem__(self, index):
        pair1 = random.choice(self.indices)                    #### MOST OF PAIR1 will be from class == 0
        Y1 = self.Y_train.loc[random.choice(self.X_train.index)].values[0]  # random.randint(0, 1) # since there is class imbalance??
        
        # Approx 50% of images should be same class
        same_class = random.randint(0, 1)
        if same_class:
            Y2 = Y1
        else:
            Y2 = abs(1-Y1)
        pair2 = random.choice(self.indices_01[Y2])
        
        X1 = torch.from_numpy(self.X_train.loc[pair1].values.reshape(1, -1))
        X2 = torch.from_numpy(self.X_train.loc[pair2].values.reshape(1, -1))
        label = torch.from_numpy(np.array([Y1!=Y2], dtype=np.float32))        # == or != ?? 

        return X1, X2, label 
    
    def __len__(self):
        return len(self.X_train)


In [8]:
train_data_siam = create_train_dataset(X=X_pair, Y=Y_pair)

In [9]:
visDL = DataLoader(train_data_siam, shuffle=True, num_workers=0, batch_size=8)
dataiter = iter(visDL)
example_batch = next(dataiter)
(example_batch[-1])

tensor([[0.],
        [0.],
        [1.],
        [1.],
        [1.],
        [0.],
        [0.],
        [1.]])

In [44]:
class SiameseNN(nn.Module):
    def __init__(self, input_size=29, output_size=8):
        super(SiameseNN, self).__init__()

        self.fc1 = nn.Linear(input_size, 64)
        self.fc2 = nn.Linear(64, 32)  # hidden layer 1
        self.fc3 = nn.Linear(32, 16)  # hidden layer 2
        self.fc4 = nn.Linear(16, 8)   

    def forward_single(self, x):
        
        x = F.tanh(self.fc1(x))
        x = F.tanh(self.fc2(x))
        x = F.tanh(self.fc3(x))
        x = F.tanh(self.fc4(x))
        x = F.softmax(self.fc5(x))
        return x
    
    def forward(self, x1, x2):
        output1 = self.forward_single(x1)
        output2 = self.forward_single(x2)
        return output1, output2

- remove conv
- mahalanobis dist instea of euc
- triplet loss
- strategies for sampling -> easy, hard. 

In [45]:

class ContrastiveLoss(torch.nn.Module):
    def __init__(self, margin=2):
        super(ContrastiveLoss, self).__init__()
        self.margin = margin
        
    def forward(self, output1, output2, label):
        euc_dist = F.pairwise_distance(output1, output2, keepdim=True)     
        loss = torch.mean((1-label)*torch.pow(euc_dist, 2) + label*torch.pow(torch.clamp(self.margin-euc_dist, min=0.0), 2))
        return loss

In [46]:
train_data_siam = create_train_dataset(X=X_pair, Y=Y_pair)
train_loader = DataLoader(train_data_siam, shuffle=True, num_workers=0, batch_size=8)

In [47]:
net = SiameseNN().cuda()

In [48]:
criterion = ContrastiveLoss()
opt = optim.Adam(net.parameters(), lr=0.001)

In [49]:
loss_ep = []
for epoch in range(0, 100):
    loss_iter = 0
    count = 1
    for data in train_loader:
        X1, X2, label = data
        X1, X2, label = X1.type(torch.FloatTensor), X2.type(torch.FloatTensor), label.type(torch.FloatTensor)
        X1, X2, label = X1.cuda(), X2.cuda(), label.cuda()
        opt.zero_grad()
        output1, output2 = net(X1, X2)
        loss = criterion(output1, output2, label)
        loss.backward()
        loss_iter += loss.item()
        opt.step()
        
        if count%1000==1:
            print('Epoch: {0}  Loss: {1}, n_examples: {2}'.format(epoch, loss_iter/(8*count), 8*count))
        count += 1
    print('Epoch: {0}  Loss: {1}, n_examples: {2}'.format(epoch, loss_iter/len(train_loader), len(train_loader)))
    loss_ep.append(loss_iter)
        



Epoch: 0  Loss: 0.28095364570617676, n_examples: 8


KeyboardInterrupt: 

381971

In [104]:
len(train_loader)

47747

In [106]:
import imblearn