In [2]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.datasets import make_classification
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader, Dataset


plt.rcParams['figure.figsize'] = [16, 4]

In [3]:
num_rows = int(131072 / 2)
num_features = 8

X, target = make_classification(n_samples=num_rows, n_features=num_features)

scaler = MinMaxScaler()
X = scaler.fit_transform(X)

In [4]:
Counter(target)

Counter({1: 32763, 0: 32773})

In [5]:
threshold = np.median(target)

zero_pairs = np.where(target == 0)[0]
zero_pairs = np.random.choice(zero_pairs, size=(65536, 2), replace=True)

one_pairs = np.where(target == 1)[0]
one_pairs = np.random.choice(one_pairs, size=(65536, 2), replace=True)

In [6]:
class TabularSiameseDataset(Dataset):
	
	def __init__(self, data, pairs):
		self.data = data
		self.pairs = pairs
	
	def __len__(self):
		return len(self.pairs)
	
	def __getitem__(self, idx):
		index1, index2, label = self.pairs[idx]
		return self.data[index1], self.data[index2], label


pairs = [(zero_pairs[i, 0], zero_pairs[i, 1], 0) for i in range(len(zero_pairs))] + \
        [(one_pairs[i, 0], one_pairs[i, 1], 1) for i in range(len(one_pairs))]

dataset = TabularSiameseDataset(X, pairs)
dataloader = DataLoader(dataset, batch_size=10, shuffle=True)

In [7]:
#

In [8]:
class ContrastiveLoss(nn.Module):
	
	def __init__(self, margin=1.0):
		super(ContrastiveLoss, self).__init__()
		self.margin = margin
	
	def forward(self, output1, output2, label):
		cosine_similarity = nn.functional.cosine_similarity(output1, output2)
		loss = (1 - label) * torch.pow(cosine_similarity, 2) + \
		       label * torch.pow(torch.clamp(self.margin - cosine_similarity, min=0.0), 2)
		return loss.mean()

In [9]:
loss_fn = ContrastiveLoss()

In [10]:
#

In [11]:
class SiameseNetwork(nn.Module):
	
	def __init__(self, input_size):
		super(SiameseNetwork, self).__init__()
		self.net = nn.Sequential(
				nn.Linear(input_size, 32),
				nn.ReLU(),
				nn.Linear(32, 16),
				nn.ReLU(),
				nn.Linear(16, 8)
				)
	
	def forward(self, input1, input2):
		output1 = self.net(input1)
		output2 = self.net(input2)
		return output1, output2


input_size = X.shape[1]
model = SiameseNetwork(input_size)
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [12]:
def train(dataloader, model, loss_fn, optimizer, epochs):
	model.train()
	
	for epoch in range(epochs):
		running_loss = 0.0
		for data1, data2, labels in dataloader:
			data1 = data1.float()
			data2 = data2.float()
			
			optimizer.zero_grad()
			outputs1, outputs2 = model(data1, data2)
			loss = loss_fn(outputs1, outputs2, labels)
			loss.backward()
			optimizer.step()
			running_loss += loss.item()
		
		print(f'Epoch {epoch + 1}, Loss: {running_loss / len(dataloader)}')

In [13]:
train(dataloader, model, loss_fn, optimizer, epochs=10)

Epoch 1, Loss: 0.12168778084642679
Epoch 2, Loss: 0.09523369789578393
Epoch 3, Loss: 0.09029894289071282
Epoch 4, Loss: 0.08876177884220077
Epoch 5, Loss: 0.08874132577761613
Epoch 6, Loss: 0.08742491776010194
Epoch 7, Loss: 0.08666748772478786
Epoch 8, Loss: 0.08602169108741167
Epoch 9, Loss: 0.08531260282554784
Epoch 10, Loss: 0.08550605485177527


In [15]:
input_tensor = torch.tensor(X, dtype=torch.float)
with torch.no_grad():
	embeddings = model.net(input_tensor)

print(embeddings)

tensor([[ 137.5767,  -32.3416,  -17.5775,  ...,  -10.9405,   23.2730,
            1.2067],
        [ 696.0217,  -91.0634,   24.6970,  ...,   27.9498,   38.8960,
           98.3119],
        [  12.6512,   23.2901,  -66.5433,  ...,  -44.2399,   -3.0427,
           12.4173],
        ...,
        [ 702.3848, -121.5195,  -32.6355,  ...,   -7.6106,   79.7274,
           60.5355],
        [  26.3347,    4.5656,   -9.8445,  ...,    3.9515,   -1.8407,
          -14.6493],
        [ 211.9185,  -44.1644,  -21.3051,  ...,  -11.1212,   31.9979,
            9.0213]])
