In [23]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F

from sklearn.metrics import *
from eval import *
from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader

In [5]:
# Read the interaction log, split it by `created_at` into a train window and a
# test window, and build user <-> integer index lookups from the train users.
data = pd.read_csv("/Users/rkim0927/Python/Data/VB/dating_suggestions.csv")

test_period_start = '2022-02-01'
test_period_end = '2022-02-15'

# Both windows use strict inequalities on the raw `created_at` column.
train_interaction = data[
  (data['created_at'] > '2021-11-30') & (data['created_at'] < test_period_start)
]
test_interaction = data[
  (data['created_at'] > test_period_start) & (data['created_at'] < test_period_end)
]

# Every id that appears on either side of a train interaction.
unique_users = list(
  set(pd.concat([train_interaction["source_id"], train_interaction["user_id"]]))
)

# Keep only test rows whose two endpoints were both seen during training.
test_interaction = test_interaction[
  (test_interaction["user_id"].isin(unique_users))
  & (test_interaction["source_id"].isin(unique_users))
]
print(len(unique_users))

# Position in `unique_users` is the integer index for that user.
user_to_idx, idx_to_user = {}, {}
for i, user in enumerate(tqdm(unique_users)):
  user_to_idx[user] = i
  idx_to_user[i] = user

21392


  0%|          | 0/21392 [00:00<?, ?it/s]

In [13]:
# Load the two pre-computed embedding tables (user side and item side) and turn
# each into a lookup from row label to embedding vector.
embeddings_src = pd.read_csv('/Users/rkim0927/Python/Data/VB/Embeddings/embeddings_user_matchsage.csv')
embeddings_dst = pd.read_csv('/Users/rkim0927/Python/Data/VB/Embeddings/embeddings_item_matchsage.csv')
# embeddings_src, embeddings_dst = pd.DataFrame(embeddings_src, index = [x[1] for x in idx_to_user.items()]), pd.DataFrame(embeddings_dst, index = [x[1] for x in idx_to_user.items()])

# The first CSV column has no header, so pandas names it "Unnamed: 0"; use it as
# the row label.  NOTE(review): presumably these labels are user ids matching
# `user_id` / `source_id` in the interaction data -- confirm.
embeddings_src = embeddings_src.set_index("Unnamed: 0", drop = True)
embeddings_dst = embeddings_dst.set_index("Unnamed: 0", drop = True)

# Build each lookup from its own table.  The previous single loop ran
# `len(embeddings_src)` times for both tables, which raised IndexError when the
# item table was shorter and silently dropped rows when it was longer.
# Zipping the index with the 2-D value array also avoids constructing one
# Series per row.
src_to_embedding = dict(zip(embeddings_src.index, embeddings_src.to_numpy()))
dst_to_embedding = dict(zip(embeddings_dst.index, embeddings_dst.to_numpy()))

  0%|          | 0/21392 [00:00<?, ?it/s]

In [7]:
class MLPDataset(Dataset):
  """Three index-aligned containers exposed as a torch ``Dataset``.

  Item ``idx`` is the tuple ``(X1[idx], X2[idx], Y[idx])``.
  """

  def __init__(self, X1, X2, Y):
    # Store references to the containers exactly as passed in.
    self.X1 = X1
    self.X2 = X2
    self.Y = Y

  def __len__(self):
    # The dataset size is taken from the first container only.
    return len(self.X1)

  def __getitem__(self, idx):
    first = self.X1[idx]
    second = self.X2[idx]
    target = self.Y[idx]
    return first, second, target

class NCF(nn.Module):
  """Two-branch MLP that scores a pair of input vectors.

  Each input goes through its own linear layer + ReLU, the two hidden vectors
  are concatenated, passed through a shared linear layer + ReLU, and projected
  to a single value per row.

  Args:
    input_dim: feature size of each of the two inputs.
    hidden_size: width of the hidden layers.
    dropout: probability used to construct ``self.dropout``.
    classify: when truthy, a sigmoid is applied to the output.

  Returns (from ``forward``):
    Tensor of shape ``(batch,)``.
  """

  def __init__(self, input_dim, hidden_size, dropout, classify):
    super().__init__()
    self.classify = classify
    # NOTE(review): `dropout` and `bn` are constructed but never applied in
    # forward(). They are kept so the module's registered children and
    # state_dict keys stay the same; confirm whether they were meant to be used.
    self.dropout = nn.Dropout(dropout)
    self.relu = nn.ReLU()
    self.bn = nn.BatchNorm1d(hidden_size)
    self.dense_1 = nn.Linear(input_dim, hidden_size)
    self.dense_2 = nn.Linear(input_dim, hidden_size)
    self.dense = nn.Linear(hidden_size*2, hidden_size)
    self.dense_out = nn.Linear(hidden_size, 1)
    self.sigmoid = nn.Sigmoid()

  def forward(self, x1, x2):
    """Score each row pair of ``x1`` / ``x2`` (both ``(batch, input_dim)``)."""
    x1 = self.relu(self.dense_1(x1))
    x2 = self.relu(self.dense_2(x2))
    x = torch.cat((x1, x2), dim = 1)
    x = self.relu(self.dense(x))
    x = self.dense_out(x)
    if self.classify:
      x = self.sigmoid(x)
    # Drop only the trailing size-1 feature axis. A bare squeeze() would also
    # remove the batch axis when batch == 1, yielding a 0-d tensor that no
    # longer matches a (1,)-shaped target and cannot be iterated.
    return x.squeeze(-1)

In [17]:
# Draw a fixed-seed random subset of the training interactions, map both sides
# of every interaction to its embedding vector, and wrap train/test in loaders.
np.random.seed(777)
num_train = 1000000

# Use `num_train` (it was previously defined but a duplicate literal was passed
# instead) and cap it at the number of available rows: sampling without
# replacement raises ValueError when asked for more rows than exist.
sample_size = min(num_train, len(train_interaction))
idx = np.random.choice(np.arange(len(train_interaction)), sample_size, replace=False)
train_data = train_interaction.iloc[idx]

# Each `.map(...)` yields an object array holding one embedding vector per row.
# NOTE(review): ids missing from the lookup dicts become NaN here -- confirm
# every id in both frames has an embedding.
X_train_user = train_data["user_id"].map(src_to_embedding).values
X_train_item = train_data["source_id"].map(dst_to_embedding).values
y_train = train_data["accepted"].values

X_test_user = test_interaction["user_id"].map(src_to_embedding).values
X_test_item = test_interaction["source_id"].map(dst_to_embedding).values
y_test = test_interaction["accepted"].values

print(X_train_user.shape, X_train_item.shape, y_train.shape, X_test_user.shape, X_test_item.shape, y_test.shape )

trainData = MLPDataset(X_train_user, X_train_item, y_train)
testData = MLPDataset(X_test_user, X_test_item, y_test)
# Test loader is left unshuffled so predictions stay aligned with `y_test`.
trainLoader = DataLoader(trainData, batch_size = 256, shuffle = True)
testLoader = DataLoader(testData, batch_size = 256, shuffle = False)

(1000000,) (1000000,) (1000000,) (160678,) (160678,) (160678,)


In [24]:
# Train the pair-scoring MLP with binary cross-entropy and, after every epoch,
# report ROC-AUC and average precision on the held-out test interactions.
HIDDEN_SIZE = 100
EMBEDDING_DIM = len(X_train_user[0])
DROPOUT = .5
NUM_EPOCHS = 5
LEARNING_RATE = 1e-3
SEED = 777

# Seed torch so weight initialisation and DataLoader shuffling are repeatable
# across runs (device kernels may still introduce small nondeterminism).
torch.manual_seed(SEED)

# Pick the accelerator that is actually usable at runtime. `is_built()` only
# says PyTorch was compiled with MPS support, and "gpu" is not a valid torch
# device string -- the CUDA device is named "cuda".
if torch.backends.mps.is_available():
  device = "mps"
elif torch.cuda.is_available():
  device = "cuda"
else:
  device = "cpu"

mlp = NCF(EMBEDDING_DIM, HIDDEN_SIZE, DROPOUT, True).to(device)
optimizer = torch.optim.Adam(mlp.parameters(), lr=LEARNING_RATE)
criterion = nn.BCELoss()

roauc, prauc = [], []
for epoch in range(NUM_EPOCHS):
  mlp.train()
  for x1, x2, y in trainLoader:
    # Cast to float32 before moving to the device.
    x1, x2, y = x1.float().to(device), x2.float().to(device), y.float().to(device)

    # reshape(-1) keeps outputs 1-D even for a batch of one, so they always
    # match the shape of `y`.
    outputs = mlp(x1, x2).reshape(-1)
    loss = criterion(outputs, y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  # The loss shown is that of the last mini-batch of the epoch, not an average.
  print("Epoch: {}, Loss: {:.5f}".format(epoch + 1, loss.item()))

  y_prob = []

  mlp.eval()
  with torch.no_grad():
    for x1, x2, _ in testLoader:
      x1, x2 = x1.float().to(device), x2.float().to(device)

      outputs = mlp(x1, x2).reshape(-1)
      y_prob.extend(outputs.cpu().tolist())

  r, p = roc_auc_score(y_test, y_prob), average_precision_score(y_test, y_prob)
  print(r, p)
  roauc.append(r)
  prauc.append(p)

Epoch: 1, Loss: 0.38113
0.8192283916312013 0.478293804256633
Epoch: 2, Loss: 0.38551
0.8242441631005877 0.48514981152247283
Epoch: 3, Loss: 0.27745
0.823623284896424 0.4843952971411064
Epoch: 4, Loss: 0.37157
0.8242687319977 0.4885144039668089
Epoch: 5, Loss: 0.40478
0.8254456622696548 0.4907497835746387
