In [22]:
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter  # Import the tensorboard writer
from tqdm.notebook import tqdm

In [2]:
# Run on the first CUDA GPU when one is available; otherwise fall back to the CPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# load user data

In [3]:
# Load the per-user answer sheet and drop the 'Unnamed: 0' column
# (presumably an index column written by an earlier to_csv() call — verify).
answer_sheet = '../samples/demo_houses/user_features.csv'
df_user = pd.read_csv(answer_sheet).drop(columns='Unnamed: 0')
print(df_user.shape[0])

# Shift the gender codes down by one so they are valid row indices into the
# 2-row gender embedding table (avoids an out-of-range index error).
# Assign by key: attribute-style assignment is fragile for setting columns.
df_user['userGender'] = df_user['userGender'] - 1

# Number of rows whose Q1 answer is exactly 0 (vectorised count).
print((df_user['Q1'] == 0).sum())

3800
657


# Encode the user attributes

In [6]:
# Number of rows each embedding table needs. An embedding indexed by raw ids
# must have max_id + 1 rows; counting distinct values (nunique) under-sizes the
# table as soon as the ids have a gap, which surfaces later as an index error.
# Gender stays fixed at 2 categories (codes are shifted by -1 when df_user is loaded).
num_unique_ids = {
    'userGender': 2,
    'userAgeIds': int(df_user['userAgeIds'].max()) + 1,
    'userPrefectureIds': int(df_user['userPrefectureIds'].max()) + 1,
}

# One embedding table per attribute; the embedding width is half the number of
# categories, capped at 50.
embedding_fn = {
    name: nn.Embedding(num_embeddings=size, embedding_dim=min(50, size // 2))
    for name, size in num_unique_ids.items()
}
print(num_unique_ids)
print(embedding_fn)
    

{'userGender': 2, 'userAgeIds': 15, 'userPrefectureIds': 48}
{'userGender': Embedding(2, 1), 'userAgeIds': Embedding(15, 7), 'userPrefectureIds': Embedding(48, 24)}


In [7]:
# Turn each categorical user column into a LongTensor of ids with a trailing
# singleton axis, i.e. shape (num_rows, 1), ready for the embedding layers.
user_data_dict = {
    column: torch.LongTensor(df_user[column].values).unsqueeze(1)
    for column in ('userGender', 'userAgeIds', 'userPrefectureIds')
}
print(user_data_dict['userGender'].shape)

torch.Size([3800, 1])


In [8]:
# Regression target: Q1 is doubled so that a -1..1 answer becomes -2..2, the
# range of a difference between two cosine similarities.
# (An earlier LabelEncoder-based class target was abandoned in favour of this.)
Q_labels = torch.tensor(df_user['Q1'].values * 2).to(torch.float32)
print(Q_labels.dtype)

torch.float32


# Define the models that compute the user and image representations

In [9]:
class MLP(nn.Module):
    """Stack of Linear layers with ReLU between them and a linear output.

    Args:
        input_size: width of the input features.
        hidden_size: width of every hidden layer.
        output_size: width of the final (un-activated) output.
        n_layers: total number of Linear layers. Values below 2 still build
            two layers (input -> hidden -> output).
    """

    def __init__(self, input_size, hidden_size, output_size, n_layers=2):
        super().__init__()
        # Widths at every layer boundary: input, one or more hidden, output.
        n_hidden_widths = max(n_layers - 1, 1)
        widths = [input_size] + [hidden_size] * n_hidden_widths + [output_size]
        self.layers = nn.ModuleList(
            nn.Linear(fan_in, fan_out)
            for fan_in, fan_out in zip(widths[:-1], widths[1:])
        )

    def forward(self, x):
        """Apply every layer in order; ReLU follows all layers but the last."""
        last_index = len(self.layers) - 1
        for index, layer in enumerate(self.layers):
            x = layer(x)
            if index < last_index:
                x = F.relu(x)
        return x

    

In [10]:
class UserAttributeEmbeddings(nn.Module):
    """Embed the three categorical user attributes and concatenate them.

    Args:
        embedding_fn: dict with keys 'userGender', 'userAgeIds' and
            'userPrefectureIds', each mapping to an ``nn.Embedding``.

    Attributes:
        output_dim: sum of the three embedding widths, i.e. the size of the
            last axis of the tensor returned by ``forward``.
    """

    def __init__(self, embedding_fn):
        super().__init__()
        self.embedding_userGender = embedding_fn['userGender']
        self.embedding_userAgeIds = embedding_fn['userAgeIds']
        self.embedding_userPrefectureIds = embedding_fn['userPrefectureIds']
        self.output_dim = (
            self.embedding_userGender.embedding_dim
            + self.embedding_userAgeIds.embedding_dim
            + self.embedding_userPrefectureIds.embedding_dim
        )

    @staticmethod
    def _embed(table, ids):
        """Look up ``ids`` in ``table`` and return a (batch, dim) tensor."""
        # Flatten first so that (batch,) and (batch, 1) id tensors both yield
        # exactly one embedding row per sample. The previous squeeze(dim=0)
        # only removed an axis when the batch size was 1, so the rank of the
        # result depended on the batch size.
        return table(ids.reshape(-1))

    def forward(self, userGender, userAgeIds, userPrefectureIds):
        """Return the concatenated embeddings, shape (batch, output_dim).

        Each argument is an integer id tensor of shape (batch,) or (batch, 1).
        """
        embedded_userGender = self._embed(self.embedding_userGender, userGender)
        embedded_userAgeIds = self._embed(self.embedding_userAgeIds, userAgeIds)
        embedded_userPrefectureIds = self._embed(self.embedding_userPrefectureIds, userPrefectureIds)
        # Concatenate embeddings along the last (feature) dimension
        return torch.cat((embedded_userGender, embedded_userAgeIds, embedded_userPrefectureIds), dim=-1)
        

In [11]:
# image user sim model
class ImageUserSimilarityModel(nn.Module):
    """Score how well an image matches a user via cosine similarity.

    The image features and the embedded user attributes are each projected by
    their own MLP into a shared ``output_size``-dimensional space; the score is
    the cosine similarity of the two projections.

    Args:
        image_feature_size: width of the incoming image feature vectors.
        embedding_fn: dict of ``nn.Embedding`` modules, passed to
            ``UserAttributeEmbeddings``.
        hidden_size: hidden width of both MLPs.
        output_size: width of the shared representation space.
        user_n_layer: number of layers in the user MLP.
        image_n_layer: number of layers in the image MLP.
    """

    def __init__(self, image_feature_size, embedding_fn, hidden_size, output_size, user_n_layer=2, image_n_layer=2):
        super().__init__()
        self.image_mlp = MLP(image_feature_size, hidden_size, output_size, n_layers=image_n_layer)
        self.user_embeddings = UserAttributeEmbeddings(embedding_fn)
        # The user MLP consumes the concatenation of the three attribute
        # embeddings, whose width is the sum of the individual embedding sizes.
        self.user_mlp = MLP(self.user_embeddings.output_dim, hidden_size, output_size, n_layers=user_n_layer)

    def forward(self, image_features, userGender, userAgeIds, userPrefectureIds):
        """Return one cosine similarity per sample, shape (batch,).

        ``image_features`` is expected as (batch, image_feature_size) and each
        user attribute as an integer id tensor of shape (batch, 1).
        """
        image_output = self.image_mlp(image_features)

        user_embedded = self.user_embeddings(userGender, userAgeIds, userPrefectureIds)
        # Collapse everything after the batch axis. This accepts both a
        # (batch, 1, output_size) and a (batch, output_size) tensor, whereas
        # squeeze(1) would also remove the feature axis when output_size == 1.
        user_output = self.user_mlp(user_embedded).flatten(start_dim=1)

        # L2-normalise both representations so their dot product is a cosine.
        image_output = F.normalize(image_output, p=2, dim=1)
        user_output = F.normalize(user_output, p=2, dim=1)

        return (image_output * user_output).sum(dim=1)


# test the model using single image and user attributes

In [12]:
## instantiate a model for the single-sample smoke test and place it on the target device
image_user_sim_model = ImageUserSimilarityModel(
    image_feature_size=768,
    embedding_fn=embedding_fn,
    hidden_size=768,
    output_size=128,
).to(device)

def print_trainable_parameters(model):
    """Print the name and shape of every trainable parameter of ``model``,
    followed by the total number of trainable scalars.

    Args:
        model: any ``nn.Module``. Only parameters with ``requires_grad=True``
            are listed and counted.
    """
    total_params = 0
    for name, param in model.named_parameters():
        if param.requires_grad:
            print(f"Parameter name: {name}, Shape: {param.size()}")
            # Count from the argument itself, not from a notebook global, so
            # the total is correct for whichever model is passed in.
            total_params += param.numel()
    print(f"Total trainable parameters: {total_params}")
# Sanity-check the architecture: list every trainable tensor and the total count.
print_trainable_parameters(image_user_sim_model)

Parameter name: image_mlp.layers.0.weight, Shape: torch.Size([768, 768])
Parameter name: image_mlp.layers.0.bias, Shape: torch.Size([768])
Parameter name: image_mlp.layers.1.weight, Shape: torch.Size([128, 768])
Parameter name: image_mlp.layers.1.bias, Shape: torch.Size([128])
Parameter name: user_embeddings.embedding_userGender.weight, Shape: torch.Size([2, 1])
Parameter name: user_embeddings.embedding_userAgeIds.weight, Shape: torch.Size([15, 7])
Parameter name: user_embeddings.embedding_userPrefectureIds.weight, Shape: torch.Size([48, 24])
Parameter name: user_mlp.layers.0.weight, Shape: torch.Size([768, 32])
Parameter name: user_mlp.layers.0.bias, Shape: torch.Size([768])
Parameter name: user_mlp.layers.1.weight, Shape: torch.Size([128, 768])
Parameter name: user_mlp.layers.1.bias, Shape: torch.Size([128])
Total trainable parameters: 814059


In [13]:
# Smoke-test inputs: one precomputed image embedding plus the first row's user attributes.
# map_location='cpu' lets an embedding that was saved from a GPU session load on a
# CPU-only machine; the tensor is moved to `device` right before the forward pass.
test_image_feature = torch.load(
    '../features_from_samples_ly/image_feature/image_embeds_A_105_0.pt',
    map_location='cpu',
)
print(test_image_feature.shape)

# Slice with 0:1 (rather than index 0) so each attribute keeps its leading batch axis.
test_userGender = user_data_dict['userGender'][0:1]
test_userAgeIds = user_data_dict['userAgeIds'][0:1]
test_userPrefectureIds = user_data_dict['userPrefectureIds'][0:1]
print(test_userGender, test_userAgeIds, test_userPrefectureIds)

torch.Size([1, 768])
tensor([[0]]) tensor([[9]]) tensor([[0]])


In [14]:
# Push one (image, user) pair through the untrained model to check shapes end to end.
smoke_inputs = (test_image_feature, test_userGender, test_userAgeIds, test_userPrefectureIds)
sim = image_user_sim_model(*(tensor_in.to(device) for tensor_in in smoke_inputs))
print(sim)
print(sim.shape)

tensor([-0.0766], device='cuda:0', grad_fn=<SumBackward1>)
torch.Size([1])


# construct dataset and training loop

In [15]:
class ImageUserDataset(Dataset):
    """Pairs of image features (A and B) with user attributes and a label.

    Sample ``i`` is built from position ``i`` of every input, so all inputs
    must have the same length.

    Args:
        image_features_A (Tensor): features of image A, one row per sample.
        image_features_B (Tensor): features of image B, one row per sample.
        user_data_dict (dict): tensors under the keys 'userGender',
            'userAgeIds' and 'userPrefectureIds'.
        Q_labels (Tensor): one label per sample.

    Raises:
        AssertionError: if any input's length differs from ``image_features_A``.
    """

    def __init__(self, image_features_A, image_features_B, user_data_dict, Q_labels):
        self.image_features_A = image_features_A
        self.image_features_B = image_features_B
        self.user_data_dict = user_data_dict
        self.userGender = self.user_data_dict['userGender']
        self.userAgeIds = self.user_data_dict['userAgeIds']
        self.userPrefectureIds = self.user_data_dict['userPrefectureIds']
        self.labels = Q_labels

        # Validate every input, not just gender: a longer tensor would be
        # silently mis-paired and a shorter one would fail mid-epoch.
        expected = len(self.image_features_A)
        lengths = {
            'image_features_B': len(self.image_features_B),
            'userGender': len(self.userGender),
            'userAgeIds': len(self.userAgeIds),
            'userPrefectureIds': len(self.userPrefectureIds),
            'Q_labels': len(self.labels),
        }
        mismatched = {name: size for name, size in lengths.items() if size != expected}
        assert not mismatched, (
            'warning: user length and image feature mismatch! '
            f'expected {expected} rows, got {mismatched}'
        )

    def __len__(self):
        return len(self.image_features_A)

    def __getitem__(self, index):
        """Return (image_A, image_B, userGender, userAgeIds, userPrefectureIds, label) at ``index``."""
        return (
            self.image_features_A[index],
            self.image_features_B[index],
            self.userGender[index],
            self.userAgeIds[index],
            self.userPrefectureIds[index],
            self.labels[index],
        )

In [16]:
def calculate_sign_agreement_ignoring_zero(outputs, targets):
    """Return the signs of ``outputs`` and ``targets`` where neither is zero.

    Positions at which the prediction or the target equals 0 are discarded.
    Both results are 1-D tensors of -1/+1 values with matching length; the
    caller compares them to obtain the agreement itself.
    """
    # Drop a position as soon as either side is exactly zero.
    has_zero = (outputs == 0) | (targets == 0)
    keep = ~has_zero
    return torch.sign(outputs[keep]), torch.sign(targets[keep])
    
# Toy predictions and labels; the second prediction is exactly 0, so that row is filtered out.
outputs = torch.tensor([[-3.5], [0.0], [3.0], [-1.0]])  # Shape (4, 1)
targets = torch.tensor([[-1], [1], [1], [-1]])  # Shape (4, 1)

# Extract the signs of the rows that survive the zero filter. The function returns
# the two sign tensors; comparing them for agreement is left to the caller.
sign_outputs, sign_targets = calculate_sign_agreement_ignoring_zero(outputs, targets)
print("Sign Agreement:", sign_outputs, sign_targets)

Sign Agreement: tensor([-1.,  1., -1.]) tensor([-1,  1, -1])


In [17]:
# Prepare the image A features: load one precomputed embedding file per row of df_user
# and stack them along the batch axis.
# map_location='cpu' keeps the dataset on the host so it also loads on CPU-only machines;
# batches are moved to `device` inside the training loop.
image_A_data = [
    torch.load(
        f'../features_from_samples_ly/image_feature/image_embeds_{image_id}_all.pt',
        map_location='cpu',
    )
    for image_id in tqdm(df_user.Afilename)
]
image_A_features = torch.cat(image_A_data, dim=0)
print(image_A_features.shape)

  0%|          | 0/3800 [00:00<?, ?it/s]

torch.Size([3800, 768])


In [18]:
# Prepare the image B features: load one precomputed embedding file per row of df_user
# and stack them along the batch axis.
# map_location='cpu' keeps the dataset on the host so it also loads on CPU-only machines;
# batches are moved to `device` inside the training loop.
image_B_data = [
    torch.load(
        f'../features_from_samples_ly/image_feature/image_embeds_{image_id}_all.pt',
        map_location='cpu',
    )
    for image_id in tqdm(df_user.Bfilename)
]
image_B_features = torch.cat(image_B_data, dim=0)
print(image_B_features.shape)

  0%|          | 0/3800 [00:00<?, ?it/s]

torch.Size([3800, 768])


# create image-user dataset

In [19]:
# Combine, row by row, the A/B image features, the user attribute ids and the Q labels.
image_user_dataset = ImageUserDataset(image_A_features, image_B_features, user_data_dict, Q_labels)
# image_user_dataset[0]

In [20]:
# Split the dataset into train (80%) and held-out (20%) sets.
# The generator is seeded so that a Restart & Run All reproduces the same split;
# otherwise metrics are not comparable between runs and a row inspected later
# may silently move between the train and held-out sets.
train_size = int(0.8 * len(image_user_dataset))
test_size = len(image_user_dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(
    image_user_dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(42),
)
print(len(train_dataset), len(test_dataset))

3040 760


# Training loop

In [25]:
# Hyper-parameters
batch_size = 128
n_epoch = 500
lr = 5e-6

hidden_size = 32
output_size = 32

## instantiate
# Build fresh embedding tables with the same sizes as `embedding_fn`. Passing the
# `embedding_fn` modules directly would share their weights with the earlier
# smoke-test model and, worse, carry already-trained embeddings over whenever
# this cell is re-run.
fresh_embedding_fn = {
    name: nn.Embedding(table.num_embeddings, table.embedding_dim)
    for name, table in embedding_fn.items()
}
image_user_sim_model = ImageUserSimilarityModel(image_feature_size=768,
                                                embedding_fn=fresh_embedding_fn,
                                                hidden_size=hidden_size,
                                                output_size=output_size,
                                                image_n_layer=4,
                                                user_n_layer=4
                                               )
image_user_sim_model.to(device)

# Reshuffle the training batches every epoch; the evaluation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(image_user_sim_model.parameters(), lr=lr)

# TensorBoard writer used (as a global) by train_model.
writer = SummaryWriter('runs/my_experiment')
print_trainable_parameters(image_user_sim_model)
# dataset output: image_feature_A, image_feature_B,  userGender, userAgeIds, userPrefectureIds, label

def _run_epoch(model, loader, criterion, optimizer=None):
    """Run one pass over ``loader`` and return ``(loss, accuracy, f1)``.

    The model is trained when ``optimizer`` is given and only evaluated
    (eval mode, no gradients) otherwise.

    Returns:
        loss: mean of the per-batch losses.
        accuracy: percentage of samples whose predicted score difference has
            the same sign as the label, ignoring samples where either is 0.
        f1: macro F1 (in percent) over the same samples.
        ``accuracy`` and ``f1`` are NaN when no sample could be scored.
    """
    is_training = optimizer is not None
    model.train(is_training)

    running_loss = 0.0
    correct = 0
    total = 0
    predictions = []
    true_labels = []

    with torch.set_grad_enabled(is_training):
        for (batch_imageA_features, batch_imageB_features, batch_userGender,
             batch_userAgeIds, batch_userPrefectureIds, batch_labels) in loader:
            # Both forward passes use the same user, so move its tensors once.
            user_inputs = (
                batch_userGender.to(device),
                batch_userAgeIds.to(device),
                batch_userPrefectureIds.to(device),
            )
            if is_training:
                optimizer.zero_grad()

            sim_A = model(batch_imageA_features.to(device), *user_inputs)
            sim_B = model(batch_imageB_features.to(device), *user_inputs)

            # -1 prefer A, +1 prefer B, so score > 0 means B better, diff = B - A.
            # reshape(-1) keeps a 1-D tensor even for a single-sample batch,
            # where squeeze() would collapse it to a 0-d scalar.
            batch_score_diff = (sim_B - sim_A).reshape(-1)
            loss = criterion(batch_score_diff, batch_labels.float().to(device))
            if is_training:
                loss.backward()
                optimizer.step()

            running_loss += loss.item()
            sign_outputs, sign_targets = calculate_sign_agreement_ignoring_zero(
                batch_score_diff.detach().cpu(), batch_labels
            )
            predictions.extend(sign_outputs.detach().cpu().numpy())
            true_labels.extend(sign_targets.detach().cpu().numpy())
            total += sign_targets.size(0)
            correct += (sign_outputs == sign_targets).sum().item()

    mean_loss = running_loss / len(loader)
    if total == 0:
        # Every label (or prediction) was 0: there is nothing to score.
        return mean_loss, float('nan'), float('nan')

    accuracy = 100 * correct / total
    f1 = f1_score(np.array(true_labels), np.array(predictions), average='macro') * 100
    return mean_loss, accuracy, f1


def train_model(model, train_loader, validation_loader, criterion, optimizer, epochs):
    """Train ``model`` for ``epochs`` epochs, validating after each one.

    Loss, sign accuracy and macro F1 are logged per epoch for both loaders to
    the notebook-level TensorBoard ``writer`` and printed. Relies on the
    notebook globals ``device``, ``writer``, ``f1_score`` and
    ``calculate_sign_agreement_ignoring_zero``.

    Args:
        model: called as ``model(image_features, userGender, userAgeIds,
            userPrefectureIds)``; must return one score per sample.
        train_loader: loader yielding the 6-tuples used for optimisation.
        validation_loader: loader yielding the 6-tuples used for evaluation.
        criterion: loss between the predicted score difference and the label.
        optimizer: optimizer over ``model``'s parameters.
        epochs: number of passes over ``train_loader``.
    """
    try:
        for epoch in range(epochs):
            train_loss, train_accuracy, train_f1_score = _run_epoch(
                model, train_loader, criterion, optimizer
            )

            # Log training loss and accuracy
            writer.add_scalar('Loss/train', train_loss, epoch)
            writer.add_scalar('Accuracy/train', train_accuracy, epoch)
            writer.add_scalar('F1-Score/train', train_f1_score, epoch)

            valid_loss, valid_accuracy, val_f1_score = _run_epoch(
                model, validation_loader, criterion
            )

            # Log validation loss and accuracy
            writer.add_scalar('Loss/validation', valid_loss, epoch)
            writer.add_scalar('Accuracy/validation', valid_accuracy, epoch)
            writer.add_scalar('F1-Score/validation', val_f1_score, epoch)

            print(f'Epoch {epoch+1}, Train Loss: {train_loss:.4f}, train acc: {train_accuracy:.4f}, train F1: {train_f1_score:.4f}',
                  f'Val Loss: {valid_loss:.4f}, valid acc: {valid_accuracy:.4f}, valid F1: {val_f1_score:.4f}')

        print('Finished Training')
    finally:
        # Close the writer even when training is interrupted, so that the
        # scalars logged so far are flushed to disk.
        writer.close()

# Train on the 80% split; the held-out 20% split serves as the validation set after every epoch.
train_model(image_user_sim_model, train_loader, test_loader, criterion, optimizer, epochs=n_epoch)


Parameter name: image_mlp.layers.0.weight, Shape: torch.Size([32, 768])
Parameter name: image_mlp.layers.0.bias, Shape: torch.Size([32])
Parameter name: image_mlp.layers.1.weight, Shape: torch.Size([32, 32])
Parameter name: image_mlp.layers.1.bias, Shape: torch.Size([32])
Parameter name: image_mlp.layers.2.weight, Shape: torch.Size([32, 32])
Parameter name: image_mlp.layers.2.bias, Shape: torch.Size([32])
Parameter name: image_mlp.layers.3.weight, Shape: torch.Size([32, 32])
Parameter name: image_mlp.layers.3.bias, Shape: torch.Size([32])
Parameter name: user_embeddings.embedding_userGender.weight, Shape: torch.Size([2, 1])
Parameter name: user_embeddings.embedding_userAgeIds.weight, Shape: torch.Size([15, 7])
Parameter name: user_embeddings.embedding_userPrefectureIds.weight, Shape: torch.Size([48, 24])
Parameter name: user_mlp.layers.0.weight, Shape: torch.Size([32, 32])
Parameter name: user_mlp.layers.0.bias, Shape: torch.Size([32])
Parameter name: user_mlp.layers.1.weight, Shape: t

KeyboardInterrupt: 

In [None]:
# Save the model weights. Create the output directory first: torch.save does not
# create missing parent directories and would otherwise fail on a fresh checkout.
save_dir = Path('save')
save_dir.mkdir(parents=True, exist_ok=True)
torch.save(
    image_user_sim_model.state_dict(),
    save_dir / f'image_user_sim_model_bs{batch_size}_ep{n_epoch}_lr{lr}_hs{hidden_size}_os{output_size}.pth',
)


# test model performance

In [105]:
# Inspect the model's prediction for a single survey row.
# NOTE(review): row 3100 is taken from the full table, so it may belong to the
# training split rather than the held-out split — confirm before reading this
# as a generalisation check.
i = 3100
print(df_user.iloc[i, :])
image_id_A = df_user.iloc[i, :].Afilename
image_id_B = df_user.iloc[i, :].Bfilename
user_id = i
# map_location='cpu' lets embeddings saved from a GPU session load on a CPU-only machine.
test_image_feature_A = torch.load(
    f'../features_from_samples_ly/image_feature/image_embeds_{image_id_A}_all.pt',
    map_location='cpu',
)
test_image_feature_B = torch.load(
    f'../features_from_samples_ly/image_feature/image_embeds_{image_id_B}_all.pt',
    map_location='cpu',
)

# Report the tensor loaded in this cell, not the stale one left over from the
# earlier smoke-test cell.
print(test_image_feature_A.shape)

# Slice (not index) so each attribute keeps its leading batch axis.
test_userGender = user_data_dict['userGender'][user_id:user_id+1]
test_userAgeIds = user_data_dict['userAgeIds'][user_id:user_id+1]
test_userPrefectureIds = user_data_dict['userPrefectureIds'][user_id:user_id+1]
print(test_userGender, test_userAgeIds, test_userPrefectureIds)

# Inference only: evaluation mode and no autograd graph.
image_user_sim_model.eval()
with torch.no_grad():
    sim_A = image_user_sim_model(test_image_feature_A.to(device), test_userGender.to(device), test_userAgeIds.to(device), test_userPrefectureIds.to(device))
    sim_B = image_user_sim_model(test_image_feature_B.to(device), test_userGender.to(device), test_userAgeIds.to(device), test_userPrefectureIds.to(device))

# Positive difference means image B is predicted to be preferred over image A.
Q_pred = (sim_B.item() - sim_A.item())
print(sim_A.item(), sim_B.item(), Q_pred, )

Afilename            D_60
Bfilename            C_26
userGender              0
userAgeIds             10
userPrefectureIds      19
userAddressIds        711
Q1                    0.0
Q_label                 2
Name: 3100, dtype: object
torch.Size([1, 768])
tensor([[0]]) tensor([[10]]) tensor([[19]])
-0.42924386262893677 -0.05177712440490723 0.37746673822402954
