In [70]:
from transformers import AutoTokenizer
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling  
from transformers import Trainer, TrainingArguments 
import torch  
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer 
import torch.nn.functional as F 
import os
#os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'
from torch import nn
from math import sqrt

In [71]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split

# Set seed for reproducibility
np.random.seed(0)

# Create the list of pins, users and actions
pins = list(range(1,1001))
users = list(range(1,101))
actions = ['click', 'closeup', 'save']

# Creating empty DataFrame
df = pd.DataFrame(columns=['user', 'pin', 'action', 'timestamp'])

# Populating the DataFrame
for user in users:
    num_pins = np.random.randint(3, 21)  # user engages with min 3 pins and max 20 pins
    engaged_pins = np.random.choice(pins, num_pins, replace=False)  # engaged pins for this user
    engaged_actions = np.random.choice(actions, num_pins)  # actions for this user
    timestamps = [datetime.now() - timedelta(days=x) for x in range(num_pins)]  # random timestamps for user engagement
    
    temp_df = pd.DataFrame({
        'user': user,
        'pin': engaged_pins,
        'action': engaged_actions,
        'timestamp': timestamps
    })
    
    df = pd.concat([df, temp_df])


In [72]:
df.head()

Unnamed: 0,user,pin,action,timestamp
0,1,709,click,2023-07-25 22:27:12.452102
1,1,534,click,2023-07-24 22:27:12.452679
2,1,299,click,2023-07-23 22:27:12.452684
3,1,357,click,2023-07-22 22:27:12.452686
4,1,834,closeup,2023-07-21 22:27:12.452688


In [73]:
#map actions to integers
action_to_int = {'click': 0, 'closeup': 1, 'save': 2}
df['action'] = df['action'].map(action_to_int)


In [74]:
# Split users into train and test users
train_users, test_users = train_test_split(users, test_size=0.2)

train_df = df[df['user'].isin(train_users)]
test_df = df[df['user'].isin(test_users)]


In [75]:
#find the max and min timestamp in train_df
#cutoff_times is 2 weeks prior to the max timestamp
max_timestamp = train_df['timestamp'].max()
min_timestamp = train_df['timestamp'].min()
cut_off_train = max_timestamp - timedelta(days=14)

# Create X_train and y_train
X_train = train_df[train_df['timestamp'] > cut_off_train]
y_train = train_df[(train_df['timestamp'] <= cut_off_train)]

# Create X_test and y_test
X_test = test_df[test_df['timestamp'] > cut_off_train]
y_test = test_df[(test_df['timestamp'] <= cut_off_train)]


In [76]:
y_train.head()

Unnamed: 0,user,pin,action,timestamp
14,1,863,1,2023-07-11 22:27:12.452703
14,2,665,0,2023-07-11 22:27:12.458357
15,2,781,1,2023-07-10 22:27:12.458358
16,2,353,0,2023-07-09 22:27:12.458360
14,4,272,1,2023-07-11 22:27:12.461942


In [77]:


#if the list is empty add 5 0's or list is less then length 5 add 0 to make it 5 and it if greater then 5 trim it

def pad_or_trim(lst):
    if len(lst) == 0:
        return [0,0,0,0,0]
    elif len(lst) < 5:
        return [0]*(5-len(lst)) + lst
    else:
        return lst[:5]

X_train = X_train.sort_values(by=['timestamp'])
X_train = X_train.groupby('user').agg({'pin': list, 'action': list,'timestamp': list}).reset_index()
X_train['pin'] = X_train['pin'].apply(lambda x: pad_or_trim(x))
X_train['action'] = X_train['action'].apply(pad_or_trim)
X_train['timestamp'] = X_train['timestamp'].apply(pad_or_trim)

y_train = y_train.sort_values(by=['timestamp'])
y_train = y_train.groupby('user').agg({'pin': list, 'action': list,'timestamp': list}).reset_index()
y_train['pin'] = y_train['pin'].apply(pad_or_trim)
y_train['action'] = y_train['action'].apply(pad_or_trim)
y_train['timestamp'] = y_train['timestamp'].apply(pad_or_trim)


X_test = X_test.sort_values(by=['timestamp'])   
X_test = X_test.groupby('user').agg({'pin': list, 'action': list,'timestamp': list}).reset_index()
X_test['pin'] = X_test['pin'].apply(pad_or_trim)
X_test['action'] = X_test['action'].apply(pad_or_trim)
X_test['timestamp'] = X_test['timestamp'].apply(pad_or_trim)

y_test = y_test.sort_values(by=['timestamp'])
y_test = y_test.groupby('user').agg({'pin': list, 'action': list,'timestamp': list}).reset_index()
y_test['pin'] = y_test['pin'].apply(pad_or_trim)
y_test['action'] = y_test['action'].apply(pad_or_trim)
y_test['timestamp'] = y_test['timestamp'].apply(pad_or_trim)



In [78]:
X_train[X_train['user'] == 2]

Unnamed: 0,user,pin,action,timestamp
1,2,"[532, 164, 688, 779, 19]","[2, 2, 0, 1, 0]","[2023-07-12 22:27:12.458356, 2023-07-13 22:27:..."


In [79]:
y_train[y_train['user'] == 2]

Unnamed: 0,user,pin,action,timestamp
1,2,"[0, 0, 353, 781, 665]","[0, 0, 0, 1, 0]","[0, 0, 2023-07-09 22:27:12.458360, 2023-07-10 ..."


In [121]:
#DEFINE ALL THE PARAMETERS FOR THE MODEL
hidden_size = 12 #hidden size of transformer and the final output coming from the transformer
pin_embedding_dim = 12 #dimension of the pin embedding
action_embedding_dim = 3 #dimension of the action embedding
num_attention_heads = 12 #number of attention heads in transformer that is concatenated together to form the final attention head of dimension 768
pins_vocab = 1001 #number of pins
actions_vocab = 3 #number of actions
num_layers = 12 #number of transformer layers in the model which is a replication of the same transformer layer whose input is the output of the previous layer and the output is the input to the next layer
dropout = 0.1
max_length = 5 #maximum length of the input sequence
batch_size = 8 #batch size for training
epochs = 1
learning_rate = 0.0001

In [333]:
class Embeddings(nn.Module):
  """
  Creates a single Dense Embedding for each token --> Token Embedding + Positional Embedding
  """
  def __init__(self,pins_vocab,pin_embedding_dim,actions_vocab, action_embedding_dim, hidden_size):
    super().__init__()
    self.pin_embedding = nn.Embedding(pins_vocab, pin_embedding_dim)
    self.action_type_embedding = nn.Embedding(actions_vocab, action_embedding_dim)
    self.linear = nn.Linear(pin_embedding_dim+action_embedding_dim, hidden_size)
    self.position_embedding = nn.Embedding(5, hidden_size)
    self.layer_norm = nn.LayerNorm(hidden_size, eps= 1e-12)
    self.dropout = nn.Dropout()

  def forward(self,pin_ids,action_ids):
    # pin_ids = [batch_size, seq_len]
    # action_ids = [batch_size, seq_len]
    # position_ids = [batch_size, seq_len]
    batch_size, seq_len = pin_ids.shape
    position_ids = torch.arange(seq_len, dtype=torch.long).expand((batch_size, seq_len))
    # position_ids = [batch_size, seq_len]
    pin_embeddings = self.pin_embedding(pin_ids)
    # pin_embeddings = [batch_size, seq_len, hidden_size]
    action_embeddings = self.action_type_embedding(action_ids)
    # action_embeddings = [batch_size, seq_len, hidden_size]
    position_embeddings = self.position_embedding(position_ids)
    #print(position_embeddings.shape)

    #concatenate all the pin and action embeddings
    embeddings = torch.cat([pin_embeddings, action_embeddings], dim=-1)
    #apply a linear layer to get the embeddings to the desired hidden size

    embeddings = self.linear(embeddings)
    #print(embeddings.shape)

    #add the position embeddings
    embeddings = embeddings + position_embeddings

    #embeddings = pin_embeddings + action_embeddings + position_embeddings
    # embeddings = [batch_size, seq_len, hidden_size]
    embeddings = self.layer_norm(embeddings)
    embeddings = self.dropout(embeddings)
    return embeddings
    

In [123]:
position_ids = torch.arange(5, dtype=torch.long).expand((8, 5))
embedded =  nn.Embedding(5, 3)
position_embeddings = embedded(position_ids)
position_embeddings.shape

torch.Size([8, 5, 3])

In [124]:
embedded = Embeddings(1000,256,3,3,12)

#sample embeddings

pin_ids = torch.tensor([[1,2,3,4,5],[1,2,3,4,5]])
print(pin_ids.shape)
action_ids = torch.tensor([[1,2,0,2,1],[1,2,0,2,1]])
embedded(pin_ids,action_ids)





torch.Size([2, 5])
torch.Size([2, 5, 12])
torch.Size([2, 5, 12])


tensor([[[ 0.0000, -0.0000, -0.0000,  0.0000, -1.9729,  1.9645,  0.7143,
          -1.0371, -0.0000, -1.7302,  3.0014,  2.3532],
         [ 2.3744, -5.0341,  0.0000,  1.6825, -0.7166,  0.0000, -1.3007,
           0.0000,  0.1960, -2.4292,  0.7411,  0.8138],
         [ 0.0000, -0.5715,  0.0000, -3.3566,  0.0000,  0.0000, -0.0000,
          -0.0000,  0.0000, -0.0000,  0.0000,  1.5766],
         [ 0.6187, -0.0000, -0.0000,  0.0000,  0.7198,  0.0000, -0.0000,
           2.2676,  2.7325, -0.7301, -0.0492, -2.4983],
         [ 0.0000, -0.7059,  0.0000, -0.0000,  0.0000,  1.4348, -0.0000,
           0.0000, -0.0000, -0.0000,  1.0154,  1.0361]],

        [[ 0.4384, -2.0169, -0.0000,  0.0000, -1.9729,  0.0000,  0.7143,
          -1.0371, -2.6834, -1.7302,  0.0000,  0.0000],
         [ 0.0000, -0.0000,  1.4207,  1.6825, -0.7166,  0.0000, -1.3007,
           0.0000,  0.0000, -0.0000,  0.7411,  0.8138],
         [ 1.6773, -0.0000,  1.8410, -3.3566,  0.0000,  0.6709, -0.0000,
          -0.0000,  0.

In [125]:
class pinformer_dataset(Dataset):
  """
  Creates a dataset for the pinformer model
  """
  def __init__(self,df):
    self.df = df

  def __len__(self):
    return len(self.df)

  def __getitem__(self,idx):
    pin_ids = torch.tensor(self.df.iloc[idx]['pin'])
    action_ids = torch.tensor(self.df.iloc[idx]['action'])

    target_pin_ids = torch.tensor(self.df.iloc[idx]['pin']) 
    #pick one random non-zero pin from the list of pins and make it the target pin
    target_pin_ids = target_pin_ids[target_pin_ids != 0]
    target_pin_ids = target_pin_ids[torch.randperm(len(target_pin_ids))]
    target_pin_ids = target_pin_ids[0]

    return pin_ids,action_ids, target_pin_ids

train_df = pinformer_dataset(X_train)
test_df = pinformer_dataset(X_test)

train_data_loader = DataLoader(train_df, batch_size=8, shuffle=False) #shuffle is false because we want to preserve the order of the sequence so user 1 embeddings are in sequence and user 2 embeddings are in sequence and so on and can be pulled 
test_data_loader = DataLoader(test_df, batch_size=8, shuffle=False)

In [332]:
from torch import nn
class AttentionHead(nn.Module):
  def __init__(self, embed_dim, head_dim):
    super().__init__()
    self.head_dim = head_dim #dimension of one head 
    #infeatures=embed_dim
    #outfeatures=head_dim
    self.q = nn.Linear(embed_dim, head_dim)
    self.k = nn.Linear(embed_dim, head_dim)
    self.v = nn.Linear(embed_dim, head_dim)
    
  
  def causal_mask(self,batch_size,size, dtype):  
    mask = torch.tril(torch.ones(size,size)).unsqueeze(0)
    return mask
    
  
      
  def scaled_dot_product_attention(self,query, key, value):
    dim_k = query.size(dim=-1)  
    #print(dim_k)    
    #print(f'Dimension of the q,k,v Matrix [Batch_size, seq_len, Head_dim] of One Head {dim_k}')
    scores = torch.bmm(query,key.transpose(1,2))/ sqrt(dim_k)  #[(1,5,768)*(1,768,5)]/sqrt(768) >>> [batch_size,5,5] 
    
    mask = self.causal_mask(scores.size(0),scores.size(1),dtype=torch.int32)
    #print(mask)
    scores = scores.masked_fill(mask==0, float(0)) 
    weights = F.softmax(scores, dim=-1) #[batch_size,5,5]
    #print(weights)
    #print(f'Softmax for each column across one row {weights}')
    weights_dot_values = torch.bmm(weights,value)  #[batch_size,5,5]*[batch_size,5,64] >>> [batch_size,5,64]
    #print(f'Last Step is to multiply weights and values {weights_dot_values.shape}')
    return weights_dot_values 

  def forward(self, hidden_state):
    #head_state = [batch_size, seq_len, embed_dim]
    #print(f'Input Embedding for Each Token with X Matrix {hidden_state.size()}')
    #q = X*W_q
    q = self.q(hidden_state) #q = [batch_size, seq_len, head_dim]
    #print(f'Shape of the Query Matrix W_q {q.size()}')
    k = self.k(hidden_state) #k = [batch_size, seq_len, head_dim]
    #print(f'Shape of the Key Matrix W_k {k.size()}')
    v = self.k(hidden_state) #v = [batch_size, seq_len, head_dim]
    #print(f'Shape of the Value Matrix W_k {v.size()}')
    #print('-----------------Calculating Self Attention--------------------')
    attn_outputs = self.scaled_dot_product_attention(q,k,v) #attn_outputs = [batch_size, seq_len, head_dim]
    #print(f'Shape of the attention Output with one Head and Head Dimension {self.head_dim} is {attn_outputs.size()}')
    return attn_outputs

In [127]:
#create a single attention head
attention_head = AttentionHead(hidden_size,12)

#sample input first calculate the embeddings for the input
pin_ids = torch.tensor([[1,2,3,4,5],[1,2,3,4,5]])
print(pin_ids.shape)
action_ids = torch.tensor([[1,2,0,2,1],[1,2,0,2,1]])
embedded(pin_ids,action_ids)

#pass the embeddings to the attention head
attention_head(embedded(pin_ids,action_ids))


torch.Size([2, 5])
torch.Size([2, 5, 12])
torch.Size([2, 5, 12])
torch.Size([2, 5, 12])
torch.Size([2, 5, 12])
Softmax for each column across one row tensor([[[0.1813, 0.2047, 0.2047, 0.2047, 0.2047],
         [0.1159, 0.1269, 0.2524, 0.2524, 0.2524],
         [0.2127, 0.2061, 0.1703, 0.2055, 0.2055],
         [0.2413, 0.1478, 0.2441, 0.0915, 0.2753],
         [0.0726, 0.1731, 0.1666, 0.3286, 0.2592]],

        [[0.0613, 0.2347, 0.2347, 0.2347, 0.2347],
         [0.2511, 0.1284, 0.2068, 0.2068, 0.2068],
         [0.1504, 0.1901, 0.1602, 0.2497, 0.2497],
         [0.3503, 0.2598, 0.0594, 0.0975, 0.2329],
         [0.2109, 0.1663, 0.1790, 0.1324, 0.3115]]],
       grad_fn=<SoftmaxBackward0>)


tensor([[[-0.2134,  0.5469, -0.6747,  0.0682,  0.3882,  0.7637, -0.4783,
           0.1805,  0.0934, -1.1307,  0.9239,  0.1898],
         [-0.1293,  0.4728, -0.6514,  0.3049,  0.2166,  0.6398, -0.6145,
           0.0852,  0.2100, -1.0837,  0.8449,  0.1860],
         [-0.2304,  0.5642, -0.6642,  0.0091,  0.4302,  0.7815, -0.4727,
           0.1852,  0.0538, -1.1268,  0.9252,  0.1836],
         [-0.1992,  0.2660, -0.7336, -0.1756,  0.3842,  0.7326, -0.3967,
           0.1669,  0.3257, -1.0714,  0.5576,  0.0425],
         [-0.0995,  0.6450, -0.6679,  0.4365,  0.2551,  0.6681, -0.7024,
           0.0183,  0.1056, -1.1319,  1.0081,  0.2581]],

        [[ 0.2846,  0.3132, -0.2668,  0.6293,  0.4161,  0.3875, -0.3811,
           0.0883,  0.3151, -0.6666,  0.0197,  0.5414],
         [ 0.4155,  0.2769, -0.4094,  0.5609,  0.6215,  0.5521, -0.1947,
           0.2095,  0.2913, -0.5987, -0.0062,  0.5744],
         [ 0.3952,  0.2424, -0.3790,  0.5326,  0.5539,  0.4447, -0.3030,
           0.1818,  0.

In [128]:
class MultiHeadAttention(nn.Module):
  def __init__(self, hidden_size, num_attention_heads):
    super().__init__()
    embed_dim = hidden_size
    num_heads = num_attention_heads
    head_dim = embed_dim // num_heads
    self.heads = [AttentionHead(embed_dim, head_dim) for _ in range(num_heads)] #initializing all the heads
    self.w_0 = nn.Linear(embed_dim,embed_dim) #the purpose of this linear layer is to concatenate all the heads together to form the final output of the multihead attention

  def forward(self,hidden_state):
    '''
    hidden_state: Input Embedding with dimensions [batch_size, seq_len, embedding_dimension]
    '''
    attention_outputs = [head(hidden_state) for head in self.heads] #Calculating Self-Attention on each head
    contcat_attn_outputs_allheads = torch.cat(attention_outputs, dim=-1) #[batch_size,seq_len, embed_dim]
    Z =   self.w_0(contcat_attn_outputs_allheads) #[batch_size, seq_len, embed_dim]
    return Z

In [129]:
class FeedForward(nn.Module):
  def __init__(self,hidden_size):
    super().__init__()
    self.linear1 = nn.Linear(hidden_size, 3072)
    self.linear2 = nn.Linear(3072, hidden_size)
    self.gelu = nn.GELU()
    self.dropout = nn.Dropout(0.1)
  
  def forward(self, attention_outputs):
    output_l1 = self.linear1(attention_outputs)
    activated_outputs = self.gelu(output_l1)
    output_l2 = self.linear2(activated_outputs)
    output = self.dropout(output_l2)
    return output

In [130]:
class TransformerDecoderLayer(nn.Module):
  def __init__(self, hidden_size, num_attention_heads):
    super(TransformerDecoderLayer,self).__init__()
    self.layer_norm1 = nn.LayerNorm(hidden_size)
    self.layer_norm2 = nn.LayerNorm(hidden_size)
    self.multi_attention = MultiHeadAttention(hidden_size, num_attention_heads)
    self.feedforward = FeedForward(hidden_size)

  def forward(self, input_embeddings):
     #pre-layer normalization approach
     
     #Step 1: Applying Layer Normalization to Input Embeddings
     normalized_input_embeddings = self.layer_norm1(input_embeddings)
     
     #Step 2: Applying MultiHeadAttention to Normalized Output
     multi_head_attn = self.multi_attention(normalized_input_embeddings)
     
     #Step 3: Add input embeddings to the Multihead Attention Output
     skip_connection_1 = input_embeddings + multi_head_attn

     #step 4: Pass the output to another Layer Normalization 
     layer_norm_2 = self.layer_norm2(skip_connection_1)

     #Step 5: Adding skip connection 1 outputs to the output of the FeedForward Network (applied on Step 4)
     skip_connection_2 = skip_connection_1 + self.feedforward(layer_norm_2)
     #print(f'output of MultiHeadAttention and FeedForward Network is {skip_connection_2.shape}')
     return skip_connection_2

In [131]:
class TransferDecoder(nn.Module):
  def __init__(self,num_attention_heads,num_layers, pins_vocab,actions_vocab, hidden_size):
    super().__init__()
    self.embedding = Embeddings(pins_vocab, pin_embedding_dim, actions_vocab, action_embedding_dim, hidden_size)
    self.layers = nn.ModuleList([TransformerDecoderLayer(hidden_size, num_attention_heads) for _ in range(num_layers)]) 
                                
  def forward(self, pin_ids, action_ids):
    embeddings = self.embedding(pin_ids, action_ids) #in: [batch_size, seq_len] out: [batch_size, seq_len, hidden_size]
    for layer in self.layers:
      embeddings = layer(embeddings)
    return embeddings


In [132]:
X_train.iloc[0]['pin']

[240, 986, 1000, 675, 995]

In [133]:
embedding_decoder = TransferDecoder(num_attention_heads,num_layers, pins_vocab,actions_vocab, hidden_size)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#need to pull embeddings for each user and then store it in a dictionary
user1_pins = torch.tensor(X_train.iloc[0]['pin']).unsqueeze(0) #unserequeeze is used to add a dimension to the tensor for batch size dimension is [1,5] for batch size 1 and sequence length 5
user1_actions = torch.tensor(X_train.iloc[0]['action']).unsqueeze(0) #unserequeeze is used to add a dimension to the tensor for batch size dimension is [1,5] for batch size 1 and sequence length 5





In [134]:
user1_embeddings = embedding_decoder(user1_pins,user1_actions)
user1_embeddings.shape

torch.Size([1, 5, 12])
torch.Size([1, 5, 12])
Softmax for each column across one row tensor([[[0.2023, 0.1994, 0.1994, 0.1994, 0.1994],
         [0.1910, 0.2268, 0.1941, 0.1941, 0.1941],
         [0.1998, 0.1550, 0.2549, 0.1952, 0.1952],
         [0.1958, 0.2675, 0.1451, 0.1901, 0.2015],
         [0.1964, 0.1817, 0.2117, 0.1979, 0.2123]]],
       grad_fn=<SoftmaxBackward0>)
Softmax for each column across one row tensor([[[0.1971, 0.2007, 0.2007, 0.2007, 0.2007],
         [0.2055, 0.1762, 0.2061, 0.2061, 0.2061],
         [0.1864, 0.2475, 0.1953, 0.1854, 0.1854],
         [0.1837, 0.2518, 0.1935, 0.1883, 0.1826],
         [0.1739, 0.3043, 0.1906, 0.1816, 0.1495]]],
       grad_fn=<SoftmaxBackward0>)
Softmax for each column across one row tensor([[[0.2004, 0.1999, 0.1999, 0.1999, 0.1999],
         [0.2286, 0.1693, 0.2007, 0.2007, 0.2007],
         [0.1827, 0.2035, 0.2311, 0.1914, 0.1914],
         [0.2396, 0.2119, 0.1833, 0.1379, 0.2272],
         [0.1493, 0.1759, 0.2135, 0.3123, 0.1490]

torch.Size([1, 5, 12])

In [135]:
#get embeddings for the first batch
for batch in train_data_loader:
    pin_ids, action_ids, target_pin_ids = batch
    print(pin_ids.shape)
    print(action_ids.shape)
    print(target_pin_ids.shape)
    print(target_pin_ids)
    embedding_decoder(pin_ids,action_ids)
    break
  

torch.Size([8, 5])
torch.Size([8, 5])
torch.Size([8])
tensor([1000,  688,  125,  739,  381,  515,  828,  598])
torch.Size([8, 5, 12])
torch.Size([8, 5, 12])
Softmax for each column across one row tensor([[[0.1848, 0.2038, 0.2038, 0.2038, 0.2038],
         [0.2570, 0.1473, 0.1986, 0.1986, 0.1986],
         [0.1860, 0.2087, 0.2129, 0.1962, 0.1962],
         [0.2075, 0.1960, 0.1940, 0.2004, 0.2021],
         [0.2085, 0.1965, 0.1945, 0.2011, 0.1993]],

        [[0.1694, 0.2076, 0.2076, 0.2076, 0.2076],
         [0.1420, 0.2926, 0.1885, 0.1885, 0.1885],
         [0.2541, 0.0973, 0.2997, 0.1745, 0.1745],
         [0.2331, 0.1125, 0.2642, 0.2149, 0.1753],
         [0.1745, 0.2875, 0.1601, 0.1844, 0.1935]],

        [[0.2049, 0.1988, 0.1988, 0.1988, 0.1988],
         [0.1978, 0.2022, 0.2000, 0.2000, 0.2000],
         [0.2143, 0.1655, 0.2437, 0.1882, 0.1882],
         [0.2330, 0.0952, 0.3633, 0.1598, 0.1487],
         [0.1698, 0.3160, 0.1248, 0.2206, 0.1688]],

        [[0.1471, 0.2132, 0.2132,

In [136]:
df.head()

Unnamed: 0,user,pin,action,timestamp
0,1,709,0,2023-07-25 22:27:12.452102
1,1,534,0,2023-07-24 22:27:12.452679
2,1,299,0,2023-07-23 22:27:12.452684
3,1,357,0,2023-07-22 22:27:12.452686
4,1,834,1,2023-07-21 22:27:12.452688


In [174]:
class RecommenderDataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        user = self.df.iloc[idx]['user']
        pin_ids = torch.tensor(self.df.iloc[idx]['pin_train'])
        action_ids = torch.tensor(self.df.iloc[idx]['action_train'])
        target_pin_ids = torch.tensor(self.df.iloc[idx]['pin_target'])
        target_action_ids = torch.tensor(self.df.iloc[idx]['action_target'])
        return user, pin_ids, action_ids, target_pin_ids, target_action_ids
       
      


In [175]:
X_train.head()

Unnamed: 0,user,pin,action,timestamp
0,1,"[240, 986, 1000, 675, 995]","[1, 0, 1, 2, 2]","[2023-07-12 22:27:12.452702, 2023-07-13 22:27:..."
1,2,"[532, 164, 688, 779, 19]","[2, 2, 0, 1, 0]","[2023-07-12 22:27:12.458356, 2023-07-13 22:27:..."
2,3,"[994, 180, 298, 761, 125]","[1, 0, 2, 0, 0]","[2023-07-16 22:27:12.460196, 2023-07-17 22:27:..."
3,4,"[631, 739, 799, 345, 886]","[2, 0, 2, 1, 2]","[2023-07-12 22:27:12.461940, 2023-07-13 22:27:..."
4,5,"[440, 333, 37, 381, 461]","[0, 2, 1, 0, 1]","[2023-07-17 22:27:12.463753, 2023-07-18 22:27:..."


In [176]:
merged_train = X_train.merge(y_train, on='user', how='inner')
merged_test = X_test.merge(y_test, on='user', how='inner')

In [177]:
merged_train.rename(columns={'pin_x': 'pin_train', 'action_x': 'action_train', 'timestamp_x': 'timestamp_train', 'pin_y':'pin_target', 'action_y': 'action_target', 'timestamp_y': 'timestamp_target'}, inplace=True)

In [178]:
merged_train.head()

Unnamed: 0,user,pin_train,action_train,timestamp_train,pin_target,action_target,timestamp_target
0,1,"[240, 986, 1000, 675, 995]","[1, 0, 1, 2, 2]","[2023-07-12 22:27:12.452702, 2023-07-13 22:27:...","[0, 0, 0, 0, 863]","[0, 0, 0, 0, 1]","[0, 0, 0, 0, 2023-07-11 22:27:12.452703]"
1,2,"[532, 164, 688, 779, 19]","[2, 2, 0, 1, 0]","[2023-07-12 22:27:12.458356, 2023-07-13 22:27:...","[0, 0, 353, 781, 665]","[0, 0, 0, 1, 0]","[0, 0, 2023-07-09 22:27:12.458360, 2023-07-10 ..."
2,4,"[631, 739, 799, 345, 886]","[2, 0, 2, 1, 2]","[2023-07-12 22:27:12.461940, 2023-07-13 22:27:...","[0, 0, 989, 979, 272]","[0, 0, 0, 2, 1]","[0, 0, 2023-07-09 22:27:12.461944, 2023-07-10 ..."
3,17,"[706, 443, 386, 104, 939]","[0, 1, 2, 2, 2]","[2023-07-12 22:27:12.484793, 2023-07-13 22:27:...","[0, 0, 0, 27, 511]","[0, 0, 0, 0, 1]","[0, 0, 0, 2023-07-10 22:27:12.484796, 2023-07-..."
4,20,"[559, 987, 673, 126, 355]","[1, 0, 2, 0, 2]","[2023-07-12 22:27:12.489565, 2023-07-13 22:27:...","[651, 124, 476, 866, 443]","[2, 0, 2, 1, 0]","[2023-07-07 22:27:12.489573, 2023-07-08 22:27:..."


In [302]:
train_dataset = RecommenderDataset(merged_train)
test_dataset = RecommenderDataset(merged_test)

train_data_loader = DataLoader(train_dataset, batch_size=8, shuffle=False)
test_data_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)


In [247]:
#get embeddings for the first batch
for batch in train_data_loader:

    pin_embeddings = nn.Embedding(pins_vocab, pin_embedding_dim)
    user, pin_ids, action_ids, target_pin_ids, target_action_ids = batch
    #print(user)
    #print(pin_ids.shape)
    #print(action_ids.shape)
    print(target_pin_ids.shape)
    #print(target_pin_ids)

    #user_embeddings = embedding_decoder(pin_ids,action_ids)
    #print(user_embeddings.shape)

    #for each user select the last embedding
    #user_embeddings = user_embeddings[:,-1,:]
    # Create a mask of non-zero elements
    mask = target_pin_ids != 0 #shape [batch_size, seq_len]
    # Get the indices of non-zero elements
    indices = mask.nonzero() #shape [num_non_zero_elements, 2]

    #print(mask)
    #print(indices.shape)
    #print(indices)  
    # Get the unique row indices and the counts of non-zero elements in each row
    unique_rows, counts = indices[:, 0].unique(return_counts=True) #shape [num_unique_rows, 1] #this is pulling the unique row indices and the counts of non-zero elements in each row
    print(indices[:,0])
    #print(unique_rows.size(0))
    print(counts)
    print(counts.cumsum(0)[:-1])
    ## Get the starting indices of non-zero elements in each row in the 'indices' tensor
    starts = torch.cat((torch.tensor([0]), counts.cumsum(0)[:-1]))
#
    ## Generate a random number for each row in the range of the number of non-zero elements in the row
    rand_num = torch.randint(counts.min(), size=(unique_rows.size(0),)).to(target_pin_ids.device)
    print(rand_num)
    ## Add the starting indices to the random numbers to get the indices of the selected elements in the 'indices' tensor
    selected_indices = starts + rand_num
    print(selected_indices)
    ## Get the indices of the selected elements in the 'input_tensor'
    selected_elements_indices = indices[selected_indices]
    print(selected_elements_indices)
#
    ## Get the selected elements
    #selected_elements =target_pin_ids[selected_elements_indices[:, 0], selected_elements_indices[:, 1]]
    #print(selected_elements.unsqueeze(1))

    #for every user embedding multiply all the pin embeddings



    
    
    break

torch.Size([8, 5])
tensor([0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7])
tensor([1, 3, 3, 2, 5, 4, 4, 1])
tensor([ 1,  4,  7,  9, 14, 18, 22])
tensor([0, 0, 0, 0, 0, 0, 0, 0])
tensor([ 0,  1,  4,  7,  9, 14, 18, 22])
tensor([[0, 4],
        [1, 2],
        [2, 2],
        [3, 3],
        [4, 0],
        [5, 1],
        [6, 1],
        [7, 4]])


In [296]:
for batch in train_data_loader:

    pin_embeddings = nn.Embedding(pins_vocab, pin_embedding_dim)
    user, pin_ids, action_ids, target_pin_ids, target_action_ids = batch
    #print(user)
    #print(pin_ids.shape)
    #print(action_ids.shape)
    #print(target_pin_ids.shape)
    #print(target_pin_ids)

    #user_embeddings = embedding_decoder(pin_ids,action_ids)
    #print(user_embeddings.shape)

    #for each user select the last embedding
    #user_embeddings = user_embeddings[:,-1,:]
    # Create a mask of non-zero elements
    mask = target_pin_ids != 0 #shape [batch_size, seq_len]
    print(target_pin_ids)
    non_zero_elements = target_pin_ids[mask]
    print(non_zero_elements)
    indices = torch.multinomial(mask.float(), 1)
    indices[indices < 0] = 0
    print(indices)
    print(indices.shape)
    #print(indices.squeeze())
    print(indices.squeeze().shape)
    #print(torch.arange(indices.size(0)))
    # Reshape to 8x1
    print(torch.arange(target_pin_ids.shape[0]))
    random_numbers = target_pin_ids[torch.arange(target_pin_ids.shape[0]), indices.squeeze()]
    random_numbers = random_numbers.view(-1, 1)

    #print(random_numbers)   
    #embedded = pin_embeddings(random_numbers)
    #print(embedded.shape)
    #print(embedded.transpose(2,1).shape)
    # Get unique pins seen by user
    #negative sampling for each user
    # Find where the matrix has zeros
    unique_pins = torch.unique(target_pin_ids, dim=1)

       

  

    break

tensor([[  0,   0,   0,   0, 863],
        [  0,   0, 353, 781, 665],
        [  0,   0, 989, 979, 272],
        [  0,   0,   0,  27, 511],
        [651, 124, 476, 866, 443],
        [  0, 955, 345, 310,  63],
        [  0, 665, 288, 740, 103],
        [  0,   0,   0,   0, 841]])
tensor([863, 353, 781, 665, 989, 979, 272,  27, 511, 651, 124, 476, 866, 443,
        955, 345, 310,  63, 665, 288, 740, 103, 841])
tensor([[4],
        [3],
        [4],
        [3],
        [1],
        [4],
        [1],
        [4]])
torch.Size([8, 1])
torch.Size([8])
tensor([0, 1, 2, 3, 4, 5, 6, 7])


AttributeError: module 'torch' has no attribute 'random_choice'

In [300]:
class MyModel(nn.Module):
    def __init__(self, pins_vocab, pin_embedding_dim):
        super(MyModel, self).__init__()
        self.pin_embeddings = nn.Embedding(pins_vocab, pin_embedding_dim)

    def forward(self, pin_ids):
        return self.pin_embeddings(pin_ids)

# Outside your loop
model = MyModel(pins_vocab, pin_embedding_dim)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for batch in train_data_loader:

    user, pin_ids, action_ids, target_pin_ids, target_action_ids = batch

    mask = target_pin_ids != 0 #shape [batch_size, seq_len]
    indices = torch.multinomial(mask.float(), 1)
    indices[indices < 0] = 0

    random_numbers = target_pin_ids[torch.arange(target_pin_ids.shape[0]), indices.squeeze()]
    random_numbers = random_numbers.view(-1, 1)

    embedded = model(random_numbers)

    dot_product = torch.bmm(embedded, embedded.transpose(2,1))

    sigmoid = torch.sigmoid(dot_product)
    #print(sigmoid.shape)
    #squeeze the sigmoid
    preds = sigmoid.squeeze(1)
    #print(preds)
    print(preds.shape)


    negative_pin_embeddings = self.pin_embedding(torch.randint(0,  pin_embedding_shape, (8,)))
        negative_pin_dot_product = torch.bmm(user_embeddings, negative_pin_embeddings.transpose(2,1))
        negative_pin_sigmoid = torch.sigmoid(negative_pin_dot_product)

        positiv_target = torch.ones_like(sigmoid)
        negative_target = torch.zeros_like(negative_pin_sigmoid)



    target = torch.ones_like(preds) # or torch.zeros_like(preds) if you want the model to output values close to 0
    loss = F.binary_cross_entropy(preds, target)
    print(loss)

    optimizer.zero_grad() #zero out the gradients
    loss.backward() #calculate the gradients
    optimizer.step() #update the weights

    break


torch.Size([8, 1])
tensor(0.0001, grad_fn=<BinaryCrossEntropyBackward0>)


In [314]:
pin_embedding = nn.Embedding(pins_vocab, pin_embedding_dim)
pin_embedding_shape = pin_embedding.weight.shape[0]
print(pin_embedding_shape)

#sample negative pin embeddings for batch size 8
negative_indices = torch.randint(0,  pin_embedding_shape, (8,))
print(negative_indices)
negative_pin_embeddings = pin_embedding(negative_indices)
print(negative_pin_embeddings.shape)

1001
tensor([525, 903, 207, 753, 366, 929,  34, 566])
torch.Size([8, 12])


In [335]:
import pytorch_lightning as pl
class RecommenderSystem(pl.LightningModule):

    def __init__(self, num_attention_heads,num_layers, pins_vocab,actions_vocab, hidden_size, pin_embedding_dim, action_embedding_dim, learning_rate=0.0001):
        super().__init__()
        self.user_embedding = TransferDecoder(num_attention_heads,num_layers, pins_vocab,actions_vocab, hidden_size)
        self.pin_embedding = nn.Embedding(pins_vocab, pin_embedding_dim)
        
    def forward(self, user ,pin_ids, action_ids,target_pin_ids):
        print(f'user shape {user.shape[0]}')
         #positive pins sampled from the target pins
        mask = target_pin_ids != 0 #shape [batch_size, seq_len]
        indices = torch.multinomial(mask.float(), 1)
        indices[indices < 0] = 0
    
        random_numbers = target_pin_ids[torch.arange(target_pin_ids.shape[0]), indices.squeeze()]
        random_numbers = random_numbers.view(-1, 1)
    
        user_embeddings = self.user_embedding(pin_ids,action_ids)
        print(f'  user_embeddings  {user_embeddings.shape}')
        #extract the last embedding for each user
        user_embeddings = user_embeddings[:,-1,:]
        print(f'  user_embeddings  {user_embeddings.shape}')

        
        user_embeddings = user_embeddings.unsqueeze(1)
        print(f'  user_embeddings  {user_embeddings.shape}')
       

        positive_pin_embeddings = self.pin_embedding(random_numbers)
        print(f'  positive_pin_embeddings  {positive_pin_embeddings.shape}')
    
        dot_product = torch.bmm(user_embeddings, positive_pin_embeddings.transpose(2,1))
        dot_product = dot_product.squeeze(1)
        print(f'  dot_product  {dot_product.shape}')
        sigmoid = torch.sigmoid(dot_product)
        #negative pins sampled randomly
        
        pin_embedding_shape = self.pin_embedding.weight.shape[0]
        print(pin_embedding_shape)

        #sample negative pin embeddings for batch size 8
        negative_indices = torch.randint(0,  pin_embedding_shape, (user.shape[0],))
        print(negative_indices)
        negative_pin_embeddings = self.pin_embedding(negative_indices)
        
        negative_pin_embeddings = negative_pin_embeddings.unsqueeze(1)
        print(f'  negative_pin_embeddings  {negative_pin_embeddings.shape}')
        print(f'  user_embeddings  {user_embeddings.shape}')

        negative_pin_dot_product = torch.bmm(user_embeddings, negative_pin_embeddings.transpose(2,1))
        negative_pin_dot_product = negative_pin_dot_product.squeeze(1)
        negative_pin_sigmoid = torch.sigmoid(negative_pin_dot_product)

        total_sigmoid = torch.cat([sigmoid, negative_pin_sigmoid], dim=1)
        
        targets = torch.ones_like(sigmoid)
        targets = torch.cat([targets, torch.zeros_like(negative_pin_sigmoid)], dim=1)


        preds = total_sigmoid.squeeze(1)
        
        return preds, targets
        
    
    def training_step(self, batch):
        user, pin_ids, action_ids, target_pin_ids, target_action_ids = batch
        preds, targets  = self.forward(user,pin_ids, action_ids, target_pin_ids)
        #targets = torch.ones_like(preds)
        loss = F.binary_cross_entropy(preds, targets)
        self.log('train_loss', loss)
        return loss


    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate)
        return optimizer

model = RecommenderSystem(num_attention_heads,num_layers, pins_vocab,actions_vocab, hidden_size, pin_embedding_dim, action_embedding_dim, learning_rate=0.0001)
trainer = pl.Trainer(max_epochs=1)
trainer.fit(model, train_data_loader, test_data_loader)


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn("You passed in a `val_dataloader` but have no `validation_step`. Skipping val loop.")

  | Name           | Type            | Params
---------------------------------------------------
0 | user_embedding | TransferDecoder | 936 K 
1 | pin_embedding  | Embedding       | 12.0 K
---------------------------------------------------
948 K     Trainable params
0         Non-trainable params
948 K     Total params
3.794     Total estimated model params size (MB)
  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

user shape 8
  user_embeddings  torch.Size([8, 5, 12])
  user_embeddings  torch.Size([8, 12])
  user_embeddings  torch.Size([8, 1, 12])
  positive_pin_embeddings  torch.Size([8, 1, 12])
  dot_product  torch.Size([8, 1])
1001
tensor([346, 584, 164, 575, 126, 192, 609, 834])
  negative_pin_embeddings  torch.Size([8, 1, 12])
  user_embeddings  torch.Size([8, 1, 12])
user shape 8
  user_embeddings  torch.Size([8, 5, 12])
  user_embeddings  torch.Size([8, 12])
  user_embeddings  torch.Size([8, 1, 12])
  positive_pin_embeddings  torch.Size([8, 1, 12])
  dot_product  torch.Size([8, 1])
1001
tensor([467, 839, 917, 177, 870, 202, 325, 990])
  negative_pin_embeddings  torch.Size([8, 1, 12])
  user_embeddings  torch.Size([8, 1, 12])
user shape 8


`Trainer.fit` stopped: `max_epochs=1` reached.


  user_embeddings  torch.Size([8, 5, 12])
  user_embeddings  torch.Size([8, 12])
  user_embeddings  torch.Size([8, 1, 12])
  positive_pin_embeddings  torch.Size([8, 1, 12])
  dot_product  torch.Size([8, 1])
1001
tensor([ 55,  91, 820, 377, 220, 222, 885, 583])
  negative_pin_embeddings  torch.Size([8, 1, 12])
  user_embeddings  torch.Size([8, 1, 12])
user shape 7
  user_embeddings  torch.Size([7, 5, 12])
  user_embeddings  torch.Size([7, 12])
  user_embeddings  torch.Size([7, 1, 12])
  positive_pin_embeddings  torch.Size([7, 1, 12])
  dot_product  torch.Size([7, 1])
1001
tensor([ 79, 456, 807, 944, 249, 875, 820])
  negative_pin_embeddings  torch.Size([7, 1, 12])
  user_embeddings  torch.Size([7, 1, 12])


In [207]:
positve_pins = torch.ones(8,1)
negative_pins = torch.zeros(16,1)
combined_prediction = torch.cat([positve_pins, negative_pins], dim=0)
combined_prediction.shape



torch.Size([24, 1])