In [806]:
from transformers import AutoTokenizer
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling  
from transformers import Trainer, TrainingArguments 
import torch  
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer 
import torch.nn.functional as F 
import os
#os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'
from torch import nn
from math import sqrt

import torch 
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline,  AutoModelForTokenClassification, AutoTokenizer, DataCollatorForTokenClassification, TrainingArguments, Trainer
from transformers import AutoConfig, EarlyStoppingCallback, TrainerCallback
import torch.nn.functional as F
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

In [807]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split

# Set seed for reproducibility
np.random.seed(0)

# Create the list of pins, users and actions
pins = list(range(0,1000))
users = list(range(1,101))
actions = ['click', 'closeup', 'save']

# Creating empty DataFrame
df = pd.DataFrame(columns=['user', 'pin', 'action', 'timestamp'])

# Populating the DataFrame
for user in users:
    num_pins = np.random.randint(3, 21)  # user engages with min 3 pins and max 20 pins
    engaged_pins = np.random.choice(pins, num_pins, replace=False)  # engaged pins for this user
    engaged_actions = np.random.choice(actions, num_pins)  # actions for this user
    timestamps = [datetime.now() - timedelta(days=x) for x in range(num_pins)]  # random timestamps for user engagement
    
    temp_df = pd.DataFrame({
        'user': user,
        'pin': engaged_pins,
        'action': engaged_actions,
        'timestamp': timestamps
    })
    
    df = pd.concat([df, temp_df])


In [808]:
#convert timestap to date 
df['timestamp'] = pd.to_datetime(df['timestamp']).dt.date

In [809]:
#map actions to integers
action_to_int = {'click': 0, 'closeup': 1, 'save': 2}
df['action'] = df['action'].map(action_to_int)


In [810]:
# Split users into train and test users
train_users, test_users = train_test_split(users, test_size=0.2)

train_df = df[df['user'].isin(train_users)]
test_df = df[df['user'].isin(test_users)]


In [811]:
#find the max and min timestamp in train_df
#cutoff_times is 2 weeks prior to the max timestamp
max_timestamp = train_df['timestamp'].max()
min_timestamp = train_df['timestamp'].min()
cut_off_train = max_timestamp - timedelta(days=14)

# Create X_train and y_train
X_train = train_df[train_df['timestamp'] > cut_off_train]
y_train = train_df[(train_df['timestamp'] <= cut_off_train)]

# Create X_test and y_test
X_test = test_df[test_df['timestamp'] > cut_off_train]
y_test = test_df[(test_df['timestamp'] <= cut_off_train)]


In [812]:
y_train.head()

Unnamed: 0,user,pin,action,timestamp
14,1,862,1,2023-07-17
14,2,664,0,2023-07-17
15,2,780,1,2023-07-16
16,2,352,0,2023-07-15
14,4,271,1,2023-07-17


In [813]:


#if the list is empty add 5 0's or list is less then length 5 add 0 to make it 5 and it if greater then 5 trim it

def pad_or_trim(lst):
    if len(lst) == 0:
        return [0,0,0,0,0]
    elif len(lst) < 5:
        return [0]*(5-len(lst)) + lst
    else:
        return lst[:5]

X_train = X_train.sort_values(by=['timestamp'])
X_train = X_train.groupby('user').agg({'pin': list, 'action': list,'timestamp': list}).reset_index()
X_train['pin'] = X_train['pin'].apply(lambda x: pad_or_trim(x))
X_train['action'] = X_train['action'].apply(pad_or_trim)
X_train['timestamp'] = X_train['timestamp'].apply(pad_or_trim)

y_train = y_train.sort_values(by=['timestamp'])
y_train = y_train.groupby('user').agg({'pin': list, 'action': list,'timestamp': list}).reset_index()
y_train['pin'] = y_train['pin'].apply(pad_or_trim)
y_train['action'] = y_train['action'].apply(pad_or_trim)
y_train['timestamp'] = y_train['timestamp'].apply(pad_or_trim)


X_test = X_test.sort_values(by=['timestamp'])   
X_test = X_test.groupby('user').agg({'pin': list, 'action': list,'timestamp': list}).reset_index()
X_test['pin'] = X_test['pin'].apply(pad_or_trim)
X_test['action'] = X_test['action'].apply(pad_or_trim)
X_test['timestamp'] = X_test['timestamp'].apply(pad_or_trim)

y_test = y_test.sort_values(by=['timestamp'])
y_test = y_test.groupby('user').agg({'pin': list, 'action': list,'timestamp': list}).reset_index()
y_test['pin'] = y_test['pin'].apply(pad_or_trim)
y_test['action'] = y_test['action'].apply(pad_or_trim)
y_test['timestamp'] = y_test['timestamp'].apply(pad_or_trim)



In [814]:
X_train[X_train['user'] == 2]

Unnamed: 0,user,pin,action,timestamp
1,2,"[531, 163, 687, 778, 18]","[2, 2, 0, 1, 0]","[2023-07-18, 2023-07-19, 2023-07-20, 2023-07-2..."


In [815]:
y_train[y_train['user'] == 2]

Unnamed: 0,user,pin,action,timestamp
1,2,"[0, 0, 352, 780, 664]","[0, 0, 0, 1, 0]","[0, 0, 2023-07-15, 2023-07-16, 2023-07-17]"


In [816]:
#DEFINE ALL THE PARAMETERS FOR THE MODEL
hidden_size = 12 #hidden size of transformer and the final output coming from the transformer
pin_embedding_dim = 12 #dimension of the pin embedding
action_embedding_dim = 3 #dimension of the action embedding
num_attention_heads = 12 #number of attention heads in transformer that is concatenated together to form the final attention head of dimension 768
pins_vocab = 1001 #number of pins
actions_vocab = 3 #number of actions
num_layers = 12 #number of transformer layers in the model which is a replication of the same transformer layer whose input is the output of the previous layer and the output is the input to the next layer
dropout = 0.1
max_length = 5 #maximum length of the input sequence
batch_size = 8 #batch size for training
epochs = 1
learning_rate = 0.0001

## Embedding Module

In [817]:
import torch.nn as nn
class Embeddings(nn.Module):
  """
  Creates a single Dense Embedding for each token --> Token Embedding + Positional Embedding
  """
  def __init__(self, pin_embeddings,actions_vocab, action_embedding_dim, periods, hidden_size):
    super().__init__()
    self.pin_embedding = nn.Embedding.from_pretrained(pin_embeddings) #batch_size, seq_len, pin_embedding_dim i.e 5, 5, 384
    self.linear1 = nn.Linear(384, hidden_size) #linear layer to get the embeddings to the desired hidden size i.e 5, 5, 12
    self.pin_embedding.weight.requires_grad = True
    
    self.periods = periods
    self.linear2 = nn.Linear(len(periods)*2 + 1, 50) # Linear layer with output size 50 (arbitrarily chosen)  # 5, 5, 50
    self.action_type_embedding = nn.Embedding(actions_vocab, action_embedding_dim) #batch_size, seq_len, action_embedding_dim i.e 5, 5, 3

    #concatenate all the pin and action embeddings + time embeddings
    self.linear3 = nn.Linear(pin_embedding_dim+action_embedding_dim+50, hidden_size) #linear layer to get the embeddings to the desired hidden size i.e 5, 5, 12

    #position embedding
    self.position_embedding = nn.Embedding(5, hidden_size) #batch_size, seq_len, hidden_size i.e 5, 5, 12
    self.layer_norm = nn.LayerNorm(hidden_size, eps= 1e-12) #batch_size, seq_len, hidden_size i.e 5, 5, 12
    self.dropout = nn.Dropout()

  def forward(self,pin_ids,action_ids, timestamps):
    unix_timestamps = timestamps
    unix_timestamps = torch.unsqueeze(unix_timestamps, dim=-1) # 5, 5, 1
    features = []
    for period in self.periods:
        cos_features = torch.cos((2*np.pi*unix_timestamps)/period)
        sin_features = torch.sin((2*np.pi*unix_timestamps)/period)
        features.extend([cos_features, sin_features])
    features = torch.cat(features, dim=-1) # 5, 5, 10
    log_feature = torch.log1p(unix_timestamps) #squueze to add another dimension
    #torch.Size([1, 5, 8])  + torch.Size([1, 5, 1]) = torch.Size([1, 5, 9])
    features = torch.cat([features, log_feature], dim=-1) # 5, 5, 11
    # Pass features through linear layer
    out = self.linear2(features) # 5, 5, 50
    




    batch_size, seq_len = pin_ids.shape
    position_ids = torch.arange(seq_len, dtype=torch.long).expand((batch_size, seq_len))
    # position_ids = [batch_size, seq_len]
    pin_embeddings = self.pin_embedding(pin_ids) #batch_size, seq_len, pin_embedding_dim i.e 5, 5, 384
    #print(pin_embeddings.shape)
    pin_embeddings = self.linear1(pin_embeddings) #linear layer to get the embeddings to the desired hidden size i.e 5, 5, 12
    #print(pin_embeddings.shape)
    # pin_embeddings = [batch_size, seq_len, hidden_size]
    action_embeddings = self.action_type_embedding(action_ids) #batch_size, seq_len, action_embedding_dim i.e 5, 5, 3
    # action_embeddings = [batch_size, seq_len, hidden_size]
    position_embeddings = self.position_embedding(position_ids) #batch_size, seq_len, hidden_size i.e 5, 5, 12
    #print(position_embeddings.shape)
    
    
    #concatenate all the pin and action embeddings
    embeddings = torch.cat([pin_embeddings, action_embeddings , out], dim=-1) #batch_size, seq_len, hidden_size i.e 5, 5, 3+12+50 = 65
    #apply a linear layer to get the embeddings to the desired hidden size
    
    embeddings = self.linear3(embeddings) #linear layer to get the embeddings to the desired hidden size i.e 5, 5, 12
    
    #print(embeddings.shape)

    #add the position embeddings
    embeddings = embeddings + position_embeddings

    #embeddings = pin_embeddings + action_embeddings + position_embeddings
    # embeddings = [batch_size, seq_len, hidden_size]
    embeddings = self.layer_norm(embeddings)
    embeddings = self.dropout(embeddings)
    return embeddings
    

In [818]:
#create sample pinids of btch size 1, seq_len 5



## Loading Pin Embeddings 

In [819]:
import pickle
pin_embeddings_pretrained = pickle.load(open('embeddings.pkl', 'rb'))

In [820]:
pin_ids = torch.tensor([[1,2,3,4,5]])
print(pin_ids.shape)
action_ids = torch.tensor([[1,2,0,2,1]])
print(action_ids.shape)
timestamps = torch.tensor([[1,2,3,4,5]])
print(timestamps.shape)
periods = [1, 7, 30, 365]
embeddings = Embeddings(pin_embeddings_pretrained,actions_vocab, action_embedding_dim, periods, hidden_size)
embeddings.forward(pin_ids,action_ids, timestamps).shape

torch.Size([1, 5])
torch.Size([1, 5])
torch.Size([1, 5])


torch.Size([1, 5, 12])

## Attention Head

In [821]:
from torch import nn
class AttentionHead(nn.Module):
  def __init__(self, embed_dim, head_dim):
    super().__init__()
    self.head_dim = head_dim #dimension of one head 
    #infeatures=embed_dim
    #outfeatures=head_dim
    self.q = nn.Linear(embed_dim, head_dim)
    self.k = nn.Linear(embed_dim, head_dim)
    self.v = nn.Linear(embed_dim, head_dim)
    
  
  def causal_mask(self,batch_size,size, dtype):  
    mask = torch.tril(torch.ones(size,size)).unsqueeze(0)
    return mask
    
  
      
  def scaled_dot_product_attention(self,query, key, value):
    dim_k = query.size(dim=-1)  
    #print(dim_k)    
    #print(f'Dimension of the q,k,v Matrix [Batch_size, seq_len, Head_dim] of One Head {dim_k}')
    scores = torch.bmm(query,key.transpose(1,2))/ sqrt(dim_k)  #[(1,5,768)*(1,768,5)]/sqrt(768) >>> [batch_size,5,5] 
    
    mask = self.causal_mask(scores.size(0),scores.size(1),dtype=torch.int32)
    #print(mask)
    scores = scores.masked_fill(mask==0, float(0)) 
    weights = F.softmax(scores, dim=-1) #[batch_size,5,5]
    #print(weights)
    #print(f'Softmax for each column across one row {weights}')
    weights_dot_values = torch.bmm(weights,value)  #[batch_size,5,5]*[batch_size,5,64] >>> [batch_size,5,64]
    #print(f'Last Step is to multiply weights and values {weights_dot_values.shape}')
    return weights_dot_values 

  def forward(self, hidden_state):
    #head_state = [batch_size, seq_len, embed_dim]
    #print(f'Input Embedding for Each Token with X Matrix {hidden_state.size()}')
    #q = X*W_q
    q = self.q(hidden_state) #q = [batch_size, seq_len, head_dim]
    #print(f'Shape of the Query Matrix W_q {q.size()}')
    k = self.k(hidden_state) #k = [batch_size, seq_len, head_dim]
    #print(f'Shape of the Key Matrix W_k {k.size()}')
    v = self.k(hidden_state) #v = [batch_size, seq_len, head_dim]
    #print(f'Shape of the Value Matrix W_k {v.size()}')
    #print('-----------------Calculating Self Attention--------------------')
    attn_outputs = self.scaled_dot_product_attention(q,k,v) #attn_outputs = [batch_size, seq_len, head_dim]
    #print(f'Shape of the attention Output with one Head and Head Dimension {self.head_dim} is {attn_outputs.size()}')
    return attn_outputs

## Multihead Attention

In [822]:
class MultiHeadAttention(nn.Module):
  def __init__(self, hidden_size, num_attention_heads):
    super().__init__()
    embed_dim = hidden_size
    num_heads = num_attention_heads
    head_dim = embed_dim // num_heads
    self.heads = [AttentionHead(embed_dim, head_dim) for _ in range(num_heads)] #initializing all the heads
    self.w_0 = nn.Linear(embed_dim,embed_dim) #the purpose of this linear layer is to concatenate all the heads together to form the final output of the multihead attention

  def forward(self,hidden_state):
    '''
    hidden_state: Input Embedding with dimensions [batch_size, seq_len, embedding_dimension]
    '''
    attention_outputs = [head(hidden_state) for head in self.heads] #Calculating Self-Attention on each head
    contcat_attn_outputs_allheads = torch.cat(attention_outputs, dim=-1) #[batch_size,seq_len, embed_dim]
    Z =   self.w_0(contcat_attn_outputs_allheads) #[batch_size, seq_len, embed_dim]
    return Z

## Feedforward Module

In [823]:
class FeedForward(nn.Module):
  def __init__(self,hidden_size):
    super().__init__()
    self.linear1 = nn.Linear(hidden_size, 3072)
    self.linear2 = nn.Linear(3072, hidden_size)
    self.gelu = nn.GELU()
    self.dropout = nn.Dropout(0.1)
  
  def forward(self, attention_outputs):
    output_l1 = self.linear1(attention_outputs)
    activated_outputs = self.gelu(output_l1)
    output_l2 = self.linear2(activated_outputs)
    output = self.dropout(output_l2)
    return output

## Skip Connections + MHAL

In [824]:
class TransformerDecoderLayer(nn.Module):
  def __init__(self, hidden_size, num_attention_heads):
    super(TransformerDecoderLayer,self).__init__()
    self.layer_norm1 = nn.LayerNorm(hidden_size)
    self.layer_norm2 = nn.LayerNorm(hidden_size)
    self.multi_attention = MultiHeadAttention(hidden_size, num_attention_heads)
    self.feedforward = FeedForward(hidden_size)

  def forward(self, input_embeddings):
     #pre-layer normalization approach
     
     #Step 1: Applying Layer Normalization to Input Embeddings
     normalized_input_embeddings = self.layer_norm1(input_embeddings)
     
     #Step 2: Applying MultiHeadAttention to Normalized Output
     multi_head_attn = self.multi_attention(normalized_input_embeddings)
     
     #Step 3: Add input embeddings to the Multihead Attention Output
     skip_connection_1 = input_embeddings + multi_head_attn

     #step 4: Pass the output to another Layer Normalization 
     layer_norm_2 = self.layer_norm2(skip_connection_1)

     #Step 5: Adding skip connection 1 outputs to the output of the FeedForward Network (applied on Step 4)
     skip_connection_2 = skip_connection_1 + self.feedforward(layer_norm_2)
     #print(f'output of MultiHeadAttention and FeedForward Network is {skip_connection_2.shape}')
     return skip_connection_2

## Transformer Decoder ( Putting it all together )

In [825]:
class TransferDecoder(nn.Module):
  def __init__(self,num_attention_heads,num_layers, pin_embedding,actions_vocab,action_embedding_dim, periods, hidden_size):
    super().__init__()
    self.embedding = Embeddings(pin_embedding,actions_vocab,action_embedding_dim, periods, hidden_size)
    self.layers = nn.ModuleList([TransformerDecoderLayer(hidden_size, num_attention_heads) for _ in range(num_layers)]) 
                                
  def forward(self, pin_ids, action_ids, timestamps):
    embeddings = self.embedding(pin_ids, action_ids, timestamps) #in: [batch_size, seq_len] out: [batch_size, seq_len, hidden_size]
    for layer in self.layers:
      embeddings = layer(embeddings)
    return embeddings


## Dataset 

In [826]:
#import date 
from datetime import date
class RecommenderDataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        user = self.df.iloc[idx]['user']
        pin_ids = torch.tensor(self.df.iloc[idx]['pin_train'])
        action_ids = torch.tensor(self.df.iloc[idx]['action_train'])
        target_pin_ids = torch.tensor(self.df.iloc[idx]['pin_target'])
        target_action_ids = torch.tensor(self.df.iloc[idx]['action_target'])
        # Convert datetime.date to UNIX timestamps (in days)
        time_stamps = [(ts - date(1970,1,1)).total_seconds() / 86400.0 for ts in self.df.iloc[idx]['timestamp_train']]
        time_stamps = torch.tensor(time_stamps)
        #timestamp shpild be batch_size, seq_len i.e 5, 5
        #time_stamps = torch.unsqueeze(time_stamps, dim=-1)
        return {'user': user, 'pin_ids': pin_ids, 'action_ids': action_ids, 'target_pin_ids': target_pin_ids, 'target_action_ids': target_action_ids, 'time_stamps': time_stamps}
       
      


In [827]:
X_train.head()

Unnamed: 0,user,pin,action,timestamp
0,1,"[239, 985, 999, 674, 994]","[1, 0, 1, 2, 2]","[2023-07-18, 2023-07-19, 2023-07-20, 2023-07-2..."
1,2,"[531, 163, 687, 778, 18]","[2, 2, 0, 1, 0]","[2023-07-18, 2023-07-19, 2023-07-20, 2023-07-2..."
2,3,"[993, 179, 297, 760, 124]","[1, 0, 2, 0, 0]","[2023-07-22, 2023-07-23, 2023-07-24, 2023-07-2..."
3,4,"[630, 738, 798, 344, 885]","[2, 0, 2, 1, 2]","[2023-07-18, 2023-07-19, 2023-07-20, 2023-07-2..."
4,5,"[439, 332, 36, 380, 460]","[0, 2, 1, 0, 1]","[2023-07-23, 2023-07-24, 2023-07-25, 2023-07-2..."


In [828]:
merged_train = X_train.merge(y_train, on='user', how='inner')
merged_test = X_test.merge(y_test, on='user', how='inner')

In [829]:
merged_train.rename(columns={'pin_x': 'pin_train', 'action_x': 'action_train', 'timestamp_x': 'timestamp_train', 'pin_y':'pin_target', 'action_y': 'action_target', 'timestamp_y': 'timestamp_target'}, inplace=True)
merged_test.rename(columns={'pin_x': 'pin_train', 'action_x': 'action_train', 'timestamp_x': 'timestamp_train', 'pin_y':'pin_target', 'action_y': 'action_target', 'timestamp_y': 'timestamp_target'}, inplace=True)

In [830]:
merged_train.head()

Unnamed: 0,user,pin_train,action_train,timestamp_train,pin_target,action_target,timestamp_target
0,1,"[239, 985, 999, 674, 994]","[1, 0, 1, 2, 2]","[2023-07-18, 2023-07-19, 2023-07-20, 2023-07-2...","[0, 0, 0, 0, 862]","[0, 0, 0, 0, 1]","[0, 0, 0, 0, 2023-07-17]"
1,2,"[531, 163, 687, 778, 18]","[2, 2, 0, 1, 0]","[2023-07-18, 2023-07-19, 2023-07-20, 2023-07-2...","[0, 0, 352, 780, 664]","[0, 0, 0, 1, 0]","[0, 0, 2023-07-15, 2023-07-16, 2023-07-17]"
2,4,"[630, 738, 798, 344, 885]","[2, 0, 2, 1, 2]","[2023-07-18, 2023-07-19, 2023-07-20, 2023-07-2...","[0, 0, 988, 978, 271]","[0, 0, 0, 2, 1]","[0, 0, 2023-07-15, 2023-07-16, 2023-07-17]"
3,17,"[705, 442, 385, 103, 938]","[0, 1, 2, 2, 2]","[2023-07-18, 2023-07-19, 2023-07-20, 2023-07-2...","[0, 0, 0, 26, 510]","[0, 0, 0, 0, 1]","[0, 0, 0, 2023-07-16, 2023-07-17]"
4,20,"[558, 986, 672, 125, 354]","[1, 0, 2, 0, 2]","[2023-07-18, 2023-07-19, 2023-07-20, 2023-07-2...","[650, 123, 475, 865, 442]","[2, 0, 2, 1, 0]","[2023-07-13, 2023-07-14, 2023-07-15, 2023-07-1..."


## Dataloader

In [831]:
train_dataset = RecommenderDataset(merged_train)
test_dataset = RecommenderDataset(merged_test)

train_data_loader = DataLoader(train_dataset, batch_size=8, shuffle=False)
test_data_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)


In [832]:
#sample = next(iter(train_data_loader))
for batch in train_data_loader:
  print(batch['pin_ids'].shape)
  print(batch['action_ids'].shape)
  print(batch['target_pin_ids'].shape)
  print(batch['target_action_ids'].shape)
  print(batch['time_stamps'].shape)
  break


torch.Size([8, 5])
torch.Size([8, 5])
torch.Size([8, 5])
torch.Size([8, 5])
torch.Size([8, 5])


In [833]:
merged_test 

Unnamed: 0,user,pin_train,action_train,timestamp_train,pin_target,action_target,timestamp_target
0,12,"[925, 279, 516, 14, 422]","[1, 2, 1, 2, 1]","[2023-07-18, 2023-07-19, 2023-07-20, 2023-07-2...","[0, 0, 0, 345, 98]","[0, 0, 0, 1, 1]","[0, 0, 0, 2023-07-16, 2023-07-17]"
1,22,"[289, 872, 511, 798, 891]","[1, 1, 0, 0, 0]","[2023-07-18, 2023-07-19, 2023-07-20, 2023-07-2...","[448, 917, 861, 976, 888]","[0, 0, 1, 1, 2]","[2023-07-12, 2023-07-13, 2023-07-14, 2023-07-1..."
2,39,"[342, 942, 490, 141, 78]","[2, 2, 0, 0, 1]","[2023-07-18, 2023-07-19, 2023-07-20, 2023-07-2...","[0, 0, 210, 914, 945]","[0, 0, 2, 0, 0]","[0, 0, 2023-07-15, 2023-07-16, 2023-07-17]"
3,45,"[493, 375, 755, 106, 973]","[1, 2, 2, 1, 0]","[2023-07-18, 2023-07-19, 2023-07-20, 2023-07-2...","[0, 0, 0, 133, 669]","[0, 0, 0, 2, 1]","[0, 0, 0, 2023-07-16, 2023-07-17]"
4,63,"[251, 553, 760, 127, 332]","[1, 1, 2, 0, 2]","[2023-07-18, 2023-07-19, 2023-07-20, 2023-07-2...","[0, 0, 0, 0, 157]","[0, 0, 0, 0, 1]","[0, 0, 0, 0, 2023-07-17]"
5,70,"[742, 217, 562, 615, 211]","[2, 0, 1, 1, 0]","[2023-07-18, 2023-07-19, 2023-07-20, 2023-07-2...","[807, 473, 703, 28, 165]","[2, 0, 1, 2, 1]","[2023-07-12, 2023-07-13, 2023-07-14, 2023-07-1..."
6,82,"[675, 514, 671, 620, 222]","[1, 0, 0, 2, 1]","[2023-07-18, 2023-07-19, 2023-07-20, 2023-07-2...","[0, 0, 0, 486, 965]","[0, 0, 0, 1, 1]","[0, 0, 0, 2023-07-16, 2023-07-17]"
7,88,"[311, 22, 584, 95, 213]","[2, 1, 1, 0, 2]","[2023-07-18, 2023-07-19, 2023-07-20, 2023-07-2...","[0, 0, 369, 489, 883]","[0, 0, 1, 1, 1]","[0, 0, 2023-07-15, 2023-07-16, 2023-07-17]"
8,89,"[167, 988, 599, 266, 999]","[0, 0, 1, 1, 2]","[2023-07-18, 2023-07-19, 2023-07-20, 2023-07-2...","[870, 826, 963, 164, 20]","[1, 1, 0, 0, 0]","[2023-07-13, 2023-07-14, 2023-07-15, 2023-07-1..."
9,94,"[16, 661, 831, 808, 529]","[0, 0, 0, 1, 0]","[2023-07-18, 2023-07-19, 2023-07-20, 2023-07-2...","[416, 947, 171, 677, 574]","[2, 2, 1, 2, 2]","[2023-07-12, 2023-07-13, 2023-07-14, 2023-07-1..."


In [834]:
import pytorch_lightning as pl
class RecommenderSystem(pl.LightningModule):

    def __init__(self, num_attention_heads,num_layers, pin_embeddings ,actions_vocab, hidden_size, action_embedding_dim,periods, learning_rate=0.0001):
        super().__init__()
        self.user_embedding = TransferDecoder(num_attention_heads,num_layers, pin_embeddings,actions_vocab,action_embedding_dim, periods, hidden_size)
        #convert the numpy array to a tensor and then to an embedding layer
        pin_embeddings = torch.tensor(pin_embeddings)
        self.pin_embedding = nn.Embedding.from_pretrained(pin_embeddings)
        self.pin_embedding.weight.requires_grad = True
        self.linear = nn.Linear(384, hidden_size)

        


    def forward(self, user_embeddings, pin_embeddings):
        dot_product = torch.bmm(user_embeddings, pin_embeddings.transpose(2,1))
        dot_product = dot_product.squeeze(1)
        sigmoid = torch.sigmoid(dot_product)
        preds = sigmoid.squeeze(1)
        return preds
        
    
    def training_step(self, batch):
        
        user = batch['user']
        pin_ids = batch['pin_ids']
        action_ids = batch['action_ids']
        target_pin_ids = batch['target_pin_ids']
        timestamps = batch['time_stamps']
        
        
        #user embeddings
        mask = target_pin_ids != 0 #shape [batch_size, seq_len]
        indices = torch.multinomial(mask.float(), 1)
        indices[indices < 0] = 0
        random_numbers = target_pin_ids[torch.arange(target_pin_ids.shape[0]), indices.squeeze()]
        random_numbers = random_numbers.view(-1, 1)
        user_embeddings = self.user_embedding(pin_ids,action_ids, timestamps) #in: [batch_size, seq_len] out: mean_pooling [batch_size, seq_len, hidden_size]
        user_embeddings = user_embeddings[:,-1,:] #extract the last embedding for each user [batch_size, hidden_size]
        user_embeddings = user_embeddings.unsqueeze(1) #add a dimension for the sequence length [batch_size, 1, hidden_size]

        #positive pin embeddings
        positive_pin_embeddings = self.pin_embedding(random_numbers) #shape [batch_size, 1, hidden_size]
        positive_pin_embedding  = self.linear(positive_pin_embeddings) #shape [batch_size, 1, hidden_size]
        positive_preds = self.forward(user_embeddings, positive_pin_embedding) #shape [batch_size, 1]
        

        #negative pin embeddings
        pin_embedding_shape = self.pin_embedding.weight.shape[0] #shape [num_pins, hidden_size]
        negative_indices = torch.randint(0,  pin_embedding_shape, (user.shape[0],)) #shape [batch_size,]
        negative_pin_embeddings = self.pin_embedding(negative_indices) #shape [batch_size, hidden_size]
        negative_pin_embeddings = self.linear(negative_pin_embeddings) #shape [batch_size, hidden_size]
        negative_pin_embeddings = negative_pin_embeddings.unsqueeze(1) #shape [batch_size, 1, hidden_size]
        negative_pin_preds = self.forward(user_embeddings, negative_pin_embeddings) #shape [batch_size, 1]
        
        #concatenate the positive and negative pin preds
        preds = torch.cat([positive_preds, negative_pin_preds]) #shape [batch_size*2, 1]
        targets = torch.ones_like(positive_preds) #shape [batch_size, 1]
        targets = torch.cat([targets, torch.zeros_like(negative_pin_preds)]) #shape [batch_size*2, 1]
         #shape [batch_size*2, 1]

        #calculate the loss
        loss = F.binary_cross_entropy(preds, targets)
        self.log('train_loss', loss)
        return loss
    
    def validation_step(self, batch, batch_idx):
        user = batch['user']
        pin_ids = batch['pin_ids']
        action_ids = batch['action_ids']
        target_pin_ids = batch['target_pin_ids']
        timestamps = batch['time_stamps']

        #user embeddings
        mask = target_pin_ids != 0 #shape [batch_size, seq_len]
        indices = torch.multinomial(mask.float(), 1)
        indices[indices < 0] = 0
        random_numbers = target_pin_ids[torch.arange(target_pin_ids.shape[0]), indices.squeeze()]
        random_numbers = random_numbers.view(-1, 1)
        user_embeddings = self.user_embedding(pin_ids,action_ids, timestamps) #in: [batch_size, seq_len] out: mean_pooling [batch_size, seq_len, hidden_size]
        user_embeddings = user_embeddings[:,-1,:] #extract the last embedding for each user [batch_size, hidden_size]
        user_embeddings = user_embeddings.unsqueeze(1) #add a dimension for the sequence length [batch_size, 1, hidden_size]

        #positive pin embeddings
        positive_pin_embeddings = self.pin_embedding(random_numbers) #shape [batch_size, 1, hidden_size]
        positive_pin_embedding  = self.linear(positive_pin_embeddings) #shape [batch_size, 1, hidden_size]
        positive_preds = self.forward(user_embeddings, positive_pin_embedding) #shape [batch_size, 1]

        #negative pin embeddings
        pin_embedding_shape = self.pin_embedding.weight.shape[0] #shape [num_pins, hidden_size]
        negative_indices = torch.randint(0,  pin_embedding_shape, (user.shape[0],)) #shape [batch_size,]
        negative_pin_embeddings = self.pin_embedding(negative_indices) #shape [batch_size, hidden_size]
        negative_pin_embeddings = self.linear(negative_pin_embeddings) #shape [batch_size, hidden_size]
        negative_pin_embeddings = negative_pin_embeddings.unsqueeze(1) #shape [batch_size, 1, hidden_size]
        negative_pin_preds = self.forward(user_embeddings, negative_pin_embeddings) #shape [batch_size, 1]

        #concatenate the positive and negative pin preds
        preds = torch.cat([positive_preds, negative_pin_preds]) #shape [batch_size*2, 1]
        targets = torch.ones_like(positive_preds) #shape [batch_size, 1]
        targets = torch.cat([targets, torch.zeros_like(negative_pin_preds)]) #shape [batch_size*2, 1]

        

        #calculate the loss
        loss = F.binary_cross_entropy(preds, targets)
        self.log('val_loss', loss)
        return loss


    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate)
        return optimizer


#load the model
periods =  [1, 7, 30, 90, 365]
model = RecommenderSystem(num_attention_heads,num_layers, pin_embeddings_pretrained ,actions_vocab, hidden_size, action_embedding_dim,periods, learning_rate=0.0001)

trainer = pl.Trainer(max_epochs=1, log_every_n_steps=1)

trainer.fit(model, train_data_loader, test_data_loader)

  pin_embeddings = torch.tensor(pin_embeddings)
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name           | Type            | Params
---------------------------------------------------
0 | user_embedding | TransferDecoder | 1.3 M 
1 | pin_embedding  | Embedding       | 384 K 
2 | linear         | Linear          | 4.6 K 
---------------------------------------------------
1.7 M     Trainable params
0         Non-trainable params
1.7 M     Total params
6.812     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.


In [835]:
train_dataset[0]

{'user': 1,
 'pin_ids': tensor([239, 985, 999, 674, 994]),
 'action_ids': tensor([1, 0, 1, 2, 2]),
 'target_pin_ids': tensor([  0,   0,   0,   0, 862]),
 'target_action_ids': tensor([0, 0, 0, 0, 1]),
 'time_stamps': tensor([19556., 19557., 19558., 19559., 19560.])}

In [837]:
from transformers import Trainer, TrainingArguments
from transformers import DataCollator

train_dataset = RecommenderDataset(merged_train)
test_dataset = RecommenderDataset(merged_test)
data_collator =  DataCollator()

train_data_loader = DataLoader(train_dataset, batch_size=8, shuffle=False)
test_data_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)



model = RecommenderSystem(num_attention_heads,num_layers, pin_embeddings_pretrained ,actions_vocab, hidden_size, action_embedding_dim,periods, learning_rate=0.0001)

args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=1,              # total number of training epochs
    per_device_train_batch_size=4,  # batch size per device during training
    per_device_eval_batch_size=1,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=1,
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=args,                           # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset            # evaluation dataset
)

trainer.train()


  pin_embeddings = torch.tensor(pin_embeddings)


  0%|          | 0/8 [00:00<?, ?it/s]

ValueError: The batch received was empty, your model won't be able to train on it. Double-check that your training dataset contains keys expected by the model: user_embeddings,pin_embeddings,label_ids,label.

In [None]:
def recall_at_k(all_preds, true_positives, k):
        top_k = all_preds.topk(k, dim=-1) # Get top k predictions
        true_positives = true_positives.unsqueeze(1).expand_as(top_k) # Expand true positives to match size of top_k
        recalls = (top_k == true_positives).sum(dim=-1).float() / true_positives.size(-1) # Compute recalls
        return recalls.mean() # Re

In [None]:
#create sample preds as 

## Saving and Loading Model

In [None]:
#save the model
torch.save(model.state_dict(), 'model.pt')

#load the model
periods =  [1, 7, 30, 90, 365]
model = RecommenderSystem(num_attention_heads,num_layers, pin_embeddings_pretrained ,actions_vocab, hidden_size, action_embedding_dim,periods, learning_rate=0.0001)
model.load_state_dict(torch.load('model.pt'))

In [None]:
#provide a user id and get the top 10 recommendations
#random new user with 5 pins randomly selected

#sample a new user
new_user = torch.randint(0,  pins_vocab, (1,5))
new_user_actions = torch.randint(0,  actions_vocab, (1,5))
new_user_timestamps = torch.randint(0,  100, (1,5))
print(new_user)

## Running TOP 10 Recommendations for the new user

In [None]:
#make top 10 recommendations
model.eval()
with torch.no_grad():
    user_embeddings = model.user_embedding(new_user,new_user_actions, new_user_timestamps)
    user_embeddings = user_embeddings[:,-1,:]
    user_embeddings = user_embeddings.unsqueeze(1)
    print(user_embeddings.shape)
    
    #get the pin embeddings
    pin_embeddings = model.pin_embedding.weight
    pin_embeddings = model.linear(pin_embeddings)
    pin_embeddings = pin_embeddings.unsqueeze(0)
    print(pin_embeddings.shape)

    dot_product = torch.bmm(user_embeddings, pin_embeddings.transpose(2,1))
    dot_product = dot_product.squeeze(1)

    sigmoid = torch.sigmoid(dot_product)
    #print(sigmoid.shape)
    #squeeze the sigmoid
    preds = sigmoid.squeeze(1)
    #print(preds)
    print(preds.shape)

    #get the top 10 recommendations
    top_10 = torch.topk(preds, 10)

    #get the indices of the top 10 recommendations
    top_10_indices = top_10.indices
    print(top_10_indices)

In [None]:
data = pd.read_csv("topic_facts_3.csv")
data.head()

In [None]:
#conver the new_user to a list of indexes
new_user = new_user.tolist()[0]
data.iloc[new_user]

In [None]:
#predict the top 10 pins for the new user
data.iloc[top_10_indices.tolist()[0]]


In [None]:
import torch
import torch.nn as nn
import numpy as np
from datetime import datetime

class Time2Vec(nn.Module):
    def __init__(self, periods):
        super(Time2Vec, self).__init__()
        
        self.periods = periods
        self.linear = nn.Linear(len(periods)*2 + 1, 50) # Linear layer with output size 50 (arbitrarily chosen)

    def forward(self, timestamps):
        # Convert 'YYYY-MM-DD' format to UNIX timestamps
        unix_timestamps = torch.tensor([[(datetime.strptime(ts, "%Y-%m-%d") - datetime(1970,1,1)).total_seconds() for ts in batch] for batch in timestamps])
        unix_timestamps = unix_timestamps / 86400.0 # convert seconds to days
        unix_timestamps = unix_timestamps.unsqueeze(-1) # add an extra dimension for broadcasting
        print(unix_timestamps.shape)
        # Calculate the periodic features
        features = []
        for period in self.periods:
            cos_features = torch.cos((2*np.pi*unix_timestamps)/period)
            sin_features = torch.sin((2*np.pi*unix_timestamps)/period)
            features.extend([cos_features, sin_features])


        features = torch.cat(features, dim=-1)
        print(features.shape)

        # Calculate the log feature
        log_feature = torch.log1p(unix_timestamps)
        features = torch.cat([features, log_feature], dim=-1)
        print(features.shape)

        # Pass features through linear layer
        out = self.linear(features)

        return out

# Define the periods
p = [1, 7, 30, 90, 365]

# Initialize the Time2Vec module
time2vec = Time2Vec(p)

# Create a batch of 'YYYY-MM-DD' timestamps

# Create a batch of 'YYYY-MM-DD' timestamps
timestamps = [["2023-07-12", "2023-07-13", "2023-07-14"], ["2023-07-15", "2023-07-16", "2023-07-17"]]


# Create time embeddings
embeddings = time2vec(timestamps)

print(embeddings.shape)



