In [22]:
import pandas as pd

In [25]:
reviews_df = pd.read_csv('data/test_raw_sample_reviews_PA.csv')

reviews_df['text'][0]

"I have spent a lot of time in New England. I have never seen a lobster roll shop outside of that area. So I went to Quincy's because I've seen it being built for a while. I didn't know what to expect. First of all it's very clean. Spotless.  The decor is more than I expected.  Pretty neat. Not cheesy and chain like. But definitely themed. The menu is simple. The prices are surprisingly very very reasonable. The rolls are loaded with meat.  I had the main liner. When it comes to lobster roll there is no special ingredient. The key is freshness and the amount of lobster for the price. Quincy's scored on both. The bisque is also very good. However if you want bisque with heaps of lobster you have to make sure you add it to your order. I ordered my bisque and it had no meat. And I asked why and they said lobster meat could be added. It was already pretty tasty but for a few dollars more the amount of fresh lobster added was incredible. This is a nice addition to the area and I will be bac

In [3]:
reviews_df['text'] = reviews_df['text'].str.replace(r'http\S+', '', regex=True)
reviews_df['text'] = reviews_df['text'].str.replace(r'@\w+', '', regex=True)
reviews_df['text'] = reviews_df['text'].astype(str)
reviews_df['text'] = reviews_df['text'].apply(str)

In [4]:

def combine_reviews_with_sep(review_texts, tokenizer):
    """
    Combines all review texts into one string with the [SEP] token separating them.
    
    Parameters:
    - review_texts (list of str): A list of review texts to be combined.
    - tokenizer: The tokenizer used to obtain the [SEP] token.
    
    Returns:
    - combined_text (str): The combined reviews separated by the [SEP] token.
    """

    count = 0
    # Get the separator token [SEP] using the tokenizer
    sep_token = tokenizer.sep_token

    print(f"Separator token: {sep_token} + count: {count}")
    count += 1

    # Combine all reviews with the [SEP] token in between
    combined_text = f" {sep_token} ".join(review_texts)

    print(f"Separator token: {sep_token}")

    
    return combined_text

In [6]:
import torch
from torch import nn
from transformers import BertModel, T5ForConditionalGeneration, AutoTokenizer, T5Tokenizer

#Try this model first to see if it is working current error is in the attention layer/embeddings
class MultiAttentionTransformer(nn.Module):
    def __init__(self, bert_model_name="bert-base-uncased", t5_model_name="t5-small"):
        super(MultiAttentionTransformer, self).__init__()

        
        self.bert_model = BertModel.from_pretrained(bert_model_name)
        self.t5_model = T5ForConditionalGeneration.from_pretrained(t5_model_name)

        # Add additional layers or mechanisms for multi-source attention (optional)
        self.attn_layer = nn.MultiheadAttention(embed_dim=768, num_heads=8)

    def forward(self, reviews, business_metadata, prompt):
        # Get the BERT embeddings for review history and business metadata
        review_embeddings = self.bert_model(**reviews).last_hidden_state
        business_embeddings = self.bert_model(**business_metadata).last_hidden_state

        #Concatenate both review and business embeddings
        combined_embeddings = torch.cat([review_embeddings, business_embeddings], dim=1)

        # Apply multi-head attention
        attn_output, _ = self.attn_layer(combined_embeddings, combined_embeddings, combined_embeddings)




        generated_output = self.t5_model.generate(input_ids=prompt, encoder_outputs=attn_output)

        # Decode the output to text
        generated_review = self.t5_tokenizer.decode(generated_output[0], skip_special_tokens=True)

        return generated_review



In [7]:
def tokenize_reviews(reviews, tokenizer):
    """
    Tokenizes the reviews in the DataFrame using the provided tokenizer.
    
    Parameters:
    - reviews_df (DataFrame): The DataFrame containing the reviews.
    - tokenizer: The tokenizer used for tokenization.
    
    Returns:
    - tokenized_reviews (list of list of int): A list of tokenized reviews.
    """

    # Ensure that all elements in reviews are strings

    # Tokenize each review and convert to IDs
    encoded_reviews = tokenizer(reviews, padding=True, truncation=True, max_length=512, return_tensors='pt')
    # Move the model and data to the GPU if available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    tokenized_reviews = {key: value.to(device) for key, value in encoded_reviews.items()}



    return tokenized_reviews

In [9]:
class MultiAttentionTransformerWithCopy(nn.Module):
    def __init__(self, bert_model_name="bert-base-uncased", t5_model_name="t5-small"):
        super(MultiAttentionTransformerWithCopy, self).__init__()

        # Initialize BERT tokenizer and model
        self.bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
        self.bert_model = BertModel.from_pretrained(bert_model_name)

        # Initialize T5 tokenizer and model
        self.t5_tokenizer = T5Tokenizer.from_pretrained(t5_model_name)
        self.t5_model = T5ForConditionalGeneration.from_pretrained(t5_model_name)

        # Add additional layers or mechanisms for multi-source attention (optional)
        self.attn_layer = nn.MultiheadAttention(embed_dim=768, num_heads=8)

    def forward(self, review_texts, business_metadata, prompt, past_reviews):
        """
        Forward pass for the model that includes a copy mechanism and processes a list of reviews.
        
        Parameters:
        - review_texts: List of text strings for reviews.
        - business_metadata: Metadata about the business (could include info like location, type, etc.).
        - prompt: The user's input or prompt for the generated review.
        - past_reviews: Texts of past reviews for the same business.
        
        Returns:
        - Generated review text.
        """

        # Tokenize and encode all reviews in the review_texts list
        review_tokens = self.bert_tokenizer(review_texts, return_tensors='pt', padding=True, truncation=True, is_split_into_words=False)
        business_tokens = self.bert_tokenizer(business_metadata, return_tensors='pt', padding=True, truncation=True, is_split_into_words=False)
        past_reviews_tokens = self.bert_tokenizer(past_reviews, return_tensors='pt', padding=True, truncation=True, is_split_into_words=False)

        # Get the BERT embeddings for review history, business metadata, and past reviews
        review_embeddings = self.bert_model(**review_tokens).last_hidden_state
        business_embeddings = self.bert_model(**business_tokens).last_hidden_state
        past_reviews_embeddings = self.bert_model(**past_reviews_tokens).last_hidden_state

        # Concatenate all embeddings: review + business metadata + past reviews
        combined_embeddings = torch.cat([review_embeddings, business_embeddings, past_reviews_embeddings], dim=1)

        # Apply multi-source attention (simple multihead attention for demonstration)
        attn_output, _ = self.attn_layer(combined_embeddings, combined_embeddings, combined_embeddings)

        # Implement the copy mechanism: We extract attention scores to decide which tokens to copy
        copy_tokens = self.select_copy_tokens(attn_output, past_reviews_embeddings)

        # Prepare the prompt for the T5 model
        prompt_input = self.t5_tokenizer(prompt, return_tensors='pt', padding=True, truncation=True)
        
        # Use the attention output as the decoder input and generate the review with T5
        decoder_input_ids = prompt_input['input_ids']
        generated_output = self.t5_model.generate(input_ids=decoder_input_ids, decoder_input_ids=attn_output)

        # Decode the output to text
        generated_review = self.t5_tokenizer.decode(generated_output[0], skip_special_tokens=True)

        # Integrate copy mechanism into the generated review
        generated_review_with_copy = self.apply_copy_mechanism(generated_review, copy_tokens)

        return generated_review_with_copy

    def select_copy_tokens(self, attn_output, past_reviews_embeddings):
        """
        Select which tokens from past reviews to copy based on attention scores.
        
        Parameters:
        - attn_output: The output from the multihead attention layer.
        - past_reviews_embeddings: Embeddings from past reviews.
        
        Returns:
        - copy_tokens: The tokens that will be copied from past reviews.
        """
        # Calculate the attention scores
        attention_scores = torch.mean(attn_output, dim=1)  # Averaging across attention heads for simplicity
        
        # Ensure that the top-k tokens are within bounds
        k = 5  # Number of top tokens to select
        num_tokens = attention_scores.size(1)  # Get the number of tokens in the attention output
        num_past_tokens = past_reviews_embeddings.size(1)  # Get the number of tokens in past reviews
        if num_tokens > num_past_tokens:
            # Ensure k does not exceed the number of available tokens
            k = min(k, num_past_tokens)
        else:
            # If there are fewer tokens in past reviews, adjust k accordingly
            k = min(k, num_tokens)
        
        # Get the top k attention scores
        top_attention_indices = torch.topk(attention_scores, k=k, dim=-1).indices  # Select top k tokens

        # Extract the tokens corresponding to the top attention indices
        copy_tokens = past_reviews_embeddings[:, top_attention_indices]

        return copy_tokens

    def apply_copy_mechanism(self, generated_review, copy_tokens):
        """
        Apply the copy mechanism to the generated review by inserting copied tokens.
        
        Parameters:
        - generated_review: The review generated by T5.
        - copy_tokens: The tokens selected for copying from past reviews.
        
        Returns:
        - modified_review: The final review with copied phrases inserted.
        """
        # In this simple case, we'll append the copied tokens to the generated review.
        # More sophisticated copy mechanisms would replace or interleave tokens in the review.

        copy_text = " ".join([self.bert_tokenizer.decode(token) for token in copy_tokens[0]])  # Decode copied tokens
        modified_review = generated_review + " " + copy_text

        return modified_review

In [8]:
# Function to get all reviews for a given business ID
def get_reviews_for_business(business_id, reviews_df):
    # Filter the dataset for the given business_id
    business_reviews = reviews_df[reviews_df['business_id'] == business_id]
    return business_reviews



In [9]:
def get_reviews_for_user(user_id, reviews_df):
    # Filter the dataset for the given user_id
    user_reviews = reviews_df[reviews_df['user_id'] == user_id]
    return user_reviews

In [19]:
# Initialize the T5 tokenizer and model
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

def generate_review_with_t5(reviews, business_metadata, prompt, tokenizer, model, max_len=512):
    """
    Generates a review based on all reviews, business metadata, and a given prompt.
    
    Parameters:
    - reviews (list of str): List of review texts.
    - business_metadata (str): The business metadata (e.g., business description).
    - prompt (str): The input prompt to guide the generation.
    - tokenizer: Pre-trained T5 tokenizer.
    - model: Pre-trained T5 model.
    - max_len (int): Maximum length for padding/truncation.
    
    Returns:
    - generated_review (str): The generated review text.
    """
    
    # Tokenize and encode the reviews and business metadata
    review_input = tokenizer(reviews, padding=True, truncation=True, max_length=max_len, return_tensors='pt')
    business_input = tokenizer(business_metadata, padding=True, truncation=True, max_length=max_len, return_tensors='pt')

    # Combine reviews and business metadata into one input string
    combined_input_ids = torch.cat([review_input['input_ids'], business_input['input_ids']], dim=1)

    # Concatenate with the prompt
    prompt_input = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True, max_length=max_len)

    # Get the encoder's output (reviews + business metadata)
    encoder_input_ids = combined_input_ids.to(model.device)
    attention_mask = torch.cat([review_input['attention_mask'], business_input['attention_mask']], dim=1).to(model.device)

    # Generate the output based on the input + prompt
    generated_ids = model.generate(
        input_ids=prompt_input['input_ids'], 
        attention_mask=attention_mask, 
        decoder_input_ids=encoder_input_ids, 
        max_length=max_len
    )
    
    # Decode the generated text
    generated_review = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    return generated_review


In [None]:
sample_reviews = reviews_df.sample(frac=0.001, random_state=42)  # Sample 10% of the reviews for demonstration

#tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
#input_tokenizer = T5Tokenizer.from_pretrained("t5-small")

business_id = 'IkY2ticzHEn4QFn8hQLSWg'  # Example business ID 
business_metadata = get_reviews_for_business(business_id, reviews_df)
business_metadata = combine_reviews_with_sep(business_metadata['text'], tokenizer)
#business_metadata = tokenize_reviews(business_metadata, tokenizer)
user_id = '_BcWyKQL16ndpBdggh2kNA'  # Example user ID
user_reviews = get_reviews_for_user(user_id, reviews_df)
#user_reviews = combine_reviews_with_sep(user_reviews['text'], tokenizer)
#user_reviews = tokenize_reviews(user_reviews, tokenizer)
# Initialize the model


review_text = combine_reviews_with_sep(sample_reviews['text'], tokenizer)
#review_text = tokenize_reviews(review_text, tokenizer)

prompt = "Generate a positive review for Geno's SteakHouse that mentions the food quality and service."
#prompt = input_tokenizer(prompt, return_tensors='pt', padding=True, truncation=True)


Separator token: None + count: 0
Separator token: None
Separator token: None + count: 0
Separator token: None


In [21]:
# Generate a review
generated_review = generate_review_with_t5(review_text, business_metadata, prompt, tokenizer, model)

print(f"Generated Review: {generated_review}")

ValueError: Input length of decoder_input_ids is 599, but `max_length` is set to 512. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.