In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class ReviewDataset(Dataset):
    def __init__(self, reviews, tokenizer, max_len=512):
        self.reviews = reviews
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, item):
        review = self.reviews[item]
        
        # Tokenize the review text and pad/truncate it to the specified max length
        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
        }

In [3]:
import pandas as pd

reviews_df = pd.read_csv('data/test_raw_sample_reviews_PA.csv')
reviews_df = reviews_df.dropna()

reviews_df['text'] = reviews_df['text'].str.replace(r'http\S+', '', regex=True)
reviews_df['text'] = reviews_df['text'].str.replace(r'@\w+', '', regex=True)
reviews_df['text'] = reviews_df['text'].astype(str)
reviews_df['text'] = reviews_df['text'].apply(str)



In [4]:
business_id = 'IkY2ticzHEn4QFn8hQLSWg'  # Example business ID 
business_reviews = reviews_df[reviews_df['business_id'] == business_id]
business_reviews = business_reviews.reset_index(drop=True)

user_id = '_BcWyKQL16ndpBdggh2kNA'  # Example user ID
user_reviews = reviews_df[reviews_df['user_id'] == user_id]

prompt = "Generate a positive review for Geno's that mentions the food quality and service."

In [5]:
sample_df = reviews_df.sample(frac=0.05, random_state=42)

sample_df = pd.concat([sample_df, business_reviews, user_reviews], ignore_index=True)

reviews = sample_df['text'].tolist()
reviews = [str(review) for review in reviews]

# Example usage
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

tokenizer.pad_token = tokenizer.eos_token  # Set the pad token to the end of sentence token

# Create the dataset
dataset = ReviewDataset(reviews, tokenizer)
dataloader = DataLoader(dataset, batch_size=2)


In [6]:
from torch.optim import AdamW
from tqdm import tqdm

# Initialize GPT-2 model
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Initialize optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Fine-tuning loop
def train_gpt2(model, dataloader, optimizer, device, epochs=3):
    model.train()
    
    for epoch in range(epochs):
        total_loss = 0
        
        for batch in tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            optimizer.zero_grad()

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
            loss = outputs.loss

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(dataloader)
        print(f"Epoch {epoch+1} - Average Loss: {avg_loss:.4f}")

# Train the model
train_gpt2(model, dataloader, optimizer, device)


Epoch 1/3:   0%|          | 0/46 [00:00<?, ?it/s]`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
Epoch 1/3: 100%|██████████| 46/46 [02:04<00:00,  2.71s/it]


Epoch 1 - Average Loss: 1.4482


Epoch 2/3: 100%|██████████| 46/46 [01:57<00:00,  2.55s/it]


Epoch 2 - Average Loss: 0.9949


Epoch 3/3: 100%|██████████| 46/46 [01:57<00:00,  2.55s/it]

Epoch 3 - Average Loss: 0.9239





In [6]:
# Define the path to save the model
model_save_path = "./fine_tuned_gpt2"

# Save the trained model and tokenizer
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

print(f"Model and tokenizer saved to {model_save_path}")

NameError: name 'model' is not defined

In [7]:
# Define the path to save the model
model_save_path = "./fine_tuned_gpt2"
# Load the model and tokenizer from the saved directory
model = GPT2LMHeadModel.from_pretrained(model_save_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_save_path)

# Move model to the appropriate device (GPU or CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [12]:
# Define the path to save the model
model_save_path = "./fine_tuned_gpt2"
# Load the model and tokenizer from the saved directory
model = GPT2LMHeadModel.from_pretrained(model_save_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_save_path)

# Move model to the appropriate device (GPU or CPU)
#device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#model.to(device)

def generate_review(model, tokenizer, prompt, device, max_len=100):
    model.eval()  # Set the model to evaluation mode

    # Tokenize the prompt
    encoding = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True, max_length=max_len)

    # Move the input to the device (GPU or CPU)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # Generate text
    generated_ids = model.generate(
        input_ids=input_ids, 
        attention_mask=attention_mask, 
        max_length=max_len,
        num_return_sequences=1,  # Number of sequences to return
        no_repeat_ngram_size=4,  # Prevent repetition
        top_p=0.92,  # Nucleus sampling
        top_k=50,  # Top-k sampling
        temperature=0.7,  # Control randomness
    )

    # Decode the generated text
    generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    if generated_text.startswith(prompt):
        generated_text = generated_text[len(prompt):].strip()
    
    return generated_text

# Example prompt to generate a review
prompt = "Generate a negative review for Geno's Steaks that mentions the food quality and service."
#Example prompt to generate a review
generated_review = generate_review(model, tokenizer, prompt, device)

print(f"Generated Review: {generated_review}")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Review: I'm not a fan of Geno's, but I'm not sure I'd order from them again. I'm a fan of the Steaks, and I'm not going to go back.


In [8]:
!pip install rouge-score
!pip install nltk


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=e6889ea420f0152233b80d12092d9de54e8c1aacfb1c966528f6a8c0b0583486
  Stored in directory: /Users/akshayshinde/Library/Caches/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[

In [5]:
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer

def evaluate_generated_texts(generated_text, reference_texts):
    """
    Evaluates the generated text using BLEU and ROUGE scores.

    Parameters:
    - generated_text (str): The generated review text.
    - reference_texts (list of str): A list of reference (ground truth) review texts.
    
    Returns:
    - bleu_score (float): BLEU score between the generated and reference texts.
    - rouge_scores (dict): ROUGE scores (ROUGE-1, ROUGE-2, ROUGE-L) for the generated text.
    """
    
    # BLEU Score Calculation
    # Tokenize the generated text and reference texts
    reference_tokens = [reference.split() for reference in reference_texts]
    generated_tokens = generated_text.split()

    bleu_score = sentence_bleu(reference_tokens, generated_tokens)
    
    # ROUGE Score Calculation
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = scorer.score(' '.join(reference_texts), generated_text)

    return bleu_score, rouge_scores


# Evaluate the generated review
#bleu_score, rouge_scores = evaluate_generated_texts(generated_review, business_reviews['text'].tolist())

#print(f"BLEU Score: {bleu_score}")
#print(f"ROUGE Scores: {rouge_scores}")


In [6]:
model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

tokenizer.pad_token = tokenizer.eos_token  # Set the pad token to the end of sentence token

prompt = "Generate a positive review for Geno's that mentions the food quality and service."
generated_review = generate_review(model, tokenizer, prompt, device)

print(f"Generated Review: {generated_review}")



Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


: 