In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Disabled snippet: extract a zipped dataset before use.
# NOTE(review): the comment at the top of this cell describes the input
# directory as read-only, so DEST probably has to point somewhere writable
# (e.g. under /kaggle/working/) before this is re-enabled — confirm.
# DATA_SET_ZIP = "/kaggle/input/yelp-full/file.zip"
# DEST = "/kaggle/input/yelp-full/unzipped/"
# os.makedirs(DEST, exist_ok=True)
# # Unzip dataset
# import zipfile
# with zipfile.ZipFile(DATA_SET_ZIP, 'r') as zip_ref:
#     zip_ref.extractall(DEST)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/large-yaml/reviews2.pkl
/kaggle/input/yelp-small/yelp_subset_review.json
/kaggle/input/150-epochs-gen-text/keras/default/1/yelp_review_generator-43 perplexity.h5


In [None]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import json
import numpy as np

# Pickled review data read by load_yelp_reviews() via pd.read_pickle.
DATA_SET = "/kaggle/input/large-yaml/reviews2.pkl"
# Saved Keras model; when this path exists it is loaded instead of building a new model.
MODEL = "/kaggle/input/150-epochs-gen-text/keras/default/1/yelp_review_generator-43 perplexity.h5"

# Load your Yelp reviews dataset
def load_yelp_reviews(path=None, text_column="text"):
    """Load the pickled review data and return the review texts.

    The previous version printed a summary but never returned anything, so
    callers received ``None``. This version returns a plain list of strings,
    which is what the rest of the notebook does with the result (slicing,
    iteration, ``str.split`` on individual items).

    Args:
        path: Pickle file to read. Defaults to the module-level ``DATA_SET``.
        text_column: Column holding the review text when the pickle contains
            a DataFrame. The default ``"text"`` is assumed from the public
            Yelp review schema — TODO confirm against the actual pickle.

    Returns:
        list[str]: One entry per review, missing values dropped.

    Raises:
        KeyError: If the pickle holds a DataFrame without ``text_column``.
    """
    source = DATA_SET if path is None else path

    # SECURITY: unpickling can execute arbitrary code; only load trusted files.
    data = pd.read_pickle(source)

    print(f"\nData Loading Summary:")
    print(f"Total reviews loaded: {len(data)}")

    if isinstance(data, pd.DataFrame):
        print(f"Columns available: {data.columns.tolist()}")
        if text_column not in data.columns:
            raise KeyError(
                f"Column {text_column!r} not found; available columns: "
                f"{data.columns.tolist()}"
            )
        return data[text_column].dropna().astype(str).tolist()

    if isinstance(data, pd.Series):
        return data.dropna().astype(str).tolist()

    # Any other pickled iterable (e.g. a list of strings) is returned as a list.
    return list(data)


# Load the reviews
# Expects load_yelp_reviews() to hand back a sliceable, iterable collection of
# review strings: it is sliced here, passed to fit_on_texts, and iterated below.
reviews = load_yelp_reviews()
print(reviews[:10])

# Tokenize the text
max_words = 5000  # Maximum vocabulary size; also sizes the Embedding and Dense layers below
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(reviews)
word_index = tokenizer.word_index
# NOTE(review): vocab_size counts every entry of word_index and is not
# referenced again in the visible cells; the model is sized with max_words.
vocab_size = len(word_index) + 1  # Add 1 for padding token
MAX_LEN = 100

# Create sequences
input_sequences = []


# For each review, keep at most MAX_LEN token ids and emit every prefix of
# length >= 2 (tokens[:2], tokens[:3], ...). The last id of each prefix
# becomes the prediction target after padding.
for review in reviews:
    tokens = tokenizer.texts_to_sequences([review])[0][:MAX_LEN]  # Limit input
    for i in range(1, len(tokens)):
        input_sequences.append(tokens[: i + 1])


# Pad sequences
# Left-pad every prefix to MAX_LEN columns, then split: X is all but the last
# column (MAX_LEN - 1 wide), y is the last column (the next-token id).
input_sequences = pad_sequences(input_sequences, maxlen=MAX_LEN, padding="pre")
X = input_sequences[:, :-1]
y = input_sequences[:, -1]
max_sequence_len = MAX_LEN

# Define model architecture
# NOTE(review): embedding_dim is not used below; the Embedding layer is built
# with a literal 64. Confirm which value is intended.
embedding_dim = 100

# Reuse the saved model when the file exists; otherwise build a fresh
# two-layer LSTM that outputs a softmax over max_words token ids.
if os.path.exists(MODEL):
    print(f"Loading model from: {MODEL}")
    model = tf.keras.models.load_model(MODEL)
else:
    print("Model not found, creating a new one.")
    model = Sequential(
        [
            Embedding(max_words, 64, input_length=MAX_LEN - 1),
            LSTM(64, return_sequences=True),
            Dropout(0.2),
            LSTM(32),
            Dropout(0.2),
            Dense(max_words, activation="softmax"),
        ]
    )
    model.build(input_shape=(None, max_sequence_len - 1))

In [8]:
# Number of passes over (X, y); also embedded in the saved file name below.
epochs = 900
# Compile the model (this also runs when the model was loaded from MODEL).
# sparse_categorical_crossentropy is used because y holds integer token ids.
model.compile(
    loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
)
model.summary()

# Train the model
# NOTE(review): the inline comment mentions MPS/Metal, while the paths in this
# notebook are Kaggle paths — confirm which device "/GPU:0" refers to here.
with tf.device("/GPU:0"):  # On MPS, GPU:0 is the Metal device
    history = model.fit(X, y, epochs=epochs, batch_size=1024, verbose=1)

# Save the model
# Relative path, so the file is written to the current working directory.
model.save(f"yelp_review_generator-epochs_{epochs}.h5")

714/714 ━━━━━━━━━━━━━━━━━━━━ 33s 42ms/step - accuracy: 0.2104 - loss: 4.3579


In [9]:
# Generate new text
def generate(
    seed_text, model, tokenizer, max_sequence_len, temperature=1.0, max_length=100
):
    """
    Continue seed_text one sampled word at a time.

    Generation stops when a sampled word ends with ".", "!" or "?", or after
    max_length sampling steps, whichever comes first.

    Args:
        seed_text (str): The starting text to continue from
        model: Trained model whose predict() returns next-token probabilities
        tokenizer: The tokenizer used during training (needs word_index and
            texts_to_sequences)
        max_sequence_len (int): Sequence length used during training; the
            model input is padded/truncated to max_sequence_len - 1 tokens
        temperature (float): Controls randomness. Higher values increase
            diversity; values <= 0 select the most probable token (greedy)
            instead of dividing by zero.
        max_length (int): Maximum number of sampling steps

    Returns:
        str: The seed_text plus the generated continuation

    NOTE(review): if the tokenizer strips punctuation when it is fitted
    (believed to be the Keras Tokenizer default — confirm), no vocabulary word
    ends in an end-of-sentence character and the loop always runs for
    max_length steps.
    """
    # End of sentence tokens
    eos_tokens = (".", "!", "?")

    # Build the id -> word lookup once instead of scanning word_index on every
    # step. setdefault keeps the first word seen for an id, matching the
    # first-match behaviour of a linear scan.
    index_to_word = {}
    for word, index in tokenizer.word_index.items():
        index_to_word.setdefault(index, word)

    current_text = seed_text

    for _ in range(max_length):
        # Re-tokenize the text so far and left-pad it to the model input width
        token_list = tokenizer.texts_to_sequences([current_text])[0]
        padded = pad_sequences(
            [token_list], maxlen=max_sequence_len - 1, padding="pre"
        )

        # float64 avoids np.random.choice rejecting float32 probabilities
        # whose sum drifts from 1 after renormalisation.
        probs = np.asarray(model.predict(padded, verbose=0)[0], dtype=np.float64)

        if temperature <= 0:
            # Greedy decoding: the limit of temperature sampling as T -> 0
            predicted_index = int(np.argmax(probs))
        else:
            # Zero probabilities become -inf here and 0 again after exp();
            # silence the expected divide-by-zero warning from log(0).
            with np.errstate(divide="ignore"):
                scaled = np.log(probs) / temperature
            scaled -= np.max(scaled)  # stabilise exp() for small temperatures
            weights = np.exp(scaled)
            probs = weights / np.sum(weights)
            predicted_index = int(np.random.choice(len(probs), p=probs))

        # Ids without a word (e.g. the padding id 0) map to "" and are skipped,
        # but still count as a step.
        output_word = index_to_word.get(predicted_index, "")
        if output_word != "":
            current_text += " " + output_word

            # Stop once the sampled word ends a sentence
            if output_word[-1] in eos_tokens:
                break

    return current_text

# Example usage
# generate() samples with np.random.choice and no seed is set in this cell,
# so the printed continuation differs from run to run.
seed_text = "The restaurant atmosphere was"
generated_text = generate(seed_text, model, tokenizer, max_sequence_len)
print(generated_text)

The restaurant atmosphere was <OOV> i asked too again but won't have to come back because of the only the service was really good but i was particularly excited with a friendly atmosphere and service when i had in awhile with tiny low environment wishing after we were looking for <OOV> i know because you can have a wait because they do not make them that a lot of place to spend we had no beef as we don't understand that it took another friends with a dinner party <OOV> with when the wait on gulf all eating inside or we were even <OOV>


In [10]:
import numpy as np

def compute_perplexity(model, X, y, batch_size=4096):
    """Compute perplexity over a dataset.

    Perplexity is exp(mean negative log-probability the model assigns to the
    true next token). Predictions are requested in chunks so that only a
    (batch_size, vocab) probability matrix is held in memory at a time,
    instead of one row per sample for the whole dataset.

    Args:
        model: Object whose predict(X, verbose=0) returns one probability row
            per input row.
        X: Input sequences, indexable by slice.
        y: Integer target ids, one per row of X.
        batch_size (int): Rows per predict() call. None or a non-positive
            value scores everything in a single call.

    Returns:
        float: The perplexity.

    Raises:
        ValueError: If y is empty (the mean would be undefined).
    """
    y = np.asarray(y)
    total = len(y)
    if total == 0:
        raise ValueError("compute_perplexity requires at least one sample")
    if batch_size is None or batch_size <= 0:
        batch_size = total

    neg_log_sum = 0.0
    for start in range(0, total, batch_size):
        stop = start + batch_size
        targets = y[start:stop]
        # float64 keeps the log/sum accurate even when the model emits float32
        y_pred = np.asarray(model.predict(X[start:stop], verbose=0), dtype=np.float64)
        probas = y_pred[np.arange(len(targets)), targets]
        neg_log_sum += float(np.sum(-np.log(probas + 1e-10)))  # avoid log(0)

    return float(np.exp(neg_log_sum / total))

# Example use
# Scores only the first 1000 (X, y) pairs. These come from the same arrays
# that were passed to model.fit, so this is a training-set perplexity.
perplexity_score = compute_perplexity(model, X[:1000], y[:1000])  # use a sample to reduce memory
print(f"Perplexity: {perplexity_score:.2f}")


Perplexity: 44.74


Perplexity: 44.21 after 150 epochs


In [11]:
!pip install rouge_score
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def compute_bleu(reference_text, generated_text):
    """
    Compute BLEU score between reference and generated text.
    Uses smoothing (method4) to avoid zero scores for short sequences.

    Both texts are split on whitespace as-is; no lowercasing or punctuation
    stripping is applied here.

    Args:
        reference_text (str): The ground-truth text.
        generated_text (str): The text to score against the reference.

    Returns:
        dict: BLEU-1 to BLEU-4 scores keyed "BLEU-1" ... "BLEU-4".
    """
    reference = [reference_text.split()]  # reference must be a list of list of tokens
    hypothesis = generated_text.split()   # generated text tokens

    smoothie = SmoothingFunction().method4

    scores = {}
    for order in range(1, 5):
        # Uniform weight over the first `order` n-gram sizes, zero-padded to 4.
        # Using 1/order keeps the weights summing to 1; the previous
        # (0.33, 0.33, 0.33, 0) for BLEU-3 summed to 0.99.
        weights = tuple([1.0 / order] * order + [0.0] * (4 - order))
        scores[f"BLEU-{order}"] = sentence_bleu(
            reference, hypothesis, weights=weights, smoothing_function=smoothie
        )
    return scores


def compute_rouge(reference_text, generated_text):
    """
    Score generated_text against reference_text with ROUGE.

    A stemming RougeScorer is built for rouge1, rouge2 and rougeL, and the
    F-measure of each is returned under the keys "ROUGE-1", "ROUGE-2" and
    "ROUGE-L".
    """
    # scorer metric name -> key used in the returned dictionary
    labels = {'rouge1': 'ROUGE-1', 'rouge2': 'ROUGE-2', 'rougeL': 'ROUGE-L'}

    scorer = rouge_scorer.RougeScorer(list(labels), use_stemmer=True)
    result = scorer.score(reference_text, generated_text)

    return {label: result[metric].fmeasure for metric, label in labels.items()}

# Running sums of each metric across the evaluated samples; divided by
# num_samples at the end to report means.
rouge_totals = {'ROUGE-1': 0, 'ROUGE-2': 0, 'ROUGE-L': 0}
bleu_totals = {'BLEU-1': 0, 'BLEU-2': 0, 'BLEU-3': 0, 'BLEU-4': 0}
num_samples = 10

# Evaluate on the first num_samples entries of `reviews`. These reviews were
# also used to fit the tokenizer and build the training sequences.
for i in range(num_samples):
    original = reviews[i]
    # Seed with the first four whitespace-separated tokens of the reference.
    # The generated text starts with this seed, so those words overlap with
    # the reference by construction.
    seed = " ".join(original.split()[:4])
    # generate() samples randomly, so scores vary between runs.
    generated = generate(seed, model, tokenizer, max_sequence_len)
    
    rouge_scores = compute_rouge(original, generated)
    bleu_scores = compute_bleu(original, generated)
    
    print(f"\nSeed: {seed}")
    print(f"Generated: {generated}")
    print(f"Reference: {original}")
    
    print("ROUGE scores:")
    for k, v in rouge_scores.items():
        print(f"  {k}: {v:.4f}")
        rouge_totals[k] += v
    
    print("BLEU scores:")
    for k, v in bleu_scores.items():
        print(f"  {k}: {v:.4f}")
        bleu_totals[k] += v

# Compute and print means
print("\n--- AVERAGE ROUGE SCORES ---")
for k in rouge_totals:
    print(f"{k}: {rouge_totals[k]/num_samples:.4f}")

print("\n--- AVERAGE BLEU SCORES ---")
for k in bleu_totals:
    print(f"{k}: {bleu_totals[k]/num_samples:.4f}")



Seed: My girlfriend and I
Generated: My girlfriend and I went on the first night to take serving the large good part and the food was amazing and we had the sandwich <OOV> a big piece of bread my husband ordered the grouper and it wasn't as much bowl about the white salad i have never experienced i try the appetizers my meal was pretty bland and pointing to and we ordered a open sauce from the <OOV> picture of my orleans a thursday while in far from the corner to the five tables and google the smell which was an great bar no corkage platter of the worst items
Reference: My girlfriend and I stopped by in Boise for a night and decided to give the fork a try so keep in mind it was our first in only time. Ordered tomato bisque fondue with grilled cheese as an appetizer and that shit went hard as a mothafucka. Shared the coconut rum and curry braised ribs and that she went hard too! Like calm the fuck down y'all! I'm like yoooooo this shit is fuckin delicious
ROUGE scores:
  ROUGE-1: 0.295

### After 150 epochs
##### --- AVERAGE ROUGE SCORES ---
ROUGE-1: 0.2500
ROUGE-2: 0.0608
ROUGE-L: 0.1620

##### --- AVERAGE BLEU SCORES ---
BLEU-1: 0.1559
BLEU-2: 0.0736
BLEU-3: 0.0480
BLEU-4: 0.0307