## Data Ingestion and Labelling

In [1]:
import json

In [2]:
# Path to the review dataset file. It is a relative path, so it is resolved
# against the notebook's current working directory when the file is opened.
review_path = "yelp_academic_dataset_review.json"

In [10]:
# Function to load the review dataset without excessive logs
def load_review_dataset(file_path, chunk_size=10000):
    """Load a JSON Lines file into a list of dictionaries.

    Args:
        file_path: Path to a UTF-8 text file holding one JSON object per line.
        chunk_size: Unused. Kept only so existing calls that pass it keep
            working.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    print(f"Loading {file_path}...")

    with open(file_path, 'r', encoding='utf-8') as f:
        # Blank lines (for example a stray trailing newline) carry no record;
        # skipping them avoids a JSONDecodeError on otherwise valid files.
        data = [json.loads(line) for line in f if line.strip()]

    print(f"Completed loading {file_path}. Total lines: {len(data)}")
    return data  # Return the dataset as a list of dictionaries

In [11]:
# Parse the whole review file into memory (one dictionary per record)
review_data = load_review_dataset(review_path)

# Sanity check: show the first parsed record
print("First record in the review dataset:", review_data[0], sep="\n")

Loading yelp_academic_dataset_review.json...
Completed loading yelp_academic_dataset_review.json. Total lines: 6990280
First record in the review dataset:
{'review_id': 'KU_O5udG6zpxOg-VcAEodg', 'user_id': 'mh_-eMZ6K5RLWhZyISBhwA', 'business_id': 'XQfwVwDr-v0ZS3_CbbE5Xw', 'stars': 3.0, 'useful': 0, 'funny': 0, 'cool': 0, 'text': "If you decide to eat here, just be aware it is going to take about 2 hours from beginning to end. We have tried it multiple times, because I want to like it! I have been to it's other locations in NJ and never had a bad experience. \n\nThe food is good, but it takes a very long time to come out. The waitstaff is very young, but usually pleasant. We have just had too many experiences where we spent way too long waiting. We usually opt for another diner or restaurant on the weekends, in order to be done quicker.", 'date': '2018-07-07 22:09:11'}


In [16]:
# Keep only the fields used later: text, stars, cool, funny, useful
kept_fields = ("text", "stars", "cool", "funny", "useful")
filtered_reviews = [
    {field: entry[field] for field in kept_fields}
    for entry in review_data
]

# Report how many records were kept and show the first one
print(f"Filtered reviews: {len(filtered_reviews)} records")
print("First filtered record:", filtered_reviews[0], sep="\n")

Filtered reviews: 6990280 records
First filtered record:
{'text': "If you decide to eat here, just be aware it is going to take about 2 hours from beginning to end. We have tried it multiple times, because I want to like it! I have been to it's other locations in NJ and never had a bad experience. \n\nThe food is good, but it takes a very long time to come out. The waitstaff is very young, but usually pleasant. We have just had too many experiences where we spent way too long waiting. We usually opt for another diner or restaurant on the weekends, in order to be done quicker.", 'stars': 3.0, 'cool': 0, 'funny': 0, 'useful': 0}


In [17]:
# Function to label sentiments
def label_sentiment(stars):
    """Map a star rating to a sentiment label.

    Args:
        stars: Numeric rating (int or float).

    Returns:
        "Positive" for ratings of 4 or more, "Neutral" for ratings from 3 up
        to (but not including) 4, and "Negative" for everything else.
    """
    if stars >= 4:
        return "Positive"
    # Use a range rather than `== 3` so a fractional rating such as 3.5 is
    # not mislabelled as "Negative".
    if stars >= 3:
        return "Neutral"
    return "Negative"

# Attach a sentiment label to every filtered review (the dicts are updated in place)
for review in filtered_reviews:
    review.update(sentiment=label_sentiment(review["stars"]))

# Show one labelled record as a quick check
print("First record with sentiment:", filtered_reviews[0], sep="\n")

First record with sentiment:
{'text': "If you decide to eat here, just be aware it is going to take about 2 hours from beginning to end. We have tried it multiple times, because I want to like it! I have been to it's other locations in NJ and never had a bad experience. \n\nThe food is good, but it takes a very long time to come out. The waitstaff is very young, but usually pleasant. We have just had too many experiences where we spent way too long waiting. We usually opt for another diner or restaurant on the weekends, in order to be done quicker.", 'stars': 3.0, 'cool': 0, 'funny': 0, 'useful': 0, 'sentiment': 'Neutral'}


## Data Preprocessing

In [28]:
from transformers import AutoTokenizer
import torch

# Load the pre-trained "bert-base-uncased" tokenizer, asking for the fast
# implementation via use_fast=True
tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-uncased",
    use_fast=True,
)
print("Tokenizer loaded successfully!")

Tokenizer loaded successfully!


In [20]:
def preprocess_text_with_tokenizer(batch, max_length=128, device=None):
    """Tokenize a batch of texts and move the encoded tensors to a device.

    The tokenizer call runs first; only its output tensors are transferred to
    ``device`` afterwards.

    Args:
        batch: List of strings to tokenize.
        max_length: Maximum number of tokens per sequence; longer sequences
            are truncated.
        device: Target device for the returned tensors. When ``None``, "cuda"
            is used if available, otherwise "cpu", so the function no longer
            fails on machines without a GPU.

    Returns:
        The tokenizer output (PyTorch tensors) moved to ``device``. Because
        ``padding=True`` is used, rows are padded to the longest sequence in
        this batch, so the width can differ from one batch to the next.
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    tokenized = tokenizer(
        batch,
        padding=True,           # Pad sentences to the same length within the batch
        truncation=True,        # Truncate long sentences
        max_length=max_length,  # Limit sequence length
        return_tensors="pt",    # Return PyTorch tensors
    )
    return tokenized.to(device)

In [21]:
import random

# Define subset size
subset_size = 1000000

# Fixed seed so a fresh kernel run draws the same subset every time
sample_seed = 42

# Never ask for more records than exist; random.sample raises ValueError otherwise
sample_count = min(subset_size, len(filtered_reviews))

# Randomly sample a subset of the data. A dedicated Random instance keeps the
# draw reproducible without reseeding the global random state. The sampled
# list holds the same dict objects as filtered_reviews, not copies.
subset_reviews = random.Random(sample_seed).sample(filtered_reviews, sample_count)

# Display the size of the subset
print(f"Subset size: {len(subset_reviews)} reviews")
print("First review in the subset:")
print(subset_reviews[0])

Subset size: 1000000 reviews
First review in the subset:
{'text': 'Hibachi Station was delicious and hot and fresh clean restaurant.. stacked bar... Clean restroom', 'stars': 3.0, 'cool': 0, 'funny': 0, 'useful': 0, 'sentiment': 'Neutral'}


In [30]:
# Define batch size
batch_size = 100000

# Extract text data from the subset
texts = [review["text"] for review in subset_reviews]

# Ceiling division gives the true number of batches. The previous
# `len(texts) // batch_size + 1` over-counted by one whenever len(texts) was
# an exact multiple of batch_size (e.g. "out of 11" for 10 batches).
num_batches = (len(texts) + batch_size - 1) // batch_size

# Process the text in batches
processed_batches = []
for i in range(0, len(texts), batch_size):
    print(f"Processing batch {i // batch_size + 1} out of {num_batches}...")
    batch = texts[i:i + batch_size]
    tokenized_batch = preprocess_text_with_tokenizer(batch)  # Tokenize, then move tensors to the device
    processed_batches.append(tokenized_batch["input_ids"])  # Only store input IDs

# The tokenizer pads each batch to its own longest sequence, so batch widths
# can differ. Right-pad every batch to the widest one so concatenation along
# dim 0 cannot fail on a size mismatch.
target_width = max(batch_ids.shape[1] for batch_ids in processed_batches)
processed_batches = [
    torch.nn.functional.pad(
        batch_ids,
        (0, target_width - batch_ids.shape[1]),
        value=tokenizer.pad_token_id,
    )
    for batch_ids in processed_batches
]

# Flatten the processed batches into a single tensor
processed_tensors = torch.cat(processed_batches, dim=0)

# Store processed token ids back into the subset_reviews dataset. Convert to
# Python lists with a single transfer to host memory instead of one
# `.tolist()` call per row.
processed_rows = processed_tensors.cpu().tolist()
for review, token_ids in zip(subset_reviews, processed_rows):
    review["processed_text"] = token_ids

# Display the first record with processed text
print("First record after GPU-accelerated tokenization:")
print(subset_reviews[0])

Processing batch 1 out of 11...
Processing batch 2 out of 11...
Processing batch 3 out of 11...
Processing batch 4 out of 11...
Processing batch 5 out of 11...
Processing batch 6 out of 11...
Processing batch 7 out of 11...
Processing batch 8 out of 11...
Processing batch 9 out of 11...
Processing batch 10 out of 11...
First record after GPU-accelerated tokenization:
{'text': 'Hibachi Station was delicious and hot and fresh clean restaurant.. stacked bar... Clean restroom', 'stars': 3.0, 'cool': 0, 'funny': 0, 'useful': 0, 'sentiment': 'Neutral', 'processed_text': [101, 7632, 7693, 2072, 2276, 2001, 12090, 1998, 2980, 1998, 4840, 4550, 4825, 1012, 1012, 16934, 3347, 1012, 1012, 1012, 4550, 28249, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

### Stopword Removal

In [41]:
import nltk
from nltk.corpus import stopwords

# Download NLTK stopwords
nltk.download("stopwords")

# Load English stopwords
stop_words = set(stopwords.words("english"))

# Convert stopwords to token IDs
stopword_ids = tokenizer.convert_tokens_to_ids(list(stop_words))

# Drop ids for stopwords that are not single tokens in the tokenizer's
# vocabulary. Such words are mapped to the unknown-token id rather than None,
# so filtering on None alone would leave that id in the list and cause every
# unknown token to be removed as if it were a stopword. The None check is kept
# as a defensive guard. Duplicates are removed and the result stays a list.
stopword_ids = sorted(
    {
        token_id
        for token_id in stopword_ids
        if token_id is not None and token_id != tokenizer.unk_token_id
    }
)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\chetn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [45]:
from torch.nn.utils.rnn import pad_sequence

# Function to remove stopwords and pad sequences
def remove_stopwords_and_pad(tokenized_batch, max_length=128):
    """Drop stopword token ids from each sequence and pad to a fixed width.

    Args:
        tokenized_batch: Mapping with an "input_ids" entry holding the token
            id sequences, either as nested lists or as a tensor.
        max_length: Width of every returned row. Longer rows are truncated
            and shorter rows are right-padded with the tokenizer's pad id.

    Returns:
        An integer (int64) tensor of shape (number of sequences, max_length).
        Every call returns the same width, so results from different batches
        can be concatenated safely. An empty batch yields a (0, max_length)
        tensor.
    """
    token_ids = tokenized_batch["input_ids"]
    if hasattr(token_ids, "tolist"):
        # Accept tensors as well as nested lists
        token_ids = token_ids.tolist()

    # A set gives constant-time membership tests; the module-level
    # stopword_ids is a list, which would be scanned once per token.
    stop_ids = set(stopword_ids)
    pad_id = tokenizer.pad_token_id

    rows = []
    for seq in token_ids:
        # Remove stopwords, then truncate to max_length
        kept = [token for token in seq if token not in stop_ids][:max_length]
        # Right-pad so that every row is exactly max_length wide
        kept.extend([pad_id] * (max_length - len(kept)))
        rows.append(kept)

    if not rows:
        return torch.empty((0, max_length), dtype=torch.long)
    return torch.tensor(rows, dtype=torch.long)

In [49]:
# Apply stopword removal and padding to all batches. Each result is placed on
# the same device as its source batch instead of a hard-coded "cuda", so the
# cell also runs on machines without a GPU.
processed_batches_no_stopwords = [
    remove_stopwords_and_pad({"input_ids": batch.tolist()}, max_length=128).to(batch.device)
    for batch in processed_batches
]

# Flatten the processed batches without stopwords into a single tensor
processed_tensors_no_stopwords = torch.cat(processed_batches_no_stopwords, dim=0)

# Convert to Python lists with a single transfer to host memory instead of one
# `.tolist()` call per row.
rows_no_stopwords = processed_tensors_no_stopwords.cpu().tolist()

# Guard against stale state, e.g. the subset being re-sampled without re-tokenizing
assert len(rows_no_stopwords) == len(subset_reviews), (
    "processed_batches does not match subset_reviews; re-run the tokenization cell"
)

# Update the subset_reviews dataset with stopword-removed tokens
for review, token_ids in zip(subset_reviews, rows_no_stopwords):
    review["processed_text_no_stopwords"] = token_ids

# Display the first record with stopwords removed
print("First record after stopword removal and padding:")
print(subset_reviews[0])

First record after stopword removal and padding:
{'text': 'Hibachi Station was delicious and hot and fresh clean restaurant.. stacked bar... Clean restroom', 'stars': 3.0, 'cool': 0, 'funny': 0, 'useful': 0, 'sentiment': 'Neutral', 'processed_text': [101, 7632, 7693, 2072, 2276, 2001, 12090, 1998, 2980, 1998, 4840, 4550, 4825, 1012, 1012, 16934, 3347, 1012, 1012, 1012, 4550, 28249, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'processed_text_no_stopwords': [101, 7632, 7693, 2072, 2276, 12090, 2980, 4840, 4550, 4825, 1012, 1012, 16934, 3347, 1012, 1012, 1012, 4550, 28249, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

## Processed Data

In [51]:
# Integer class id for each sentiment label
sentiment_mapping = dict(Positive=2, Neutral=1, Negative=0)

# Add the numeric label to every review in the subset (updated in place)
for review in subset_reviews:
    sentiment_name = review["sentiment"]
    review["label"] = sentiment_mapping[sentiment_name]

In [55]:
# Write the labelled subset to disk as indented UTF-8 JSON.
# NOTE(review): with indent=4 every element of the token-id lists is written
# on its own line, so the output file may be very large.
with open("labeled_reviews.json", mode="w", encoding="utf-8") as f:
    json.dump(
        subset_reviews,
        f,
        indent=4,
        ensure_ascii=False,
    )

print("Dataset saved as labeled_reviews.json")

Dataset saved as labeled_reviews.json


In [60]:
# Report the final size of the labelled subset
total_records = len(subset_reviews)
print(f"Total number of records in subset_reviews: {total_records}")

Total number of records in subset_reviews: 1000000
