## Data Ingestion and Labelling

In [2]:
import json

In [3]:
# Path to the review dataset file; it is passed to load_review_dataset, which
# parses each line of the file as a separate JSON document.
review_path = "yelp_academic_dataset_review.json"

In [4]:
# Function to load the review dataset, reporting progress as it goes
def load_review_dataset(file_path, chunk_size=10000):
    """Load a JSON Lines file (one JSON document per line) into a list.

    The whole file is materialised in memory; ``chunk_size`` only controls
    how often a progress message is printed, it does not chunk the data.

    Args:
        file_path: Path of the UTF-8 encoded file to read.
        chunk_size: Print a progress message every ``chunk_size`` loaded
            records. ``0`` or ``None`` disables progress messages.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    print(f"Loading {file_path}...")
    data = []

    # Read the JSON file line by line
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank / whitespace-only lines carry no record and would make
            # json.loads raise, so skip them instead of aborting the load.
            if not line.strip():
                continue
            data.append(json.loads(line))  # Load each line as a dictionary
            # A falsy chunk_size disables progress output and avoids a
            # modulo-by-zero error.
            if chunk_size and len(data) % chunk_size == 0:
                print(f"Loaded {len(data)} lines...")

    print(f"Completed loading {file_path}. Total lines: {len(data)}")
    return data  # Return the dataset as a list of dictionaries

In [5]:
# Read the whole review file into memory as a list of dictionaries
review_data = load_review_dataset(file_path=review_path)

# Sanity check: show the first parsed record
print("First record in the review dataset:", review_data[0], sep="\n")

Loading yelp_academic_dataset_review.json...
Loaded 10000 lines...
Loaded 20000 lines...
Loaded 30000 lines...
Loaded 40000 lines...
Loaded 50000 lines...
Loaded 60000 lines...
Loaded 70000 lines...
Loaded 80000 lines...
Loaded 90000 lines...
Loaded 100000 lines...
Loaded 110000 lines...
Loaded 120000 lines...
Loaded 130000 lines...
Loaded 140000 lines...
Loaded 150000 lines...
Loaded 160000 lines...
Loaded 170000 lines...
Loaded 180000 lines...
Loaded 190000 lines...
Loaded 200000 lines...
Loaded 210000 lines...
Loaded 220000 lines...
Loaded 230000 lines...
Loaded 240000 lines...
Loaded 250000 lines...
Loaded 260000 lines...
Loaded 270000 lines...
Loaded 280000 lines...
Loaded 290000 lines...
Loaded 300000 lines...
Loaded 310000 lines...
Loaded 320000 lines...
Loaded 330000 lines...
Loaded 340000 lines...
Loaded 350000 lines...
Loaded 360000 lines...
Loaded 370000 lines...
Loaded 380000 lines...
Loaded 390000 lines...
Loaded 400000 lines...
Loaded 410000 lines...
Loaded 420000 lines..

In [6]:
# Keep only the fields used downstream (text, stars, cool, funny, useful),
# building a fresh dict per review so the raw records are left untouched.
filtered_reviews = [
    {field: record[field] for field in ("text", "stars", "cool", "funny", "useful")}
    for record in review_data
]

# Report how many records were kept and preview the first one
print(f"Filtered reviews: {len(filtered_reviews)} records")
print("First filtered record:", filtered_reviews[0], sep="\n")

Filtered reviews: 6990280 records
First filtered record:
{'text': "If you decide to eat here, just be aware it is going to take about 2 hours from beginning to end. We have tried it multiple times, because I want to like it! I have been to it's other locations in NJ and never had a bad experience. \n\nThe food is good, but it takes a very long time to come out. The waitstaff is very young, but usually pleasant. We have just had too many experiences where we spent way too long waiting. We usually opt for another diner or restaurant on the weekends, in order to be done quicker.", 'stars': 3.0, 'cool': 0, 'funny': 0, 'useful': 0}


In [7]:
# Function to label sentiments
def label_sentiment(stars):
    """Map a star rating to a coarse sentiment label.

    Args:
        stars: Numeric star rating (int or float).

    Returns:
        "Positive" for ratings of 4 and above, "Neutral" for ratings from 3
        up to (but not including) 4, and "Negative" for anything lower.
    """
    if stars >= 4:
        return "Positive"
    # Threshold rather than equality: a fractional rating such as 3.5 must
    # not fall through to the "Negative" branch.
    if stars >= 3:
        return "Neutral"
    return "Negative"

# Attach a "sentiment" key to every filtered review; the dictionaries are
# updated in place, so filtered_reviews itself gains the new field.
for review in filtered_reviews:
    review.update(sentiment=label_sentiment(review["stars"]))

# Preview the first labelled record
print("First record with sentiment:", filtered_reviews[0], sep="\n")

First record with sentiment:
{'text': "If you decide to eat here, just be aware it is going to take about 2 hours from beginning to end. We have tried it multiple times, because I want to like it! I have been to it's other locations in NJ and never had a bad experience. \n\nThe food is good, but it takes a very long time to come out. The waitstaff is very young, but usually pleasant. We have just had too many experiences where we spent way too long waiting. We usually opt for another diner or restaurant on the weekends, in order to be done quicker.", 'stars': 3.0, 'cool': 0, 'funny': 0, 'useful': 0, 'sentiment': 'Neutral'}


## Data Preprocessing

In [9]:
from transformers import AutoTokenizer

# Load the pre-trained "bert-base-uncased" tokenizer, requesting the fast
# implementation (use_fast=True). The resulting module-level `tokenizer` is
# what preprocess_text_with_tokenizer calls.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)
print("Tokenizer loaded successfully!")

Tokenizer loaded successfully!


In [10]:
def preprocess_text_with_tokenizer(batch, max_length=128, device=None):
    """Tokenize a batch of texts and move the encoded tensors to a device.

    Uses the notebook-level ``tokenizer``. The tokenizer call produces
    PyTorch tensors; only the final ``.to(device)`` moves them.

    Args:
        batch: The text input handed to the tokenizer (e.g. a list of
            review strings).
        max_length: Maximum sequence length; longer inputs are truncated.
        device: Target device for the returned tensors. When ``None``,
            "cuda" is used if a CUDA device is available, otherwise "cpu",
            so the function also works on machines without a GPU.

    Returns:
        The tokenizer output with its tensors placed on ``device``.
    """
    if device is None:
        # Imported locally so this cell does not depend on a torch import
        # elsewhere in the notebook.
        import torch
        device = "cuda" if torch.cuda.is_available() else "cpu"

    tokenized = tokenizer(
        batch,
        padding=True,            # Pad sentences to the same length
        truncation=True,         # Truncate long sentences
        max_length=max_length,   # Limit sequence length (128 tokens by default)
        return_tensors="pt"      # Return PyTorch tensors
    ).to(device)                 # Move the result to the selected device
    return tokenized

In [11]:
import random

# Define subset size
subset_size = 1000000

# Dedicated, seeded generator: the same subset is drawn on every
# Restart & Run All, and the global `random` state is left untouched.
subset_rng = random.Random(42)

# Randomly sample a subset of the data. The sample size is capped at the
# number of available reviews because random.sample raises ValueError when
# asked for more items than the population holds.
subset_reviews = subset_rng.sample(
    filtered_reviews,
    min(subset_size, len(filtered_reviews)),
)

# Display the size of the subset
print(f"Subset size: {len(subset_reviews)} reviews")
print("First review in the subset:")
print(subset_reviews[0])

Subset size: 1000000 reviews
First review in the subset:
{'text': 'Good restaurant.  We have been there many times and the food is consistently good.  Great decor, beautiful bar.  During warmer weather. A really nice outside area right off the bar.', 'stars': 3.0, 'cool': 0, 'funny': 0, 'useful': 0, 'sentiment': 'Neutral'}
