In [None]:
import pandas as pd
import os
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

# ---- Tunable -----------------------------------------------------------------
# Number of review records pulled from disk per chunk; lower it if memory is tight.
batch_size = 1000

# ---- Input files (line-delimited JSON; point these at your local copies) -----
review_file = 'yelp_academic_dataset_review.json'
user_file = 'yelp_academic_dataset_user.json'
business_file = 'yelp_academic_dataset_business.json'

# Accumulator that the processing step below fills with the merged rows.
final_dataset = pd.DataFrame()

# The user and business tables are read once, in full, and reused for every
# review chunk (they are treated as static lookup tables).
users_df = pd.read_json(user_file, lines=True)
business_df = pd.read_json(business_file, lines=True)

# The review file is NOT loaded eagerly: with `chunksize` set, read_json hands
# back an iterator that yields DataFrames of at most `batch_size` rows each.
review_chunks = pd.read_json(
    review_file,
    lines=True,
    chunksize=batch_size,
)

# Define a function to process a single chunk
def process_chunk(chunk):
    """Attach user and business statistics to one chunk of reviews.

    Both joins are left joins, so every review row in ``chunk`` is kept even
    when its ``user_id`` or ``business_id`` has no match (the joined columns
    are then NaN).

    Args:
        chunk: DataFrame of reviews. Must contain ``user_id``, ``business_id``,
            ``stars`` and ``text``.

    Returns:
        DataFrame with the columns ``stars_x``, ``text``, ``review_count_x``,
        ``average_stars``, ``stars_y``, ``review_count_y``. The ``_x``/``_y``
        suffixes are pandas' defaults for clashing names in the second merge:
        ``_x`` is the left side (review stars, user review count) and ``_y``
        is the right side (business stars, business review count).
    """
    user_stats = users_df[['user_id', 'review_count', 'average_stars']]
    business_stats = business_df[['business_id', 'stars', 'review_count']]

    # Join order matters: the user columns must already be present when the
    # business columns arrive, so that the name clash produces _x / _y.
    with_users = chunk.merge(user_stats, on='user_id', how='left')
    with_business = with_users.merge(business_stats, on='business_id', how='left')

    wanted_columns = [
        'stars_x',
        'text',
        'review_count_x',
        'average_stars',
        'stars_y',
        'review_count_y',
    ]
    return with_business[wanted_columns]

# Fan the review chunks out to a thread pool, then stitch the results together.
#
# - os.cpu_count() may return None, hence the `or 1` fallback.
# - The number of chunks is unknown until the reader is exhausted, so the
#   progress bar is created without a total (it shows a running count).
# - The bar is the OUTER context manager: the executor shuts down (and all
#   done-callbacks fire) before the bar is closed, even if an error occurs.
with tqdm(desc="Processing", unit="chunk") as progress_bar, \
        ThreadPoolExecutor(max_workers=(os.cpu_count() or 1) * 2) as executor:
    futures = []
    for chunk in review_chunks:
        future = executor.submit(process_chunk, chunk)
        # Tick the bar when the chunk has actually finished processing
        future.add_done_callback(lambda _finished: progress_bar.update(1))
        futures.append(future)
    processed_chunks = len(futures)

    # result() blocks until the chunk is done and re-raises any worker error.
    # Iterating in submission order keeps the rows in file order.
    processed_frames = [future.result() for future in futures]

# Clearer names for the merged columns (same order as process_chunk returns)
output_columns = [
    'review_stars',
    'review_text',
    'user_review_count',
    'user_average_stars',
    'business_stars',
    'business_review_count',
]

if processed_frames:
    # A single concat instead of repeated DataFrame.append: append was removed
    # in pandas 2.0 and re-copied the whole accumulated frame on every call.
    final_dataset = pd.concat(processed_frames, ignore_index=True)
    final_dataset.columns = output_columns
else:
    # Empty review file: still produce a well-formed (header-only) dataset
    final_dataset = pd.DataFrame(columns=output_columns)

# Save the final dataset to a CSV file
final_dataset.to_csv('final_dataset.csv', index=False)

In [None]:
import pandas as pd

# Load the merged review dataset (replace 'final_dataset.csv' with your actual file path)
final_dataset = pd.read_csv('final_dataset.csv')

# Number of rows wanted for each star rating
desired_count_per_rating = 4000

# Fixed seed so that re-running the cell draws the same sample
random_seed = 42

# Draw one sample per star rating (1 to 5) and combine them in a single concat
# at the end, rather than growing a DataFrame inside the loop.
sampled_per_rating = []
for star_rating in range(1, 6):
    # Rows with the current star rating
    filtered_rows = final_dataset[final_dataset['review_stars'] == star_rating]

    if filtered_rows.empty:
        # DataFrame.sample raises ValueError on an empty frame; skip instead
        print(f"Warning: no reviews with {star_rating} star(s); rating skipped")
        continue

    # Sample with replacement only when there are too few rows to reach the
    # desired count; otherwise every sampled row is distinct.
    needs_replacement = len(filtered_rows) < desired_count_per_rating
    sampled_rows = filtered_rows.sample(
        n=desired_count_per_rating,
        replace=needs_replacement,
        random_state=random_seed,
    )
    sampled_per_rating.append(sampled_rows)

if sampled_per_rating:
    balanced_dataset = pd.concat(sampled_per_rating, ignore_index=True)
else:
    # No rating had any rows: keep the column layout, with zero rows
    balanced_dataset = final_dataset.iloc[0:0].copy()

# NOTE(review): rows containing nulls are sampled as-is here. Dropping them
# after balancing can leave the classes with unequal counts.

# Save the balanced dataset to a new CSV file
balanced_dataset.to_csv('balanced_dataset.csv', index=False)

In [None]:
# Discard every row that has at least one missing value and renumber the
# remaining rows from 0, in one chained expression (no in-place mutation).
balanced_dataset = balanced_dataset.dropna().reset_index(drop=True)

# Show the cleaned DataFrame
print(balanced_dataset)