In [40]:
import pandas as pd
import json
import numpy as np
import ast
import re

In [41]:
def _parse_record(line):
    """Parse one line of a line-delimited file into a dict.

    The line is tried as JSON first and, failing that, as a Python literal
    (for files written with single-quoted ``repr``-style dicts).

    Returns:
        The parsed dict, or ``None`` when the line is neither valid JSON nor a
        valid Python literal, or when it parses to something other than a dict.
    """
    payload = line.strip()
    try:
        record = json.loads(payload)
        if isinstance(record, dict):
            return record
    except (ValueError, RecursionError):
        # json.JSONDecodeError is a ValueError subclass; fall through to the
        # Python-literal parser below.
        pass
    try:
        # literal_eval only evaluates literals, so it does not execute code.
        record = ast.literal_eval(payload)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return None
    return record if isinstance(record, dict) else None


def _iter_records(path):
    """Yield one dict per parseable, non-blank line of ``path``.

    Malformed lines are skipped (best effort); when any were skipped, a
    one-line summary is printed once the file has been fully read.
    """
    skipped = 0
    with open(path, 'r', encoding='utf-8') as handle:
        for line in handle:
            if not line.strip():
                continue
            record = _parse_record(line)
            if record is None:
                skipped += 1
                continue
            yield record
    if skipped:
        print(f"Skipped {skipped} malformed line(s) in {path}")


def stream_and_combine_data(reviews_path, metadata_path):
    """Join reviews with their business names into a single DataFrame.

    Both inputs are read line by line (one record per line), so the raw file
    text is never held in memory as a whole. The ``gmap_id -> name`` lookup
    and the resulting review rows are kept in memory.

    Args:
        reviews_path: Path to the line-delimited reviews file.
        metadata_path: Path to the line-delimited business metadata file.

    Returns:
        A DataFrame with the columns ``business_name``, ``user_name``,
        ``rating`` and ``text`` (always present, even when no review could be
        read). Reviews whose ``gmap_id`` has no metadata entry get the
        business name ``'Unknown Business'``; reviews without a ``name`` get
        ``'Anonymous User'``.
    """
    # Only the business name is needed from the metadata file.
    metadata_dict = {}
    for business in _iter_records(metadata_path):
        try:
            metadata_dict[business.get('gmap_id')] = business.get('name', 'Unknown Business')
        except TypeError:
            # Unhashable gmap_id (e.g. a list): treat the record as malformed.
            continue

    combined_data = []
    for review in _iter_records(reviews_path):
        try:
            business_name = metadata_dict.get(review.get('gmap_id'), 'Unknown Business')
        except TypeError:
            # Unhashable gmap_id: treat the record as malformed.
            continue
        combined_data.append({
            'business_name': business_name,
            'user_name': review.get('name', 'Anonymous User'),
            'rating': review.get('rating'),
            'text': review.get('text', '')
        })

    # Fixing the columns keeps the schema stable when there are no rows.
    return pd.DataFrame(
        combined_data,
        columns=['business_name', 'user_name', 'rating', 'text'],
    )

In [42]:
def remove_duplicate_reviews(df):
    """Drop repeated reviews and report how many rows were removed.

    Two rows count as the same review when they agree on ``user_name``,
    ``business_name`` and ``text_clean``; the first occurrence is kept.

    Args:
        df: DataFrame containing the three key columns above.

    Returns:
        A new DataFrame without the duplicate rows; ``df`` is not modified.
    """
    rows_before = len(df)
    deduped = df.drop_duplicates(subset=['user_name', 'business_name', 'text_clean'])
    removed = rows_before - len(deduped)
    print(f"Removed {removed} duplicate reviews")
    return deduped

In [43]:
def clean_text(text):
    """Normalise a review text for comparison and analysis.

    Characters other than word characters, whitespace and the punctuation
    ``. , ! ? @ # $ % & * ( ) - '`` are removed, every run of whitespace is
    collapsed to a single space, and leading/trailing whitespace is trimmed.

    Args:
        text: The raw review text. Any non-string value yields ``""``.

    Returns:
        The cleaned string.
    """
    if not isinstance(text, str):
        return ""

    # Strip disallowed characters first: removing a symbol that sat between
    # or next to spaces would otherwise reintroduce doubled, leading or
    # trailing spaces after the whitespace had already been normalised.
    text = re.sub(r"[^\w\s.,!?@#$%&*()\-\']", "", text)
    text = re.sub(r'\s+', ' ', text)  # Collapse whitespace runs (incl. newlines/tabs)

    return text.strip()

In [44]:
def clean_review_data(df):
    """Clean the combined review frame and return the result as a new frame.

    Steps, in order: fill missing text/names with defaults, drop rows with a
    missing rating, add a ``text_clean`` column via ``clean_text``, coerce
    ratings to numbers and keep only those in the range 1-5 (stored as int),
    trim whitespace around names, and remove duplicate reviews via
    ``remove_duplicate_reviews``.

    Args:
        df: DataFrame with the columns ``business_name``, ``user_name``,
            ``rating`` and ``text``.

    Returns:
        A cleaned copy of ``df`` with the extra column ``text_clean``. The
        input frame is not modified. Progress counts are printed.
    """
    # Work on a copy so the caller's frame is left untouched.
    df_clean = df.copy()

    # Handle missing values
    df_clean['text'] = df_clean['text'].fillna('').astype(str)
    df_clean['user_name'] = df_clean['user_name'].fillna('Anonymous User')
    df_clean['business_name'] = df_clean['business_name'].fillna('Unknown Business')

    # Remove rows with missing ratings
    initial_count = len(df_clean)
    df_clean = df_clean.dropna(subset=['rating'])
    print(f"Removed {initial_count - len(df_clean)} rows with missing ratings")

    # Clean text data
    df_clean['text_clean'] = df_clean['text'].apply(clean_text)

    # Unparseable ratings become NaN here and fail the range test below.
    df_clean['rating'] = pd.to_numeric(df_clean['rating'], errors='coerce')

    # Keep only ratings between 1 and 5 (inclusive). The explicit copy makes
    # the filtered frame independent before columns are assigned on it.
    count_before_range_filter = len(df_clean)
    in_range = (df_clean['rating'] >= 1) & (df_clean['rating'] <= 5)
    df_clean = df_clean[in_range].copy()
    invalid_count = count_before_range_filter - len(df_clean)
    if invalid_count:
        # Previously these rows disappeared without any report.
        print(f"Removed {invalid_count} rows with invalid or out-of-range ratings")
    # NOTE: astype(int) truncates any fractional rating (e.g. 4.5 -> 4).
    df_clean['rating'] = df_clean['rating'].astype(int)

    # Trim names. Casting to str first keeps non-string names (e.g. numeric
    # ones) instead of letting the .str accessor turn them into NaN.
    df_clean['user_name'] = df_clean['user_name'].astype(str).str.strip()
    df_clean['business_name'] = df_clean['business_name'].astype(str).str.strip()

    # Remove duplicates
    df_clean = remove_duplicate_reviews(df_clean)

    print(f"Final clean dataset shape: {df_clean.shape}")
    return df_clean

In [45]:
# Build the combined review/business-name frame from the two source files,
# report its shape, and preview the first rows as the cell output.
df = stream_and_combine_data('review-other.json', 'meta-other.json')
print(f"Initial combined dataset shape: {df.shape}")
df.head()

Initial combined dataset shape: (162952, 4)


Unnamed: 0,business_name,user_name,rating,text
0,Pawtastic Cuts Mobile Grooming,Amber Thibeault,5,Andrea is amazing. Our dog loves her and she a...
1,Pawtastic Cuts Mobile Grooming,Esther,5,Andrea does a wonderful job with our wild Pr...
2,Pawtastic Cuts Mobile Grooming,Bob Barrett,1,Never called back
3,Pawtastic Cuts Mobile Grooming,Luz Quiles,3,They don't answer the phones
4,Pawtastic Cuts Mobile Grooming,Tim Sanderson,3,Limited information on the website


In [46]:
# Run the cleaning pipeline on the combined frame. clean_review_data works on
# a copy, so `df` itself is left unchanged; preview the cleaned rows.
df_clean = clean_review_data(df)
df_clean.head()

Removed 0 rows with missing ratings
Removed 7928 duplicate reviews
Final clean dataset shape: (155024, 5)


Unnamed: 0,business_name,user_name,rating,text,text_clean
0,Pawtastic Cuts Mobile Grooming,Amber Thibeault,5,Andrea is amazing. Our dog loves her and she a...,Andrea is amazing. Our dog loves her and she a...
1,Pawtastic Cuts Mobile Grooming,Esther,5,Andrea does a wonderful job with our wild Pr...,Andrea does a wonderful job with our wild Prin...
2,Pawtastic Cuts Mobile Grooming,Bob Barrett,1,Never called back,Never called back
3,Pawtastic Cuts Mobile Grooming,Luz Quiles,3,They don't answer the phones,They don't answer the phones
4,Pawtastic Cuts Mobile Grooming,Tim Sanderson,3,Limited information on the website,Limited information on the website
