# Big Data and Cloud Computing - Final Project

## Initial Data Loading

### Author:
Alen Pavlovic

The University of Chicago

In [1]:
import os
import subprocess
import datetime
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from pyspark.sql import functions as F
from pyspark.sql.types import *

In [2]:
# Pandas display settings: show up to 100 rows, and never truncate the
# column list or the text inside a cell.
for option_name, option_value in (
    ('display.max_rows', 100),
    ('display.max_columns', None),
    ('display.max_colwidth', None),
):
    pd.set_option(option_name, option_value)

In [3]:
# Enable eager evaluation so that a Spark DataFrame left as the last
# expression of a cell is displayed without an explicit .show().
# NOTE(review): `spark` is never created in this notebook - presumably the
# kernel provides the session; confirm before running elsewhere.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

In [5]:
# GCS path to the data; the 'reviews_parquet' and 'meta_parquet' datasets
# read by this notebook live directly under this folder.
gcs_folder = 'gs://msca-bdp-data-open/final_project_reviews'

# Intermediate storage bucket - apavlovic; the processed parquet outputs
# ('reviews_processed', 'meta_processed') are written under this prefix.
intermediate_bucket = 'gs://msca-bdp-students-bucket/shared_data/apavlovic/final_project'

In [6]:
# ---------------------------------------------------
# 1. CHECK DATA SIZE IN GCS
# ---------------------------------------------------

def check_folder_size(folder_path):
    """Print the total size of a GCS folder as reported by `gsutil du -s -h`.

    Args:
        folder_path: GCS URI of the folder to measure.

    Returns:
        None. On success every non-empty line gsutil writes to stdout is
        printed with a 'Total directory size: ' prefix. If gsutil cannot be
        started, or exits with a non-zero status, an error report is printed
        instead so that error text is never presented as a size.
    """
    # Pass an argument list (no shell) so characters in the path are never
    # interpreted by a shell.
    cmd = ['gsutil', 'du', '-s', '-h', folder_path]

    try:
        result = subprocess.run(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,  # kept apart from stdout so errors are not reported as sizes
            universal_newlines=True,
        )
    except OSError as e:
        # Raised when the gsutil executable cannot be launched at all.
        print(f'Could not run gsutil: {e}')
        return

    if result.returncode != 0:
        print(f'gsutil exited with status {result.returncode} for {folder_path}')
        error_text = (result.stderr or '').strip()
        if error_text:
            print(error_text)
        return

    for line in (result.stdout or '').splitlines():
        line = line.rstrip()
        if line:
            # rstrip() drops the trailing newline that previously produced
            # an extra blank line after every size.
            print(f'Total directory size: {line}')

In [7]:
# Report how much data sits under the source folder before reading any of it.
print("Checking data size in GCS bucket...")
check_folder_size(gcs_folder)

Checking data size in GCS bucket...
Total directory size: 75.75 GiB    gs://msca-bdp-data-open/final_project_reviews



In [8]:
# ---------------------------------------------------
# 2. BASIC DATA LOADING
# ---------------------------------------------------

def _load_parquet_sample(dataset_dir, label, sample_fraction):
    """Read one parquet dataset under `gcs_folder` and optionally down-sample it.

    Args:
        dataset_dir: Name of the dataset directory below `gcs_folder`.
        label: Human-readable dataset name used in the progress message.
        sample_fraction: Fraction of rows to keep; values >= 1.0 return the
            full dataset without sampling.

    Returns:
        The sampled Spark DataFrame (fixed seed 42) when sample_fraction < 1.0,
        otherwise the full DataFrame.
    """
    print(f"Loading {sample_fraction*100}% sample of {label}...")

    # GCS URIs always use '/', so join explicitly instead of relying on
    # os.path.join, whose separator depends on the host operating system.
    df_full = spark.read.parquet(f"{gcs_folder.rstrip('/')}/{dataset_dir}")

    if sample_fraction < 1.0:
        df_sample = df_full.sample(fraction=sample_fraction, seed=42)
        # count() runs a Spark job over the data just to report the size.
        print(f"Sample size: {df_sample.count():,} records")
        return df_sample

    print(f"Full dataset size: {df_full.count():,} records")
    return df_full


def load_reviews_sample(sample_fraction=0.001):
    """Load a small sample of the reviews data for exploration.

    Args:
        sample_fraction: Fraction of review rows to keep (default 0.001);
            pass 1.0 or more to get the full dataset.

    Returns:
        Spark DataFrame read from '<gcs_folder>/reviews_parquet'.
    """
    return _load_parquet_sample('reviews_parquet', 'reviews data', sample_fraction)


def load_meta_sample(sample_fraction=0.01):
    """Load a small sample of the metadata for exploration.

    Args:
        sample_fraction: Fraction of metadata rows to keep (default 0.01);
            pass 1.0 or more to get the full dataset.

    Returns:
        Spark DataFrame read from '<gcs_folder>/meta_parquet'.
    """
    return _load_parquet_sample('meta_parquet', 'metadata', sample_fraction)

In [9]:
# Draw the exploration samples with each loader's default fraction
# (0.001 for reviews, 0.01 for metadata).
reviews_sample = load_reviews_sample()
meta_sample = load_meta_sample()

Loading 0.1% sample of reviews data...


                                                                                

Sample size: 64,380 records
Loading 1.0% sample of metadata...




Sample size: 43,305 records


                                                                                

In [12]:
# Uncomment one of the lines below to preview a sampled DataFrame inline
#reviews_sample
#meta_sample

In [17]:
# ---------------------------------------------------
# 3. EXPLORE DATA STRUCTURE
# ---------------------------------------------------

# Print the schema of each sampled DataFrame under its own heading.
for section_title, frame in (
    ("Reviews Data", reviews_sample),
    ("Metadata", meta_sample),
):
    print(f"\n--- {section_title} Schema ---")
    frame.printSchema()


--- Reviews Data Schema ---
root
 |-- asin: string (nullable = true)
 |-- helpful_vote: long (nullable = true)
 |-- parent_asin: string (nullable = true)
 |-- rating: double (nullable = true)
 |-- text: string (nullable = true)
 |-- timestamp: long (nullable = true)
 |-- title: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- verified_purchase: boolean (nullable = true)


--- Metadata Schema ---
root
 |-- author: struct (nullable = true)
 |    |-- about: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- avatar: string (nullable = true)
 |    |-- name: string (nullable = true)
 |-- average_rating: double (nullable = true)
 |-- bought_together: string (nullable = true)
 |-- categories: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- description: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- main_category: string (nullable = true)
 |-- parent_asin: string (nullable = true)

In [41]:
# Show the first five rows of each sampled DataFrame.
preview_targets = {
    "Sample Reviews Data": reviews_sample,
    "Sample Metadata": meta_sample,
}
for heading, frame in preview_targets.items():
    print(f"\n--- {heading} ---")
    frame.limit(5).show()


--- Sample Reviews Data ---


                                                                                

+----------+------------+-----------+------+--------------------+-------------+--------------------+--------------------+-----------------+
|      asin|helpful_vote|parent_asin|rating|                text|    timestamp|               title|             user_id|verified_purchase|
+----------+------------+-----------+------+--------------------+-------------+--------------------+--------------------+-----------------+
|B09CBNY1SB|           2| B09CBNY1SB|   3.0|[[VIDEOID:3f9ba83...|1658512339071|           It's Ok🧡|AGCFOUKPOYBQJFN4F...|            false|
|B00I3N6AM8|           0| B00I3N6AM8|   3.0|I have received t...|1461374668000|Reasonable Price ...|AFDOQTCP7SI36QPDC...|            false|
|B01KI40G3I|           0| B01LX6I5X0|   5.0|           Love it!!|1523381799799|          Five Stars|AEKIAS5INUOBSFLUH...|             true|
|B0058K1MJ0|           4| B0058K1MJ0|   5.0|I don't understan...|1621392460645|Why are they do e...|AE3G4X4NUNDMZESTV...|            false|
|B08R7P97F6|      

In [22]:
# ---------------------------------------------------
# 4. BASIC DATA EXPLORATION
# ---------------------------------------------------

print("\n--- Missing Values in Reviews ---")

# Compute the row total and every column's null count in a single Spark job.
# Counting inside the loop launched two jobs per column (a filtered count
# plus a repeated total count), each recomputing the uncached sample.
review_columns = reviews_sample.columns
null_summary = reviews_sample.agg(
    F.count(F.lit(1)).alias("total_rows"),
    *[
        F.sum(F.col(name).isNull().cast("long")).alias(f"nulls_{position}")
        for position, name in enumerate(review_columns)
    ]
).first()

# Values are read by position: index 0 is the total, then one per column.
total_count = null_summary[0]
for col, null_count in zip(review_columns, null_summary[1:]):
    null_count = null_count or 0  # SUM over zero rows yields NULL
    null_pct = null_count / total_count * 100 if total_count else 0.0  # avoid dividing by zero on an empty sample
    print(f"Column '{col}': {null_count:,} nulls ({null_pct:.2f}%)")


--- Missing Values in Reviews ---


                                                                                

Column 'asin': 0 nulls (0.00%)


                                                                                

Column 'helpful_vote': 0 nulls (0.00%)


                                                                                

Column 'parent_asin': 0 nulls (0.00%)


                                                                                

Column 'rating': 0 nulls (0.00%)


                                                                                

Column 'text': 0 nulls (0.00%)


                                                                                

Column 'timestamp': 0 nulls (0.00%)


                                                                                

Column 'title': 0 nulls (0.00%)


                                                                                

Column 'user_id': 0 nulls (0.00%)




Column 'verified_purchase': 0 nulls (0.00%)


                                                                                

In [23]:
print("\n--- Review Date Distribution ---")

# `timestamp` is divided by 1000 before from_unixtime, i.e. it is treated
# as epoch milliseconds and converted to a calendar date.
reviews_sample_with_date = reviews_sample.withColumn(
    "review_date", F.to_date(F.from_unixtime(F.col("timestamp") / 1000))
)

# Number of sampled reviews per calendar year, oldest year first.
date_dist = (
    reviews_sample_with_date
    .groupBy(F.year("review_date").alias("year"))
    .count()
    .orderBy("year")
)

date_dist.show()


--- Review Date Distribution ---




+----+-----+
|year|count|
+----+-----+
|2004|    1|
|2005|   11|
|2006|    6|
|2007|   11|
|2008|   37|
|2009|   60|
|2010|  105|
|2011|  232|
|2012|  529|
|2013| 1486|
|2014| 2545|
|2015| 4157|
|2016| 4946|
|2017| 5164|
|2018| 5840|
|2019| 8162|
|2020| 8875|
|2021| 9447|
|2022| 8938|
|2023| 3828|
+----+-----+



                                                                                

In [32]:
# ---------------------------------------------------
# 5. CREATE HELPER FUNCTIONS FOR FULL ANALYSIS
# ---------------------------------------------------

def create_intermediate_folder():
    """Check whether the intermediate GCS location already exists.

    Runs `gsutil ls` on `intermediate_bucket` and reports the outcome.
    Nothing is created here: gsutil has no `mkdir` command, and a GCS
    "folder" is only a name prefix that comes into existence when the first
    object is written under it (e.g. by the parquet writes in this notebook).

    Returns:
        None. A status message is printed in every case; when the listing
        fails, gsutil's own error text is printed too, so a missing bucket
        or a permission problem is visible instead of being reported as
        a successful creation.
    """
    try:
        # Argument list (no shell) so the URI is never interpreted by a shell.
        result = subprocess.run(
            ['gsutil', 'ls', intermediate_bucket],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,
        )
    except OSError as e:
        # Raised when the gsutil executable cannot be launched at all.
        print(f"Error checking/creating bucket: {str(e)}")
        return

    if result.returncode == 0:
        print(f"Intermediate bucket already exists: {intermediate_bucket}")
        return

    print(
        f"Intermediate location not found: {intermediate_bucket} "
        "(the prefix is created automatically by the first write)"
    )
    error_text = (result.stderr or '').strip()
    if error_text:
        print(f"gsutil reported: {error_text}")

In [35]:
def process_full_reviews_data():
    """Process the full reviews dataset with basic transformations.

    Reads '<gcs_folder>/reviews_parquet', derives `review_date`,
    `review_year` and `review_month` from `timestamp`, drops rows whose
    `asin`, `user_id` or `review_date` is null, and overwrites
    '<intermediate_bucket>/reviews_processed' with the result.

    Returns:
        Spark DataFrame read back from the saved parquet output, so later
        work starts from the stored, cleaned data rather than re-running
        the raw read and transformations.
    """
    # GCS URIs always use '/', so build the paths explicitly.
    input_path = f"{gcs_folder.rstrip('/')}/reviews_parquet"
    output_path = f"{intermediate_bucket}/reviews_processed"

    # Load full reviews dataset
    df_reviews = spark.read.parquet(input_path)

    # `timestamp` is divided by 1000 before from_unixtime, i.e. it is
    # treated as epoch milliseconds.
    review_date = F.to_date(F.from_unixtime(F.col("timestamp") / 1000))

    # Date plus year and month columns for time analysis
    df_reviews_processed = (
        df_reviews
        .withColumn("review_date", review_date)
        .withColumn("review_year", F.year("review_date"))
        .withColumn("review_month", F.month("review_date"))
    )

    # Filter out records with null essential fields
    df_reviews_clean = df_reviews_processed.filter(
        (F.col("asin").isNotNull()) &
        (F.col("user_id").isNotNull()) &
        (F.col("review_date").isNotNull())
    )

    # Save processed data
    df_reviews_clean.write.mode("overwrite").parquet(output_path)

    # Count from the saved output: calling count() on df_reviews_clean would
    # re-run the whole read/transform/filter pipeline a second time.
    df_reviews_saved = spark.read.parquet(output_path)
    print(f"Processed {df_reviews_saved.count():,} review records and saved to {output_path}")

    return df_reviews_saved

In [36]:
def process_full_meta_data():
    """Process the full metadata dataset with basic transformations.

    Reads '<gcs_folder>/meta_parquet', adds a numeric `price_clean` column
    derived from `price`, drops rows whose `parent_asin` is null, and
    overwrites '<intermediate_bucket>/meta_processed' with the result.

    Returns:
        Spark DataFrame read back from the saved parquet output, so later
        work starts from the stored, cleaned data rather than re-running
        the raw read and transformations.
    """
    # GCS URIs always use '/', so build the paths explicitly.
    input_path = f"{gcs_folder.rstrip('/')}/meta_parquet"
    output_path = f"{intermediate_bucket}/meta_processed"

    # Load full metadata
    df_meta = spark.read.parquet(input_path)

    # Basic preprocessing - strip the '$' sign from price and cast to float.
    # (Categories are left as-is; nothing is exploded here.)
    # NOTE(review): anything still non-numeric after removing '$' (for
    # example a value with a thousands separator) presumably becomes null
    # on the cast - confirm whether such prices occur in the data.
    df_meta_processed = df_meta.withColumn(
        "price_clean",
        F.regexp_replace(F.col("price"), "\\$", "").cast("float")
    )

    # Ensure parent_asin is not null
    df_meta_clean = df_meta_processed.filter(F.col("parent_asin").isNotNull())

    # Save processed data
    df_meta_clean.write.mode("overwrite").parquet(output_path)

    # Count from the saved output: calling count() on df_meta_clean would
    # re-run the whole read/transform/filter pipeline a second time.
    df_meta_saved = spark.read.parquet(output_path)
    print(f"Processed {df_meta_saved.count():,} metadata records and saved to {output_path}")

    return df_meta_saved

In [37]:
# Check the intermediate storage location before the processing steps write to it.
create_intermediate_folder()

Intermediate bucket already exists: gs://msca-bdp-students-bucket/shared_data/apavlovic/final_project


In [38]:
# Runs on the FULL reviews dataset (no sampling) and overwrites the saved 'reviews_processed' output.
reviews_processed = process_full_reviews_data()



Processed 64,679,785 review records and saved to gs://msca-bdp-students-bucket/shared_data/apavlovic/final_project/reviews_processed


                                                                                

In [39]:
# Runs on the FULL metadata dataset (no sampling) and overwrites the saved 'meta_processed' output.
meta_processed = process_full_meta_data()

                                                                                

Processed 4,320,533 metadata records and saved to gs://msca-bdp-students-bucket/shared_data/apavlovic/final_project/meta_processed


                                                                                