In [1]:
import os
import sys

# Point this kernel at the local Spark installation and at the Python
# interpreters PySpark should launch.
# To run on Python 2 instead, set both interpreter entries to /usr/bin/python2.7.
# NOTE(review): the worker is pinned to python3.6 while the driver uses
# python3 - confirm both resolve to the same minor version on this machine.
spark_home = "/home/talentum/spark"
pylib_dir = spark_home + "/python/lib"

os.environ.update(
    SPARK_HOME=spark_home,
    PYLIB=pylib_dir,
    PYSPARK_PYTHON="/usr/bin/python3.6",
    PYSPARK_DRIVER_PYTHON="/usr/bin/python3",
)

# Put the bundled py4j and pyspark archives at the front of the import path.
# pyspark.zip is inserted last, so it ends up first on sys.path.
for archive_name in ("py4j-0.10.7-src.zip", "pyspark.zip"):
    sys.path.insert(0, pylib_dir + "/" + archive_name)

# Extra packages passed to spark-submit via --packages when the session starts.
# Add or remove Maven coordinates here as needed (e.g. use only spark-xml,
# or switch spark-avro to 2.4.0).
spark_packages = (
    "com.databricks:spark-xml_2.11:0.6.0",
    "org.apache.spark:spark-avro_2.11:2.4.3",
)
os.environ["PYSPARK_SUBMIT_ARGS"] = (
    "--packages " + ",".join(spark_packages) + " pyspark-shell"
)

In [2]:
# Import SparkSession class
# SparkSession is the entry point to use Spark SQL and DataFrame API
from pyspark.sql import SparkSession

# Import commonly used Spark SQL functions
# col   -> used to refer to a column in a DataFrame
# when  -> used for conditional logic (similar to if-else)
# count -> used for counting rows or values
# size  -> used to get the size of an array-type column
from pyspark.sql.functions import col, when, count, size


# Build the SparkSession for this notebook.
# appName sets the application name (useful for monitoring);
# getOrCreate() returns the existing session if there is one, otherwise
# it creates a new one.
spark = (
    SparkSession.builder
    .appName("AmazonAppliancesIngestionNotebook")
    .getOrCreate()
)


In [4]:
# Location of the raw Amazon appliance reviews.
# The file:/// scheme makes Spark read from the local file system.
raw_reviews_path = "file:///home/talentum/projects/AmazonReviewAnalytics/BigDataPipeline/data/raw/Appliances.json"

# Load the JSON data into a Spark DataFrame.
reviews_df = spark.read.json(raw_reviews_path)

# Show column names, data types and nullability so the data can be
# understood before it is cleaned and processed.
reviews_df.printSchema()


root
 |-- asin: string (nullable = true)
 |-- image: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- overall: double (nullable = true)
 |-- reviewText: string (nullable = true)
 |-- reviewTime: string (nullable = true)
 |-- reviewerID: string (nullable = true)
 |-- reviewerName: string (nullable = true)
 |-- style: struct (nullable = true)
 |    |-- Color:: string (nullable = true)
 |    |-- Design:: string (nullable = true)
 |    |-- Flavor:: string (nullable = true)
 |    |-- Format:: string (nullable = true)
 |    |-- Item Package Quantity:: string (nullable = true)
 |    |-- Length:: string (nullable = true)
 |    |-- Package Quantity:: string (nullable = true)
 |    |-- Package Type:: string (nullable = true)
 |    |-- Pattern:: string (nullable = true)
 |    |-- Scent:: string (nullable = true)
 |    |-- Size Name:: string (nullable = true)
 |    |-- Size:: string (nullable = true)
 |    |-- Style Name:: string (nullable = true)
 |    |-- Style:: stri

In [5]:
# Columns carried forward from the raw reviews; everything else is dropped.
required_columns = [
    "reviewerID",        # Unique ID of the reviewer
    "verified",          # Whether the purchase was verified (true/false)
    "asin",              # Amazon Standard Identification Number (product ID)
    "overall",           # Rating given by the user
    "reviewText",        # Full text of the review
    "summary",           # Short summary/title of the review
    "unixReviewTime",    # Review time as a Unix timestamp
]

# Project the raw DataFrame down to the required columns only.
reviews_df_clean = reviews_df.select(*required_columns)

# Quick visual check of the first 5 rows after the projection.
reviews_df_clean.show(5)


+--------------+--------+----------+-------+--------------------+--------------------+--------------+
|    reviewerID|verified|      asin|overall|          reviewText|             summary|unixReviewTime|
+--------------+--------+----------+-------+--------------------+--------------------+--------------+
|A3NHUQ33CFH3VM|   false|1118461304|    5.0|Not one thing in ...|Clear on what lea...|    1385510400|
|A3SK6VNBQDNBJE|   false|1118461304|    5.0|I have enjoyed Dr...|Becoming more inn...|    1383264000|
|A3SOFHUR27FO3K|   false|1118461304|    5.0|Alan Gregerman be...|The World from Di...|    1381363200|
|A1HOG1PYCAE157|   false|1118461304|    5.0|Alan Gregerman is...|Strangers are You...|    1381276800|
|A26JGAM6GZMM4V|   false|1118461304|    5.0|As I began to rea...|How and why it is...|    1378512000|
+--------------+--------+----------+-------+--------------------+--------------------+--------------+
only showing top 5 rows



In [6]:
# Build the condition that flags a "missing" value in one column.
# The rule depends on the column's data type.
def get_empty_condition(column_name, data_type):
    """Return a boolean Column that is true where `column_name` is missing.

    Args:
        column_name: Name of the DataFrame column to test.
        data_type: The column's type string as reported by
            `DataFrame.dtypes` (e.g. 'string', 'array<string>', 'double').

    Returns:
        A Column expression that is true for:
          * string columns  -> NULL or the empty string ""
          * array columns   -> NULL or an array with zero elements
          * any other type  -> NULL
    """
    column = col(column_name)
    is_null = column.isNull()

    if data_type == 'string':
        # `column == ""` evaluates to NULL (not true) for a NULL input, so
        # NULL strings must be tested explicitly or they are never counted.
        return is_null | (column == "")

    # Array type strings start with 'array' (e.g. 'array<string>').
    # A substring test would also match composite types that merely contain
    # an array, such as 'struct<x:array<string>>', and pass them to size().
    if data_type.startswith('array'):
        # size() of a NULL array is not 0 (Spark reports -1 or NULL depending
        # on version/configuration), so NULL is checked separately.
        return is_null | (size(column) == 0)

    # Remaining types (numeric, boolean, struct, ...) have no "empty" value
    # here; only NULL counts as missing.
    return is_null


# One aggregate per column, driven by reviews_df_clean.dtypes, which yields
# (column_name, data_type) pairs.
# when() without otherwise() produces NULL for rows that do not match, and
# count() ignores NULLs, so each aggregate is the number of rows flagged by
# get_empty_condition. The alias keeps the original column name.
empty_counts = [
    count(when(get_empty_condition(name, dtype), name)).alias(name)
    for name, dtype in reviews_df_clean.dtypes
]
reviews_df_clean.select(empty_counts).show()


# Total row count, for comparison against the per-column counts above.
reviews_df_clean.count()


+----------+--------+----+-------+----------+-------+--------------+
|reviewerID|verified|asin|overall|reviewText|summary|unixReviewTime|
+----------+--------+----+-------+----------+-------+--------------+
|         0|       0|   0|      0|         0|      0|             0|
+----------+--------+----+-------+----------+-------+--------------+



602777

In [9]:
# Keep only the reviews whose `verified` flag equals "true",
# i.e. reviews tied to a verified purchase.
is_verified = col("verified") == "true"
reviews_final = reviews_df_clean.where(is_verified)

# Number of rows left after filtering, to see how many verified reviews
# are available.
reviews_final.count()


563870

In [10]:
# Destination for the cleaned reviews on the local file system.
cleaned_reviews_path = "file:///home/talentum/projects/AmazonReviewAnalytics/BigDataPipeline/data/cleaned/Reviews_Appliances_Parquet"

# Persist the filtered reviews as Parquet (a columnar storage format),
# replacing any output already present at that location.
reviews_final.write.parquet(cleaned_reviews_path, mode="overwrite")
