In [1]:
# Initialization
import os
import sys

# Spark installation directory and the Python interpreters PySpark should use,
# exported as environment variables.
# Use /usr/bin/python2.7 for both interpreter entries if you want Python 2.
spark_home = "/home/talentum/spark"
pylib = spark_home + "/python/lib"
os.environ.update({
    "SPARK_HOME": spark_home,
    "PYLIB": pylib,
    "PYSPARK_PYTHON": "/usr/bin/python3.6",
    "PYSPARK_DRIVER_PYTHON": "/usr/bin/python3",
})

# Put the PySpark and Py4J archives at the front of the import path
# (pyspark.zip ends up first, py4j second).
sys.path[:0] = [pylib + "/pyspark.zip", pylib + "/py4j-0.10.7-src.zip"]

# NOTE: list every extra Spark package you need, comma-separated, after --packages.
# os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0 pyspark-shell'
# os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-avro_2.11:2.4.0 pyspark-shell'
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0,org.apache.spark:spark-avro_2.11:2.4.3 pyspark-shell'
# os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0,org.apache.spark:spark-avro_2.11:2.4.0 pyspark-shell'

In [2]:
#Entrypoint 2.x
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
# Get (or create) the Hive-enabled SparkSession used throughout this notebook.
spark = (
    SparkSession.builder
    .appName("AmazonApplicationIngestionNotebook")
    .enableHiveSupport()
    .getOrCreate()
)

# To run on YARN instead, add .master("yarn") to the builder chain, e.g.:
# spark = SparkSession.builder.appName("Spark SQL basic example").enableHiveSupport().master("yarn").getOrCreate()

# Handle to the session's underlying SparkContext.
sc = spark.sparkContext

In [3]:
# Read the raw Appliances product metadata (JSON) from the local filesystem.
raw_metadata_path = "file:///home/talentum/projects/AmazonReviewAnalytics/BigdataPipeline/data/raw/meta_Appliances.json"
metadata_df = spark.read.json(raw_metadata_path)

In [4]:
# Print the schema of the raw metadata DataFrame (no explicit schema was passed to the reader).
metadata_df.printSchema()

root
 |-- also_buy: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- also_view: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- asin: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- category: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- date: string (nullable = true)
 |-- description: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- details: struct (nullable = true)
 |    |-- 
    Item Weight: 
    : string (nullable = true)
 |    |-- 
    Product Dimensions: 
    : string (nullable = true)
 |    |-- ASIN:: string (nullable = true)
 |    |-- ASIN: : string (nullable = true)
 |    |-- Batteries: string (nullable = true)
 |    |-- Domestic Shipping: : string (nullable = true)
 |    |-- International Shipping: : string (nullable = true)
 |    |-- Item model number:: string (nullable = true)
 |    |-- Publisher:: string (nullable = true)
 |    |-- Shipping

In [5]:
# Project the five product attributes this notebook works with, then preview 10 rows.
required_columns = ["asin", "title", "brand", "category", "price"]
metadata_required = metadata_df.select(required_columns)
metadata_required.show(10)

+----------+--------------------+--------------+--------------------+--------------------+
|      asin|               title|         brand|            category|               price|
+----------+--------------------+--------------+--------------------+--------------------+
|7301113188|Tupperware Freeze...|    Tupperware|[Appliances, Refr...|                    |
|7861850250|2 X Tupperware Pu...|    Tupperware|[Appliances, Refr...|               $3.62|
|8792559360|The Cigar - Momen...|The Cigar Book|[Appliances, Part...|             $150.26|
|9792954481|Caraselle 2X 50G ...|     Caraselle|[Appliances, Part...|.a-box-inner{back...|
|B00002N5EL|EATON Wiring 39CH...|  EATON Wiring|[Appliances, Part...|               $3.43|
|B00002N7HY|Leviton 5050 B01-...|       Leviton|[Appliances, Part...|               $6.98|
|B00002N7IL|Leviton 5206 50 A...|       Leviton|[Appliances, Part...|                    |
|B00002N9OE|Amerock BP5322-BJ...|       Amerock|[Appliances, Part...|               $7.68|

In [6]:
from pyspark.sql.functions import col, when, count, size

def get_empty_condition(column_name, data_type):
    """Build a boolean Column that flags missing values in ``column_name``.

    Args:
        column_name: Name of the DataFrame column to test.
        data_type: Type string of that column, as listed in
            ``DataFrame.dtypes`` (e.g. ``'string'`` or ``'array<string>'``).

    Returns:
        A Column expression that is true when the value is null, an empty
        string (string columns) or a zero-length array (array columns).
        Columns of any other type are flagged only when null.
    """
    column = col(column_name)
    is_null = column.isNull()
    if data_type == 'string':
        # Comparing a null with "" does not yield true, so nulls need their own check.
        return (column == "") | is_null
    # Match on the prefix: a substring test would also catch structs/maps that
    # merely contain an array somewhere inside their type string.
    if data_type.startswith('array'):
        return (size(column) == 0) | is_null
    # Other types (int, float, struct, ...) cannot be "empty", only null.
    return is_null

# For every column of metadata_required, count the rows flagged by get_empty_condition,
# then print the total number of rows for comparison.
empty_counts = [
    count(when(get_empty_condition(name, dtype), name)).alias(name)
    for name, dtype in metadata_required.dtypes
]
metadata_required.select(empty_counts).show()
print(metadata_required.count())

+----+-----+-----+--------+-----+
|asin|title|brand|category|price|
+----+-----+-----+--------+-----+
|   0|    0|  584|     806|10292|
+----+-----+-----+--------+-----+

30445


In [7]:
# We drop the price column because many of its values are null, empty strings, etc.

In [8]:
# Remove the price column; the result is bound to the same name, metadata_required.
metadata_required = metadata_required.drop("price")

In [9]:
# Preview the first 10 rows of metadata_required.
metadata_required.show(10)

+----------+--------------------+--------------+--------------------+
|      asin|               title|         brand|            category|
+----------+--------------------+--------------+--------------------+
|7301113188|Tupperware Freeze...|    Tupperware|[Appliances, Refr...|
|7861850250|2 X Tupperware Pu...|    Tupperware|[Appliances, Refr...|
|8792559360|The Cigar - Momen...|The Cigar Book|[Appliances, Part...|
|9792954481|Caraselle 2X 50G ...|     Caraselle|[Appliances, Part...|
|B00002N5EL|EATON Wiring 39CH...|  EATON Wiring|[Appliances, Part...|
|B00002N7HY|Leviton 5050 B01-...|       Leviton|[Appliances, Part...|
|B00002N7IL|Leviton 5206 50 A...|       Leviton|[Appliances, Part...|
|B00002N9OE|Amerock BP5322-BJ...|       Amerock|[Appliances, Part...|
|B00002NARC|Leviton 5207 125/...|       Leviton|[Appliances, Part...|
|B00004SQHD|Coleman Cable 090...| Coleman Cable|[Appliances, Part...|
+----------+--------------------+--------------+--------------------+
only showing top 10 

In [10]:
from pyspark.sql.functions import col, when, count, size

def get_empty_condition(column_name, data_type):
    """Build a boolean Column that flags missing values in ``column_name``.

    Args:
        column_name: Name of the DataFrame column to test.
        data_type: Type string of that column, as listed in
            ``DataFrame.dtypes`` (e.g. ``'string'`` or ``'array<string>'``).

    Returns:
        A Column expression that is true when the value is null, an empty
        string (string columns) or a zero-length array (array columns).
        Columns of any other type are flagged only when null.
    """
    column = col(column_name)
    is_null = column.isNull()
    if data_type == 'string':
        # Comparing a null with "" does not yield true, so nulls need their own check.
        return (column == "") | is_null
    # Match on the prefix: a substring test would also catch structs/maps that
    # merely contain an array somewhere inside their type string.
    if data_type.startswith('array'):
        return (size(column) == 0) | is_null
    # Other types (int, float, struct, ...) cannot be "empty", only null.
    return is_null

# For every remaining column of metadata_required, count the rows flagged by get_empty_condition.
empty_counts = [
    count(when(get_empty_condition(name, dtype), name)).alias(name)
    for name, dtype in metadata_required.dtypes
]
metadata_required.select(empty_counts).show()
# Only a column has been dropped so far; no rows have been filtered out yet,
# so this is not a "cleaned" row count and is labelled accordingly.
print("Row count after dropping price:", metadata_required.count())

+----+-----+-----+--------+
|asin|title|brand|category|
+----+-----+-----+--------+
|   0|    0|  584|     806|
+----+-----+-----+--------+

Cleaned row count : 30445


In [11]:
from pyspark.sql.functions import col, when, count, size
from functools import reduce
from pyspark.sql import DataFrame

def get_empty_condition(column_name, data_type):
    """Build a boolean Column that flags missing values in ``column_name``.

    Args:
        column_name: Name of the DataFrame column to test.
        data_type: Type string of that column, as listed in
            ``DataFrame.dtypes`` (e.g. ``'string'`` or ``'array<string>'``).

    Returns:
        A Column expression that is true when the value is null, an empty
        string (string columns) or a zero-length array (array columns).
        Columns of any other type are flagged only when null.
    """
    column = col(column_name)
    is_null = column.isNull()
    if data_type == 'string':
        return (column == "") | is_null
    # Match on the prefix: a substring test would also catch structs/maps that
    # merely contain an array somewhere inside their type string.
    if data_type.startswith('array'):
        return (size(column) == 0) | is_null
    return is_null

# A row is rejected when at least one of its columns is flagged by get_empty_condition.
per_column_flags = [
    get_empty_condition(name, dtype) for name, dtype in metadata_required.dtypes
]
invalid_row_condition = reduce(lambda left, right: left | right, per_column_flags)

# Keep only the rows for which the combined condition does not hold.
metadata_cleaned = metadata_required.filter(~invalid_row_condition)

# Preview the filtered data.
metadata_cleaned.show(10)

# Re-count flagged values per column on the filtered data as a sanity check.
remaining_flag_counts = [
    count(when(get_empty_condition(name, dtype), name)).alias(name)
    for name, dtype in metadata_required.dtypes
]
metadata_cleaned.select(remaining_flag_counts).show()

# Row counts before and after filtering.
print("Original row count:", metadata_required.count())
print("Cleaned row count :", metadata_cleaned.count())

+----------+--------------------+--------------+--------------------+
|      asin|               title|         brand|            category|
+----------+--------------------+--------------+--------------------+
|7301113188|Tupperware Freeze...|    Tupperware|[Appliances, Refr...|
|7861850250|2 X Tupperware Pu...|    Tupperware|[Appliances, Refr...|
|8792559360|The Cigar - Momen...|The Cigar Book|[Appliances, Part...|
|9792954481|Caraselle 2X 50G ...|     Caraselle|[Appliances, Part...|
|B00002N5EL|EATON Wiring 39CH...|  EATON Wiring|[Appliances, Part...|
|B00002N7HY|Leviton 5050 B01-...|       Leviton|[Appliances, Part...|
|B00002N7IL|Leviton 5206 50 A...|       Leviton|[Appliances, Part...|
|B00002N9OE|Amerock BP5322-BJ...|       Amerock|[Appliances, Part...|
|B00002NARC|Leviton 5207 125/...|       Leviton|[Appliances, Part...|
|B00004SQHD|Coleman Cable 090...| Coleman Cable|[Appliances, Part...|
+----------+--------------------+--------------+--------------------+
only showing top 10 

In [12]:
# Bind the cleaned DataFrame to the name used by the Parquet export cell, then preview it.
metadata_Final = metadata_cleaned
metadata_Final.show()

+----------+--------------------+--------------+--------------------+
|      asin|               title|         brand|            category|
+----------+--------------------+--------------+--------------------+
|7301113188|Tupperware Freeze...|    Tupperware|[Appliances, Refr...|
|7861850250|2 X Tupperware Pu...|    Tupperware|[Appliances, Refr...|
|8792559360|The Cigar - Momen...|The Cigar Book|[Appliances, Part...|
|9792954481|Caraselle 2X 50G ...|     Caraselle|[Appliances, Part...|
|B00002N5EL|EATON Wiring 39CH...|  EATON Wiring|[Appliances, Part...|
|B00002N7HY|Leviton 5050 B01-...|       Leviton|[Appliances, Part...|
|B00002N7IL|Leviton 5206 50 A...|       Leviton|[Appliances, Part...|
|B00002N9OE|Amerock BP5322-BJ...|       Amerock|[Appliances, Part...|
|B00002NARC|Leviton 5207 125/...|       Leviton|[Appliances, Part...|
|B00004SQHD|Coleman Cable 090...| Coleman Cable|[Appliances, Part...|
|B00004SQHH|Coleman Cable 091...| Coleman Cable|[Appliances, Part...|
|B00004X18E|Electric

In [15]:
# Write the final DataFrame as Parquet to the local filesystem, overwriting any existing output.
cleaned_output_path = "file:///home/talentum/projects/AmazonReviewAnalytics/BigdataPipeline/data/cleaned/meta_Appliances_parquet"
(
    metadata_Final.write
    .mode("overwrite")
    .parquet(cleaned_output_path)
)