In [18]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, size, expr

# Initialize Spark session
spark = SparkSession.builder \
    .appName("product") \
    .getOrCreate()

# Define the JSON Lines file
product_path = 'data/Office_Products.jsonl'

# Read the JSON Lines file into a DataFrame
product_df = spark.read.json(product_path)

# Show the DataFrame (Optional)
product_df.show(truncate=False)

# Print the schema of the DataFrame (Optional)
product_df.printSchema()

# Count the total number of records
total_count = product_df.count()

# Count the number of records with at least one image URL
# Assuming 'images' is a field containing a list of image objects, each with a 'large' field
count_with_images = product_df.filter(expr("size(images) > 0")).count()

print(f"Total number of records: {total_count}")
print(f"Number of records with image URLs: {count_with_images}")

# Stop the Spark session
spark.stop()


24/07/22 16:11:25 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
                                                                                

+----------+------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

Total number of records: 12845712
Number of records with image URLs: 696549


In [20]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr

# Initialize Spark session
spark = SparkSession.builder \
    .appName("filter_and_save") \
    .getOrCreate()

# Define the JSON Lines file
product_path = 'data/Office_Products.jsonl'
output_path = 'data/Products_With_Images.json'

# Read the JSON Lines file into a DataFrame
product_df = spark.read.json(product_path)

# Filter the DataFrame to include only records with image URLs
# Assuming 'images' is a field containing a list of image objects, each with a 'large' field
filtered_df = product_df.filter(expr("size(images) > 0"))

# Show the filtered DataFrame (Optional)
filtered_df.show(truncate=False)

# Write the filtered DataFrame to a single JSON file
filtered_df.coalesce(1).write.mode('overwrite').json(output_path)

print(f"Filtered records have been saved to {output_path}")

# Stop the Spark session
spark.stop()


                                                                                

+----------+------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

Filtered records have been saved to data/Products_With_Images.json


In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, when

# Initialize Spark session
spark = SparkSession.builder \
    .appName("null_counts") \
    .getOrCreate()

# Define the path to the JSON file
json_path = 'data/products.json'

# Read the JSON file into a DataFrame
df = spark.read.json(json_path)

# Show the DataFrame schema (Optional)
df.printSchema()

# Count nulls for each column
null_counts = {col_name: df.filter(col(col_name).isNull()).count() for col_name in df.columns}

# Display null counts
for col_name, count in null_counts.items():
    print(f"Column '{col_name}' has {count} null values.")

# Stop the Spark session
spark.stop()


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/07/22 16:17:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

root
 |-- asin: string (nullable = true)
 |-- helpful_vote: long (nullable = true)
 |-- images: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- attachment_type: string (nullable = true)
 |    |    |-- large_image_url: string (nullable = true)
 |    |    |-- medium_image_url: string (nullable = true)
 |    |    |-- small_image_url: string (nullable = true)
 |-- parent_asin: string (nullable = true)
 |-- rating: double (nullable = true)
 |-- text: string (nullable = true)
 |-- timestamp: long (nullable = true)
 |-- title: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- verified_purchase: boolean (nullable = true)



                                                                                

Column 'asin' has 0 null values.
Column 'helpful_vote' has 0 null values.
Column 'images' has 0 null values.
Column 'parent_asin' has 0 null values.
Column 'rating' has 0 null values.
Column 'text' has 0 null values.
Column 'timestamp' has 0 null values.
Column 'title' has 0 null values.
Column 'user_id' has 0 null values.
Column 'verified_purchase' has 0 null values.
