In [40]:
spark

In [41]:
# Set the logging level for ERRORs only.
sc.setLogLevel("ERROR")

In [42]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType

schema = StructType([
    StructField("asin", StringType(), nullable=True),
    StructField("authors", ArrayType(
        StructType([
            StructField("author_id", IntegerType(), nullable=True),
            StructField("role", StringType(), nullable=True)
        ])
    ), nullable=True),
    StructField("average_rating", StringType(), nullable=True),
    StructField("book_id", StringType(), nullable=True),
    StructField("country_code", StringType(), nullable=True),
    StructField("description", StringType(), nullable=True),
    StructField("edition_information", StringType(), nullable=True),
    StructField("format", StringType(), nullable=True),
    StructField("image_url", StringType(), nullable=True),
    StructField("is_ebook", StringType(), nullable=True),
    StructField("isbn", StringType(), nullable=True),
    StructField("isbn13", StringType(), nullable=True),
    StructField("kindle_asin", StringType(), nullable=True),
    StructField("language_code", StringType(), nullable=True),
    StructField("link", StringType(), nullable=True),
    StructField("num_pages", StringType(), nullable=True),
    StructField("popular_shelves", ArrayType(
        StructType([
            StructField("count", IntegerType(), nullable=True),
            StructField("name", StringType(), nullable=True)
        ])
    ), nullable=True),
    StructField("publication_day", StringType(), nullable=True),
    StructField("publication_month", StringType(), nullable=True),
    StructField("publication_year", StringType(), nullable=True),
    StructField("publisher", StringType(), nullable=True),
    StructField("ratings_count", StringType(), nullable=True),
    StructField("series", ArrayType(StringType()), nullable=True),
    StructField("similar_books", ArrayType(StringType()), nullable=True),
    StructField("text_reviews_count", StringType(), nullable=True),
    StructField("title", StringType(), nullable=True),
    StructField("title_without_series", StringType(), nullable=True),
    StructField("url", StringType(), nullable=True),
    StructField("work_id", StringType(), nullable=True)
])

goodreads_books_path = 'gs://my-bucket-apb/landing/goodreads_books.json'
goodreads_books_sdf = spark.read.json(goodreads_books_path, schema=schema)


In [43]:
goodreads_books_sdf.printSchema()

root
 |-- asin: string (nullable = true)
 |-- authors: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- author_id: integer (nullable = true)
 |    |    |-- role: string (nullable = true)
 |-- average_rating: string (nullable = true)
 |-- book_id: string (nullable = true)
 |-- country_code: string (nullable = true)
 |-- description: string (nullable = true)
 |-- edition_information: string (nullable = true)
 |-- format: string (nullable = true)
 |-- image_url: string (nullable = true)
 |-- is_ebook: string (nullable = true)
 |-- isbn: string (nullable = true)
 |-- isbn13: string (nullable = true)
 |-- kindle_asin: string (nullable = true)
 |-- language_code: string (nullable = true)
 |-- link: string (nullable = true)
 |-- num_pages: string (nullable = true)
 |-- popular_shelves: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- count: integer (nullable = true)
 |    |    |-- name: string (nullable = true)
 |-- p

In [44]:
spark.conf.set("spark.sql.debug.maxToStringFields", 1000)  # Adjust the value as needed

In [45]:
goodreads_books_sdf.show(10)

+----------+--------------------+--------------+--------+------------+--------------------+-------------------+---------+--------------------+--------+----------+-------------+-----------+-------------+--------------------+---------+--------------------+---------------+-----------------+----------------+--------------------+-------------+--------+--------------------+------------------+--------------------+--------------------+--------------------+--------+
|      asin|             authors|average_rating| book_id|country_code|         description|edition_information|   format|           image_url|is_ebook|      isbn|       isbn13|kindle_asin|language_code|                link|num_pages|     popular_shelves|publication_day|publication_month|publication_year|           publisher|ratings_count|  series|       similar_books|text_reviews_count|               title|title_without_series|                 url| work_id|
+----------+--------------------+--------------+--------+------------+------

In [46]:
from pyspark.sql.functions import concat, col, lit

# Create a new column called publication date by concatenating the information stored in year, month, day.  
goodreads_books_sdf = goodreads_books_sdf.withColumn(
    "publication_date",
    concat(
        col("publication_year"),
        lit("-"),
        col("publication_month"),
        lit("-"),
        col("publication_day")
    )
)

# Drop the publication_day, publication_month, and publication_year columns
goodreads_books_sdf = goodreads_books_sdf.drop("publication_day", "publication_month", "publication_year")


In [47]:
goodreads_books_sdf.printSchema()

root
 |-- asin: string (nullable = true)
 |-- authors: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- author_id: integer (nullable = true)
 |    |    |-- role: string (nullable = true)
 |-- average_rating: string (nullable = true)
 |-- book_id: string (nullable = true)
 |-- country_code: string (nullable = true)
 |-- description: string (nullable = true)
 |-- edition_information: string (nullable = true)
 |-- format: string (nullable = true)
 |-- image_url: string (nullable = true)
 |-- is_ebook: string (nullable = true)
 |-- isbn: string (nullable = true)
 |-- isbn13: string (nullable = true)
 |-- kindle_asin: string (nullable = true)
 |-- language_code: string (nullable = true)
 |-- link: string (nullable = true)
 |-- num_pages: string (nullable = true)
 |-- popular_shelves: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- count: integer (nullable = true)
 |    |    |-- name: string (nullable = true)
 |-- p

In [48]:
# Show the new column and some of its entries
goodreads_books_sdf.select("publication_date").show(5, truncate=False)

+----------------+
|publication_date|
+----------------+
|1984-9-1        |
|2001-10-1       |
|1987--          |
|2009-7-14       |
|--              |
+----------------+
only showing top 5 rows



In [49]:
#Dropping unnecessary columns
goodreads_books_sdf = goodreads_books_sdf.drop("isbn13", "title_without_series", "work_id","image_url","link","url","asin", "country_code","edition_information","publisher","similar_books","is_ebook","isbn","kindle_asin","series")

In [50]:
goodreads_books_sdf.printSchema()

root
 |-- authors: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- author_id: integer (nullable = true)
 |    |    |-- role: string (nullable = true)
 |-- average_rating: string (nullable = true)
 |-- book_id: string (nullable = true)
 |-- description: string (nullable = true)
 |-- format: string (nullable = true)
 |-- language_code: string (nullable = true)
 |-- num_pages: string (nullable = true)
 |-- popular_shelves: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- count: integer (nullable = true)
 |    |    |-- name: string (nullable = true)
 |-- ratings_count: string (nullable = true)
 |-- text_reviews_count: string (nullable = true)
 |-- title: string (nullable = true)
 |-- publication_date: string (nullable = true)



In [41]:
from pyspark.sql.functions import col, explode

# Explode the authors array to separate rows for each author
exploded_authors = goodreads_books_sdf.withColumn("author", explode("authors"))

# Select distinct author names
distinct_authors = exploded_authors.select("author.author_id", "author.role").distinct()

# Show distinct author names
distinct_authors.show(20,truncate=False)




+---------+---------------------------+
|author_id|role                       |
+---------+---------------------------+
|NULL     |guion                      |
|NULL     |Traduction                 |
|NULL     |ldktwr                     |
|NULL     |muHwr                      |
|NULL     |as Alex Dumas              |
|NULL     |introduction               |
|NULL     |Co-Author                  |
|NULL     |Contriutor                 |
|NULL     |Prologue                   |
|NULL     |Editor, Introduction, Notes|
|NULL     |co-author                  |
|NULL     |adapte du roman de         |
|NULL     |-Illustrator               |
|NULL     |Foreword, Photographer     |
|NULL     |Arranged by                |
|NULL     |Colourist                  |
|NULL     |Art                        |
|NULL     |Narrador                   |
|NULL     |Curator                    |
|NULL     |Colored                    |
+---------+---------------------------+
only showing top 20 rows



                                                                                

In [51]:
# Drop records where the specified columns are empty (null or nan)
goodreads_books_sdf = goodreads_books_sdf.na.drop(subset=["authors.author_id"])

In [52]:
goodreads_books_sdf.show(10)

+--------------------+--------------+--------+--------------------+---------+-------------+---------+--------------------+-------------+------------------+--------------------+----------------+
|             authors|average_rating| book_id|         description|   format|language_code|num_pages|     popular_shelves|ratings_count|text_reviews_count|               title|publication_date|
+--------------------+--------------+--------+--------------------+---------+-------------+---------+--------------------+-------------+------------------+--------------------+----------------+
|          [{NULL, }]|          4.00| 5333265|                    |Paperback|             |      256|[{NULL, to-read},...|            3|                 1|W.C. Fields: A Li...|        1984-9-1|
|          [{NULL, }]|          3.23| 1333909|Anita Diamant's i...| Audio CD|             |         |[{NULL, to-read},...|           10|                 6|         Good Harbor|       2001-10-1|
|          [{NULL, }]|        

In [15]:
# Count the number of null values in the authors.author_id column
null_count = goodreads_books_sdf.where(col("authors.author_id").isNull()).count()

# Print the number of null values
print("Number of null values in authors.author_id:", null_count)



Number of null values in authors.author_id: 0


                                                                                

In [53]:
from pyspark.sql.functions import col, to_date

# Filter out rows with invalid publication dates
cleaned_goodreads_books_sdf = goodreads_books_sdf.filter(
    col("publication_date").rlike("^\\d{4}-\\d{1,2}-\\d{1,2}$")
)

# Update the original DataFrame with the cleaned data
goodreads_books_sdf = cleaned_goodreads_books_sdf

In [13]:
from pyspark.sql.functions import col

# Find distinct values in the num_pages column with a limit of 20
distinct_num_pages = goodreads_books_sdf.select("num_pages").distinct().limit(20)

distinct_num_pages.show()



+---------+
|num_pages|
+---------+
|      296|
|      451|
|     1280|
|        7|
|      475|
|      307|
|      383|
|     1008|
|      700|
|      886|
|      154|
|      714|
|      428|
|      854|
|      422|
|      595|
|     1856|
|      323|
|      424|
|      586|
+---------+



[Stage 9:>                                                          (0 + 1) / 1]                                                                                

In [54]:
# Filter out rows with missing or null values in the 'format' column
goodreads_books_sdf = goodreads_books_sdf.filter(goodreads_books_sdf["format"].isNotNull() & (goodreads_books_sdf["format"] != ""))


In [46]:
from pyspark.sql.functions import col

# Count the number of null or empty values in the 'format' column
null_format_count = goodreads_books_sdf.filter(col("format").isNull() | (col("format") == "")).count()

# Show the count of null or empty values in the 'format' column
print("Number of null or empty values in the 'format' column:", null_format_count)



Number of null or empty values in the 'format' column: 0


                                                                                

In [55]:
from pyspark.sql.types import IntegerType

# Convert the num_pages column to an integer type
goodreads_books_sdf = goodreads_books_sdf.withColumn("num_pages", goodreads_books_sdf["num_pages"].cast(IntegerType()))


In [56]:
goodreads_books_sdf = goodreads_books_sdf.drop("language_code")

In [57]:
# Remove rows with no title column
goodreads_books_sdf = goodreads_books_sdf.filter(col("title").isNotNull())

# Count the number of null values in the title column
null_titles_count = goodreads_books_sdf.filter(col("title").isNull()).count()

print("Number of null values in title column:", null_titles_count)




Number of null values in title column: 0


                                                                                

In [58]:
goodreads_books_sdf.show(10)

+--------------------+--------------+--------+--------------------+---------+---------+--------------------+-------------+------------------+--------------------+----------------+
|             authors|average_rating| book_id|         description|   format|num_pages|     popular_shelves|ratings_count|text_reviews_count|               title|publication_date|
+--------------------+--------------+--------+--------------------+---------+---------+--------------------+-------------+------------------+--------------------+----------------+
|          [{NULL, }]|          4.00| 5333265|                    |Paperback|      256|[{NULL, to-read},...|            3|                 1|W.C. Fields: A Li...|        1984-9-1|
|          [{NULL, }]|          3.23| 1333909|Anita Diamant's i...| Audio CD|     NULL|[{NULL, to-read},...|           10|                 6|         Good Harbor|       2001-10-1|
|          [{NULL, }]|          3.49| 6066819|Addie Downs and V...|Hardcover|      368|[{NULL, to-re

In [59]:
from pyspark.sql.functions import col, trim

# Remove leading and trailing spaces from the description column
cleaned_description_df = goodreads_books_sdf.withColumn("description", trim(col("description")))

# Update the DataFrame
goodreads_books_sdf = cleaned_description_df

In [60]:
from pyspark.sql.functions import col

# Convert columns to integer type
goodreads_books_sdf = goodreads_books_sdf.withColumn("average_rating", col("average_rating").cast("float")) \
    .withColumn("book_id", col("book_id").cast("integer")) \
    .withColumn("ratings_count", col("ratings_count").cast("integer")) \
    .withColumn("text_reviews_count", col("text_reviews_count").cast("integer"))

#check to see if change is reflected
goodreads_books_sdf.printSchema()

root
 |-- authors: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- author_id: integer (nullable = true)
 |    |    |-- role: string (nullable = true)
 |-- average_rating: float (nullable = true)
 |-- book_id: integer (nullable = true)
 |-- description: string (nullable = true)
 |-- format: string (nullable = true)
 |-- num_pages: integer (nullable = true)
 |-- popular_shelves: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- count: integer (nullable = true)
 |    |    |-- name: string (nullable = true)
 |-- ratings_count: integer (nullable = true)
 |-- text_reviews_count: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- publication_date: string (nullable = true)



In [61]:
goodreads_books_sdf.show(10)

+--------------------+--------------+--------+--------------------+---------+---------+--------------------+-------------+------------------+--------------------+----------------+
|             authors|average_rating| book_id|         description|   format|num_pages|     popular_shelves|ratings_count|text_reviews_count|               title|publication_date|
+--------------------+--------------+--------+--------------------+---------+---------+--------------------+-------------+------------------+--------------------+----------------+
|          [{NULL, }]|           4.0| 5333265|                    |Paperback|      256|[{NULL, to-read},...|            3|                 1|W.C. Fields: A Li...|        1984-9-1|
|          [{NULL, }]|          3.23| 1333909|Anita Diamant's i...| Audio CD|     NULL|[{NULL, to-read},...|           10|                 6|         Good Harbor|       2001-10-1|
|          [{NULL, }]|          3.49| 6066819|Addie Downs and V...|Hardcover|      368|[{NULL, to-re

[Stage 30:>                                                         (0 + 1) / 1]                                                                                

In [62]:
from pyspark.sql.functions import col

# Replace null values in num_pages column with 0
goodreads_books_sdf = goodreads_books_sdf.fillna({'num_pages': 0})


In [63]:
goodreads_books_sdf.show(10)

+--------------------+--------------+--------+--------------------+---------+---------+--------------------+-------------+------------------+--------------------+----------------+
|             authors|average_rating| book_id|         description|   format|num_pages|     popular_shelves|ratings_count|text_reviews_count|               title|publication_date|
+--------------------+--------------+--------+--------------------+---------+---------+--------------------+-------------+------------------+--------------------+----------------+
|          [{NULL, }]|           4.0| 5333265|                    |Paperback|      256|[{NULL, to-read},...|            3|                 1|W.C. Fields: A Li...|        1984-9-1|
|          [{NULL, }]|          3.23| 1333909|Anita Diamant's i...| Audio CD|        0|[{NULL, to-read},...|           10|                 6|         Good Harbor|       2001-10-1|
|          [{NULL, }]|          3.49| 6066819|Addie Downs and V...|Hardcover|      368|[{NULL, to-re

In [64]:
# Count the number of null and empty values in num_pages column
null_empty_count = goodreads_books_sdf.filter((col("num_pages").isNull()) | (col("num_pages") == "")).count()

print("Number of null or empty values in num_pages column:", null_empty_count)

Number of null or empty values in num_pages column: 0


In [65]:
import pandas as pd
from google.cloud import storage

client = storage.Client()

x = 1000000  # Number of reviews to process

file_path = "gs://my-bucket-apb/landing/goodreads_reviews_dedup.json"
print("Reading from", file_path)

# Read JSON file in chunks
chunks = pd.read_json(file_path, orient="records", lines=True, chunksize=x)

# Initialize DataFrame and counter
df = pd.DataFrame()
chunk_count = 0

# Loop through chunks (should be only one chunk with 1 million reviews)
for chunk in chunks:
    df = pd.concat([df, chunk], ignore_index=True)
    chunk_count += 1  

    # Check if the desired number of chunks has been processed (should be only one chunk)
    if chunk_count >= 1:
        break  

# Display the DataFrame
print(df)

Reading from gs://my-bucket-apb/landing/goodreads_reviews_dedup.json
                                 user_id   book_id  \
0       8842281e1d1347389f2ab93d60773d4d  24375664   
1       8842281e1d1347389f2ab93d60773d4d  18245960   
2       8842281e1d1347389f2ab93d60773d4d   6392944   
3       8842281e1d1347389f2ab93d60773d4d  22078596   
4       8842281e1d1347389f2ab93d60773d4d   6644782   
...                                  ...       ...   
999995  f131126e97b09f87010f4d419391ee9f  13331204   
999996  f131126e97b09f87010f4d419391ee9f  15843480   
999997  f131126e97b09f87010f4d419391ee9f  14291982   
999998  f131126e97b09f87010f4d419391ee9f  13093165   
999999  f131126e97b09f87010f4d419391ee9f  12975883   

                               review_id  rating  \
0       5cd416f3efc3f944fce4ce2db2290d5e       5   
1       dfdbb7b0eb5a7e4c26d59a937e2e5feb       5   
2       5e212a62bced17b4dbe41150e5bb9037       3   
3       fdd13cad0695656be99828cd75d6eb73       4   
4       bd0df91c9d918c

In [66]:
sdf_reviews = spark.createDataFrame(df)

sdf_reviews.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- book_id: long (nullable = true)
 |-- review_id: string (nullable = true)
 |-- rating: long (nullable = true)
 |-- review_text: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- date_updated: string (nullable = true)
 |-- read_at: string (nullable = true)
 |-- started_at: string (nullable = true)
 |-- n_votes: long (nullable = true)
 |-- n_comments: long (nullable = true)



In [67]:
from pyspark.sql.functions import col

# Convert book_id to integer
sdf_reviews = sdf_reviews.withColumn("book_id", sdf_reviews["book_id"].cast("integer"))

# Convert rating to integer
sdf_reviews = sdf_reviews.withColumn("rating", sdf_reviews["rating"].cast("integer"))

sdf_reviews = sdf_reviews.withColumn("n_votes", sdf_reviews["n_votes"].cast("integer"))

sdf_reviews = sdf_reviews.withColumn("n_comments", sdf_reviews["n_comments"].cast("integer"))


# Print the updated schema to confirm the changes
sdf_reviews.printSchema()


root
 |-- user_id: string (nullable = true)
 |-- book_id: integer (nullable = true)
 |-- review_id: string (nullable = true)
 |-- rating: integer (nullable = true)
 |-- review_text: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- date_updated: string (nullable = true)
 |-- read_at: string (nullable = true)
 |-- started_at: string (nullable = true)
 |-- n_votes: integer (nullable = true)
 |-- n_comments: integer (nullable = true)



In [68]:
from pyspark.sql.functions import col

# Drop rows with empty book_id in sdf_reviews
sdf_reviews_formatted = sdf_reviews.na.drop(subset=["book_id"])


In [72]:
from pyspark.sql.functions import length, col

# Add a new column 'review_text_length' with the length of review_text
sdf_reviews_formatted = sdf_reviews_formatted.withColumn('review_text_length', length(col('review_text')))

# Select and display only the 'review_text' and 'review_text_length' columns
sdf_reviews_formatted.select('review_text', 'review_text_length').show(5)


24/04/24 19:19:59 WARN TaskSetManager: Stage 35 contains a task of very large size (222887 KiB). The maximum recommended task size is 1000 KiB.
[Stage 35:>                                                         (0 + 1) / 1]

+--------------------+------------------+
|         review_text|review_text_length|
+--------------------+------------------+
|Mind blowingly co...|               968|
|This is a special...|              2086|
|I haven't read a ...|               474|
|Fun, fast paced, ...|               962|
|A fun book that g...|               420|
+--------------------+------------------+
only showing top 5 rows



                                                                                

In [73]:
sdf_reviews_formatted.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- book_id: integer (nullable = true)
 |-- review_id: string (nullable = true)
 |-- rating: integer (nullable = true)
 |-- review_text: string (nullable = true)
 |-- date_updated: string (nullable = true)
 |-- n_votes: integer (nullable = true)
 |-- n_comments: integer (nullable = true)
 |-- date_added_formatted: string (nullable = true)
 |-- read_at_formatted: string (nullable = true)
 |-- started_at_formatted: string (nullable = true)
 |-- review_text_length: integer (nullable = true)



In [75]:
from pyspark.sql.functions import col, to_date, date_format

# Define a function to parse and format the date strings
def format_date_string(date_str):
    return date_format(to_date(date_str, "EEE MMM dd HH:mm:ss Z yyyy"), "yy-MM-dd")

# Apply the function using withColumn to create new formatted columns
sdf_reviews_formatted = sdf_reviews.withColumn(
    "date_added_formatted",
    format_date_string("date_added")
).withColumn(
    "read_at_formatted",
    format_date_string("read_at")
).withColumn(
    "started_at_formatted",
    format_date_string("started_at")
)

# Optionally drop the original columns if needed
sdf_reviews_formatted = sdf_reviews_formatted.drop("date_added", "read_at", "started_at")

from pyspark.sql.functions import length, col

# Add a new column 'review_text_length' with the length of review_text
sdf_reviews_formatted = sdf_reviews_formatted.withColumn('review_text_length', length(col('review_text')))

# Select and display only the 'review_text' and 'review_text_length' columns
sdf_reviews_formatted.select('review_text', 'review_text_length').show(5)



# Show the DataFrame schema to confirm the changes
sdf_reviews_formatted.printSchema()



24/04/24 19:20:36 WARN TaskSetManager: Stage 36 contains a task of very large size (222887 KiB). The maximum recommended task size is 1000 KiB.
[Stage 36:>                                                         (0 + 1) / 1]

+--------------------+------------------+
|         review_text|review_text_length|
+--------------------+------------------+
|Mind blowingly co...|               968|
|This is a special...|              2086|
|I haven't read a ...|               474|
|Fun, fast paced, ...|               962|
|A fun book that g...|               420|
+--------------------+------------------+
only showing top 5 rows

root
 |-- user_id: string (nullable = true)
 |-- book_id: integer (nullable = true)
 |-- review_id: string (nullable = true)
 |-- rating: integer (nullable = true)
 |-- review_text: string (nullable = true)
 |-- date_updated: string (nullable = true)
 |-- n_votes: integer (nullable = true)
 |-- n_comments: integer (nullable = true)
 |-- date_added_formatted: string (nullable = true)
 |-- read_at_formatted: string (nullable = true)
 |-- started_at_formatted: string (nullable = true)
 |-- review_text_length: integer (nullable = true)



                                                                                

In [76]:
# Drop the 'review_id' column from the DataFrame
sdf_reviews_formatted = sdf_reviews_formatted.drop('review_id')

# Show the DataFrame schema to confirm the changes
sdf_reviews_formatted.printSchema()


root
 |-- user_id: string (nullable = true)
 |-- book_id: integer (nullable = true)
 |-- rating: integer (nullable = true)
 |-- review_text: string (nullable = true)
 |-- date_updated: string (nullable = true)
 |-- n_votes: integer (nullable = true)
 |-- n_comments: integer (nullable = true)
 |-- date_added_formatted: string (nullable = true)
 |-- read_at_formatted: string (nullable = true)
 |-- started_at_formatted: string (nullable = true)
 |-- review_text_length: integer (nullable = true)



In [77]:
# Set the configuration for datetime pattern recognition
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")

In [78]:
sdf_reviews_formatted.show(10)

24/04/24 19:21:02 WARN TaskSetManager: Stage 37 contains a task of very large size (222887 KiB). The maximum recommended task size is 1000 KiB.
[Stage 37:>                                                         (0 + 1) / 1]

+--------------------+--------+------+--------------------+--------------------+-------+----------+--------------------+-----------------+--------------------+------------------+
|             user_id| book_id|rating|         review_text|        date_updated|n_votes|n_comments|date_added_formatted|read_at_formatted|started_at_formatted|review_text_length|
+--------------------+--------+------+--------------------+--------------------+-------+----------+--------------------+-----------------+--------------------+------------------+
|8842281e1d1347389...|24375664|     5|Mind blowingly co...|Mon Oct 09 08:55:...|     16|         0|            17-08-25|         17-10-07|            17-08-26|               968|
|8842281e1d1347389...|18245960|     5|This is a special...|Wed Aug 30 00:00:...|     28|         1|            17-07-30|         17-08-26|            17-08-15|              2086|
|8842281e1d1347389...| 6392944|     3|I haven't read a ...|Sun Jul 30 09:28:...|      6|         0|      

                                                                                

In [79]:
# Assuming 'book_id' is the common column in both DataFrames
joined_df = goodreads_books_sdf.join(
    sdf_reviews_formatted,
    on=['book_id'],  # Specify the join condition here
    how='inner'  # Use the appropriate join type
)

# Show the first few rows to verify the merge
joined_df.show(5)

24/04/24 19:21:22 WARN TaskSetManager: Stage 39 contains a task of very large size (222887 KiB). The maximum recommended task size is 1000 KiB.

+-------+----------+--------------+--------------------+---------+---------+--------------------+-------------+------------------+--------------------+----------------+--------------------+------+--------------------+--------------------+-------+----------+--------------------+-----------------+--------------------+------------------+
|book_id|   authors|average_rating|         description|   format|num_pages|     popular_shelves|ratings_count|text_reviews_count|               title|publication_date|             user_id|rating|         review_text|        date_updated|n_votes|n_comments|date_added_formatted|read_at_formatted|started_at_formatted|review_text_length|
+-------+----------+--------------+--------------------+---------+---------+--------------------+-------------+------------------+--------------------+----------------+--------------------+------+--------------------+--------------------+-------+----------+--------------------+-----------------+--------------------+---------

[Stage 42:>                                                         (0 + 1) / 1]                                                                                

In [80]:
# Drop the date_updated column
joined_df = joined_df.drop("date_updated")

# Show the updated DataFrame schema to confirm the changes
joined_df.printSchema()


root
 |-- book_id: integer (nullable = true)
 |-- authors: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- author_id: integer (nullable = true)
 |    |    |-- role: string (nullable = true)
 |-- average_rating: float (nullable = true)
 |-- description: string (nullable = true)
 |-- format: string (nullable = true)
 |-- num_pages: integer (nullable = false)
 |-- popular_shelves: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- count: integer (nullable = true)
 |    |    |-- name: string (nullable = true)
 |-- ratings_count: integer (nullable = true)
 |-- text_reviews_count: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- publication_date: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- rating: integer (nullable = true)
 |-- review_text: string (nullable = true)
 |-- n_votes: integer (nullable = true)
 |-- n_comments: integer (nullable = true)
 |-- date_added_formatted: s

In [81]:
# Define the output path in your GCS bucket
gcs_output_path = 'gs://my-bucket-apb/cleaned4/'

# Write the DataFrame to Parquet format directly to GCS
joined_df.write.mode("overwrite").parquet(gcs_output_path)

print("Write to Parquet completed successfully in Google Cloud Storage.")



24/04/24 19:22:43 WARN TaskSetManager: Stage 44 contains a task of very large size (222887 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

Write to Parquet completed successfully in Google Cloud Storage.
