### Objective:

- Save and retrieve processed data efficiently inside Dataproc.
- Serve data in a structured way for analysis.
- Use Parquet, Hive, and CSV 

In [2]:
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder \
.appName('Olist Ecommerce Performance Optmization') \
.config('spark.executor.memory','6g') \
.config('spark.executor.cores','4') \
.config('spark.executor.instances','2') \
.config('spark.driver.memory','4g') \
.config('spark.driver.maxResultSize','2g') \
.config('spark.sql.shuffle.partitions','64') \
.config('spark.default.parallelism','64') \
.config('spark.sql.adaptive.enabled','true') \
.config('spark.sql.adaptive.coalescePartition.enabled','true') \
.config('spark.sql.autoBroadcastJoinThreshold',20*1024*1024) \
.config('spark.sql.files.maxPartitionBytes','64MB') \
.config('spark.sql.files.openCostInBytes','2MB') \
.config('spark.memory.fraction',0.8) \
.config('spark.memory.storageFraction',0.2) \
.getOrCreate()

25/05/23 13:01:46 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [4]:
full_orders_df = spark.read.parquet('/data/olist_proc/full_orders_df_3.parquet')

                                                                                

In [5]:
full_orders_df.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- seller_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_purchase_timestamp: timestamp (nullable = true)
 |-- order_approved_at: timestamp (nullable = true)
 |-- order_delivered_carrier_date: timestamp (nullable = true)
 |-- order_delivered_customer_date: timestamp (nullable = true)
 |-- order_estimated_delivery_date: timestamp (nullable = true)
 |-- order_item_id: integer (nullable = true)
 |-- shipping_limit_date: timestamp (nullable = true)
 |-- price: double (nullable = true)
 |-- freight_value: double (nullable = true)
 |-- product_category_name: string (nullable = true)
 |-- product_name_lenght: integer (nullable = true)
 |-- product_description_lenght: integer (nullable = true)
 |-- product_photos_qty: integer (nullable = true)
 |-- product_weight_g: integer (nullable = true)
 |-- product_length_cm: integer (null

In [6]:
# save as Parquet in hdfs

full_orders_df.write.mode('overwrite').parquet('/olist/proc')

25/05/23 13:02:02 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [7]:
# Save is as a parquet in Google cloud storage

full_orders_df.write.mode('overwrite').parquet('gs://dataproc-staging-us-central1-26920286081-oi3gsnrk/temp_data')

                                                                                

In [8]:
full_orders_df.write.mode('overwrite').saveAsTable('full_order_detail')

ivysettings.xml file not found in HIVE_HOME or HIVE_CONF_DIR,/etc/hive/conf.dist/ivysettings.xml will be used
25/05/23 13:06:56 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.


In [9]:
spark.sql('show tables')

DataFrame[namespace: string, tableName: string, isTemporary: boolean]

In [10]:
full_orders_df.write.mode('overwrite').option('header','true').csv('/olist/proc/')

25/05/23 13:09:43 WARN YarnSchedulerBackend$YarnSchedulerEndpoint: Requesting driver to remove executor 4 for reason Container marked as failed: container_1748004378221_0004_01_000005 on host: cluster-7bde-w-0.us-central1-a.c.data-engineering-457905.internal. Exit status: -100. Diagnostics: Container released on a *lost* node.
25/05/23 13:09:43 WARN YarnSchedulerBackend$YarnSchedulerEndpoint: Requesting driver to remove executor 2 for reason Container marked as failed: container_1748004378221_0004_01_000002 on host: cluster-7bde-w-0.us-central1-a.c.data-engineering-457905.internal. Exit status: -100. Diagnostics: Container released on a *lost* node.
25/05/23 13:09:43 ERROR YarnScheduler: Lost executor 4 on cluster-7bde-w-0.us-central1-a.c.data-engineering-457905.internal: Container marked as failed: container_1748004378221_0004_01_000005 on host: cluster-7bde-w-0.us-central1-a.c.data-engineering-457905.internal. Exit status: -100. Diagnostics: Container released on a *lost* node.
25/05

In [11]:
spark.stop()

25/05/23 13:12:40 WARN YarnAllocatorNodeHealthTracker: No available nodes reported, please check Resource Manager.
25/05/23 13:12:40 WARN ApplicationMaster: Reporter thread fails 1 time(s) in a row.
java.io.InterruptedIOException: Call interrupted
	at org.apache.hadoop.ipc.Client.getRpcResponse(Client.java:1557) ~[hadoop-client-api-3.3.6.jar:?]
	at org.apache.hadoop.ipc.Client.call(Client.java:1509) ~[hadoop-client-api-3.3.6.jar:?]
	at org.apache.hadoop.ipc.Client.call(Client.java:1406) ~[hadoop-client-api-3.3.6.jar:?]
	at org.apache.hadoop.ipc.ProtobufRpcEngine2$Invoker.invoke(ProtobufRpcEngine2.java:258) ~[hadoop-client-api-3.3.6.jar:?]
	at org.apache.hadoop.ipc.ProtobufRpcEngine2$Invoker.invoke(ProtobufRpcEngine2.java:139) ~[hadoop-client-api-3.3.6.jar:?]
	at com.sun.proxy.$Proxy40.allocate(Unknown Source) ~[?:?]
	at org.apache.hadoop.yarn.api.impl.pb.client.ApplicationMasterProtocolPBClientImpl.allocate(ApplicationMasterProtocolPBClientImpl.java:78) ~[hadoop-client-api-3.3.6.jar:?]