In [None]:
spark.stop()

In [3]:
from pyspark.sql import SparkSession

spark=SparkSession.builder\
.appName('Data Cleaning & Transformation')\
.master('yarn')\
.getOrCreate()

25/09/25 21:36:44 INFO SparkEnv: Registering MapOutputTracker
25/09/25 21:36:44 INFO SparkEnv: Registering BlockManagerMaster
25/09/25 21:36:44 INFO SparkEnv: Registering BlockManagerMasterHeartbeat
25/09/25 21:36:44 INFO SparkEnv: Registering OutputCommitCoordinator


In [4]:
spark

In [6]:
!hadoop fs -ls /data

Found 1 items
drwxr-xr-x   - bhavishya_kollipara hadoop          0 2025-09-23 19:52 /data/olist


In [1]:
data_path='/data/olist/'
customer_df=spark.read.csv(data_path+'olist_customers_dataset.csv',header=True,inferSchema=True)
orders_df=spark.read.csv(data_path+'olist_orders_dataset.csv',header=True,inferSchema=True)
order_items_df=spark.read.csv(data_path+'olist_order_items_dataset.csv',header=True,inferSchema=True)
payments_df=spark.read.csv(data_path+'olist_order_payments_dataset.csv',header=True,inferSchema=True)
reviews_df=spark.read.csv(data_path+'olist_order_reviews_dataset.csv',header=True,inferSchema=True)
geolocation_df=spark.read.csv(data_path+'olist_geolocation_dataset.csv',header=True,inferSchema=True)
sellers_df=spark.read.csv(data_path+'olist_sellers_dataset.csv',header=True,inferSchema=True)
product_df=spark.read.csv(data_path+'olist_products_dataset.csv',header=True,inferSchema=True)
category_translation_df=spark.read.csv(data_path+'product_category_name_translation.csv',header=True,inferSchema=True)



                                                                                

In [2]:
from pyspark.sql.functions import *

In [27]:
#Identify Missing Values

def missing_values(df, df_name):
    print(f"Missing Values in Dataframe:{df_name}")
    df.select([count(when(col(c).isNull(),1)).alias(c) for c in df.columns]).show()

In [28]:
missing_values(customer_df, 'customer')

Missing Values in Dataframe:customer


[Stage 18:>                                                         (0 + 2) / 2]

+-----------+------------------+------------------------+-------------+--------------+
|customer_id|customer_unique_id|customer_zip_code_prefix|customer_city|customer_state|
+-----------+------------------+------------------------+-------------+--------------+
|          0|                 0|                       0|            0|             0|
+-----------+------------------+------------------------+-------------+--------------+



                                                                                

In [29]:
missing_values(orders_df,'orders')

Missing Values in Dataframe:orders




+--------+-----------+------------+------------------------+-----------------+----------------------------+-----------------------------+-----------------------------+
|order_id|customer_id|order_status|order_purchase_timestamp|order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|
+--------+-----------+------------+------------------------+-----------------+----------------------------+-----------------------------+-----------------------------+
|       0|          0|           0|                       0|              160|                        1783|                         2965|                            0|
+--------+-----------+------------+------------------------+-----------------+----------------------------+-----------------------------+-----------------------------+



                                                                                

In [31]:
missing_values(order_items_df,'order_item')

Missing Values in Dataframe:order_item




+--------+-------------+----------+---------+-------------------+-----+-------------+
|order_id|order_item_id|product_id|seller_id|shipping_limit_date|price|freight_value|
+--------+-------------+----------+---------+-------------------+-----+-------------+
|       0|            0|         0|        0|                  0|    0|            0|
+--------+-------------+----------+---------+-------------------+-----+-------------+



                                                                                

In [32]:
missing_values(payments_df,'payments')

Missing Values in Dataframe:payments
+--------+------------------+------------+--------------------+-------------+
|order_id|payment_sequential|payment_type|payment_installments|payment_value|
+--------+------------------+------------+--------------------+-------------+
|       0|                 0|           0|                   0|            0|
+--------+------------------+------------+--------------------+-------------+



In [34]:
missing_values(reviews_df, 'reviews')

Missing Values in Dataframe:reviews




+---------+--------+------------+--------------------+----------------------+--------------------+-----------------------+
|review_id|order_id|review_score|review_comment_title|review_comment_message|review_creation_date|review_answer_timestamp|
+---------+--------+------------+--------------------+----------------------+--------------------+-----------------------+
|        1|    2236|        2380|               92157|                 63079|                8764|                   8785|
+---------+--------+------------+--------------------+----------------------+--------------------+-----------------------+



                                                                                

In [35]:
missing_values(geolocation_df, 'Location')

Missing Values in Dataframe:Location




+---------------------------+---------------+---------------+----------------+-----------------+
|geolocation_zip_code_prefix|geolocation_lat|geolocation_lng|geolocation_city|geolocation_state|
+---------------------------+---------------+---------------+----------------+-----------------+
|                          0|              0|              0|               0|                0|
+---------------------------+---------------+---------------+----------------+-----------------+



                                                                                

In [36]:
missing_values(sellers_df, 'seller')

Missing Values in Dataframe:seller
+---------+----------------------+-----------+------------+
|seller_id|seller_zip_code_prefix|seller_city|seller_state|
+---------+----------------------+-----------+------------+
|        0|                     0|          0|           0|
+---------+----------------------+-----------+------------+



In [37]:
missing_values(product_df, 'products')

Missing Values in Dataframe:products
+----------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+
|product_id|product_category_name|product_name_lenght|product_description_lenght|product_photos_qty|product_weight_g|product_length_cm|product_height_cm|product_width_cm|
+----------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+
|         0|                  610|                610|                       610|               610|               2|                2|                2|               2|
+----------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+



In [38]:
missing_values(category_translation_df, 'category_translation')

Missing Values in Dataframe:category_translation
+---------------------+-----------------------------+
|product_category_name|product_category_name_english|
+---------------------+-----------------------------+
|                    0|                            0|
+---------------------+-----------------------------+



# Handle Missing values

1. Drop Missing Values ( for non- critical columns )
2. Fill the missing Values  ( for non-continous( date, ordinal, categorical ) columns )
3. Impute missing Values ( for continous(numerical) data )



In [3]:
#Handling missing values in order_df - the above columns are more important so i f i have null values i will drop them

orders_df_cleaned= orders_df.na.drop(subset=['order_id','customer_id','order_status'])
orders_df_cleaned= orders_df_cleaned.fillna({'order_delivered_customer_date':'9999-12-31'})
orders_df_cleaned= orders_df_cleaned.fillna({'order_approved_at':'9999-12-31'})
orders_df_cleaned= orders_df_cleaned.fillna({'order_delivered_carrier_date':'9999-12-31'})

In [4]:
products_df_cleaned = product_df.na.drop(subset=["product_id"])
products_df_cleaned = products_df_cleaned.fillna({"product_category_name": "unknown"})
products_df_cleaned = products_df_cleaned.fillna({
    "product_name_lenght": 0,
    "product_description_lenght": 0,
    "product_photos_qty": 0
})


# Impute Missing Values

In [5]:
from pyspark.ml.feature import Imputer

imputer = Imputer(
    inputCols=["payment_value"],
    outputCols=["payment_value_imputed"]
).setStrategy("median")

payments_df_cleaned = imputer.fit(payments_df).transform(payments_df)


                                                                                

# Standardizing the Format

In [49]:
def print_schema(df, df_name):
    print(f"Schema of {df_name}")
    df.printSchema()

In [50]:
print_schema(customer_df,'customer')

Schema of customer
root
 |-- customer_id: string (nullable = true)
 |-- customer_unique_id: string (nullable = true)
 |-- customer_zip_code_prefix: integer (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_state: string (nullable = true)



In [51]:
print_schema(orders_df,'orders')

Schema of orders
root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_purchase_timestamp: timestamp (nullable = true)
 |-- order_approved_at: timestamp (nullable = true)
 |-- order_delivered_carrier_date: timestamp (nullable = true)
 |-- order_delivered_customer_date: timestamp (nullable = true)
 |-- order_estimated_delivery_date: timestamp (nullable = true)



In [52]:
print_schema(order_items_df,'order_item')

Schema of order_item
root
 |-- order_id: string (nullable = true)
 |-- order_item_id: integer (nullable = true)
 |-- product_id: string (nullable = true)
 |-- seller_id: string (nullable = true)
 |-- shipping_limit_date: timestamp (nullable = true)
 |-- price: double (nullable = true)
 |-- freight_value: double (nullable = true)



In [53]:
print_schema(payments_df,'payments')

Schema of payments
root
 |-- order_id: string (nullable = true)
 |-- payment_sequential: integer (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- payment_installments: integer (nullable = true)
 |-- payment_value: double (nullable = true)



In [54]:
print_schema(reviews_df,'reviews')

Schema of reviews
root
 |-- review_id: string (nullable = true)
 |-- order_id: string (nullable = true)
 |-- review_score: string (nullable = true)
 |-- review_comment_title: string (nullable = true)
 |-- review_comment_message: string (nullable = true)
 |-- review_creation_date: string (nullable = true)
 |-- review_answer_timestamp: string (nullable = true)



In [55]:
print_schema(geolocation_df,'Location')

Schema of Location
root
 |-- geolocation_zip_code_prefix: integer (nullable = true)
 |-- geolocation_lat: double (nullable = true)
 |-- geolocation_lng: double (nullable = true)
 |-- geolocation_city: string (nullable = true)
 |-- geolocation_state: string (nullable = true)



In [56]:
print_schema(sellers_df,'seller')

Schema of seller
root
 |-- seller_id: string (nullable = true)
 |-- seller_zip_code_prefix: integer (nullable = true)
 |-- seller_city: string (nullable = true)
 |-- seller_state: string (nullable = true)



In [57]:
print_schema(product_df,'products')

Schema of products
root
 |-- product_id: string (nullable = true)
 |-- product_category_name: string (nullable = true)
 |-- product_name_lenght: integer (nullable = true)
 |-- product_description_lenght: integer (nullable = true)
 |-- product_photos_qty: integer (nullable = true)
 |-- product_weight_g: integer (nullable = true)
 |-- product_length_cm: integer (nullable = true)
 |-- product_height_cm: integer (nullable = true)
 |-- product_width_cm: integer (nullable = true)



In [58]:
print_schema(category_translation_df,'category_translation')

Schema of category_translation
root
 |-- product_category_name: string (nullable = true)
 |-- product_category_name_english: string (nullable = true)



In [6]:
# how to remove timeStamp
order_df_cleaned=orders_df_cleaned.withColumn('order_purchase_timestamp', to_date(col('order_purchase_timestamp')))
order_df_cleaned.show(5)

+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+
|            order_id|         customer_id|order_status|order_purchase_timestamp|  order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|
+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+
|e481f51cbdc54678b...|9ef432eb625129730...|   delivered|              2017-10-02|2017-10-02 11:07:15|         2017-10-04 19:55:00|          2017-10-10 21:25:13|          2017-10-18 00:00:00|
|53cdb2fc8bc7dce0b...|b0830fb4747a6c6d2...|   delivered|              2018-07-24|2018-07-26 03:24:27|         2018-07-26 14:31:00|          2018-08-07 15:27:45|          2018-08-13 00:00:00|
|47770eb9100c2d0c4...|41ce2a54c0b03bf34...|  

In [7]:
#How to change schema of particular column

payments_df.withColumn('payment_value_int', payments_df.payment_value.cast('integer')).show(5)


+--------------------+------------------+------------+--------------------+-------------+-----------------+
|            order_id|payment_sequential|payment_type|payment_installments|payment_value|payment_value_int|
+--------------------+------------------+------------+--------------------+-------------+-----------------+
|b81ef226f3fe1789b...|                 1| credit_card|                   8|        99.33|               99|
|a9810da82917af2d9...|                 1| credit_card|                   1|        24.39|               24|
|25e8ea4e93396b6fa...|                 1| credit_card|                   1|        65.71|               65|
|ba78997921bbcdc13...|                 1| credit_card|                   8|       107.78|              107|
|42fdf880ba16b47b5...|                 1| credit_card|                   2|       128.45|              128|
+--------------------+------------------+------------+--------------------+-------------+-----------------+
only showing top 5 rows



In [8]:
#we don't know what boleto is in payment_type so we want to replace with another value

payments_df_cleaned=payments_df_cleaned.withColumn('payment_type', 
                                                   when(col('payment_type')=='boleto','Bank Transfer')
                                                   .when(col('payment_type')=='credit_card','Credit Card')
                                                   .when(col('payment_type')=='debit_card','Debit card')
                                                  .otherwise(col("payment_type")))

payments_df_cleaned.show()

+--------------------+------------------+-------------+--------------------+-------------+---------------------+
|            order_id|payment_sequential| payment_type|payment_installments|payment_value|payment_value_imputed|
+--------------------+------------------+-------------+--------------------+-------------+---------------------+
|b81ef226f3fe1789b...|                 1|  Credit Card|                   8|        99.33|                99.33|
|a9810da82917af2d9...|                 1|  Credit Card|                   1|        24.39|                24.39|
|25e8ea4e93396b6fa...|                 1|  Credit Card|                   1|        65.71|                65.71|
|ba78997921bbcdc13...|                 1|  Credit Card|                   8|       107.78|               107.78|
|42fdf880ba16b47b5...|                 1|  Credit Card|                   2|       128.45|               128.45|
|298fcdf1f73eb413e...|                 1|  Credit Card|                   2|        96.12|      

In [72]:
customer_df.show(5)

+--------------------+--------------------+------------------------+--------------------+--------------+
|         customer_id|  customer_unique_id|customer_zip_code_prefix|       customer_city|customer_state|
+--------------------+--------------------+------------------------+--------------------+--------------+
|06b8999e2fba1a1fb...|861eff4711a542e4b...|                   14409|              franca|            SP|
|18955e83d337fd6b2...|290c77bc529b7ac93...|                    9790|sao bernardo do c...|            SP|
|4e7b3e00288586ebd...|060e732b5b29e8181...|                    1151|           sao paulo|            SP|
|b2b6027bc5c5109e5...|259dac757896d24d7...|                    8775|     mogi das cruzes|            SP|
|4f2d8ab171c80ec83...|345ecd01c38d18a90...|                   13056|            campinas|            SP|
+--------------------+--------------------+------------------------+--------------------+--------------+
only showing top 5 rows



In [9]:
#customer_zip_code_prefix the schema defined this type as an integer but we have to make it string coz no operations should be performed on this because of its numerical value

customer_df_cleaned=customer_df.withColumn('customer_zip_code_prefix', customer_df.customer_zip_code_prefix.cast('string'))

In [10]:
customer_df_cleaned.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- customer_unique_id: string (nullable = true)
 |-- customer_zip_code_prefix: string (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_state: string (nullable = true)



# Remove Duplicate records

In [11]:
def remove_duplicates(df,df_name,columns):
    print(f"removed duplicates in {df_name}")
    df_cleaned =df.dropDuplicates(columns)
    return df_cleaned 

In [12]:
customer_cleaned_df=remove_duplicates(customer_df,'customer',['customer_id'])

removed duplicates in customer


In [13]:
order_df_cleaned=remove_duplicates(order_df_cleaned,'orders',['order_id'])


removed duplicates in orders


In [14]:
payments_df_cleaned=remove_duplicates(payments_df_cleaned,'payments',['order_id','payment_sequential'])

removed duplicates in payments


In [15]:
reviews_cleaned_df=remove_duplicates(reviews_df, 'reviews',["review_id"])

removed duplicates in reviews


In [16]:
seller_cleaned_df=remove_duplicates(sellers_df, 'seller',["seller_id"])

removed duplicates in seller


In [17]:
product_cleaned_df=remove_duplicates(product_df, 'product',["product_id"])

removed duplicates in product


In [18]:
category_translation_cleaned_df= remove_duplicates(category_translation_df, 'category Translation', ["product_category_name"])

removed duplicates in category Translation


# Transformations

In [19]:
order_with_details=order_df_cleaned.join(order_items_df, 'order_id', 'left')\
.join(payments_df_cleaned, 'order_id', 'left')\
.join(customer_cleaned_df, 'customer_id','left')

In [20]:
order_with_details.show(5)

[Stage 33:>                                                         (0 + 1) / 1]

+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+-------------+--------------------+--------------------+-------------------+------+-------------+------------------+------------+--------------------+-------------+---------------------+--------------------+------------------------+-------------+--------------+
|         customer_id|            order_id|order_status|order_purchase_timestamp|  order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|order_item_id|          product_id|           seller_id|shipping_limit_date| price|freight_value|payment_sequential|payment_type|payment_installments|payment_value|payment_value_imputed|  customer_unique_id|customer_zip_code_prefix|customer_city|customer_state|
+--------------------+--------------------+------------+------------------------+---------------

                                                                                

In [21]:
#Feature Engineering - Total_order_value

order_with_total_value=order_with_details.groupBy('order_id').agg(sum(col('payment_value')).alias('Total_order_value'))

In [22]:
order_with_total_value.show(5)

[Stage 40:>                                                         (0 + 1) / 1]

+--------------------+-----------------+
|            order_id|Total_order_value|
+--------------------+-----------------+
|118045506e1c1dda0...|           1802.0|
|f44cb69655f8e4d13...|           164.32|
|edcc6b79e8394346b...|           162.63|
|9f98d6530155e3b38...|           316.76|
|949280c70c6d62ec9...|            49.42|
+--------------------+-----------------+
only showing top 5 rows



                                                                                

In [24]:
#Avg delivery time
order_with_details.withColumn('Avg Delivery Time', datediff(col('order_delivered_customer_date'),col('order_purchase_timestamp')))

DataFrame[customer_id: string, order_id: string, order_status: string, order_purchase_timestamp: date, order_approved_at: timestamp, order_delivered_carrier_date: timestamp, order_delivered_customer_date: timestamp, order_estimated_delivery_date: timestamp, order_item_id: int, product_id: string, seller_id: string, shipping_limit_date: timestamp, price: double, freight_value: double, payment_sequential: int, payment_type: string, payment_installments: int, payment_value: double, payment_value_imputed: double, customer_unique_id: string, customer_zip_code_prefix: int, customer_city: string, customer_state: string, Avg Delivery Time: int]

# Advance Transformations

In [25]:
#0.01-1st percentile, 0.99- 99th percentile
quantiles=order_items_df.approxQuantile('price',[0.01,0.99],0.0)
low_cutoff, high_cutoff= quantiles[0],quantiles[1]

                                                                                

In [26]:
low_cutoff, high_cutoff

(9.99, 890.0)

In [29]:
 order_items_df_cleaned= order_items_df.filter((col('price')>=low_cutoff) &(col('price')<=high_cutoff) )

In [31]:
#How to find if the shipment is lightweight or heavy- this can be known by the product weight

product_cleaned_df.show(5)

+--------------------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+
|          product_id|product_category_name|product_name_lenght|product_description_lenght|product_photos_qty|product_weight_g|product_length_cm|product_height_cm|product_width_cm|
+--------------------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+
|00066f42aeeb9f300...|           perfumaria|                 53|                       596|                 6|             300|               20|               16|              16|
|00088930e925c41fd...|           automotivo|                 56|                       752|                 4|            1225|               55|               10|              26|
|000d9be29b5207b54...|   relogios_presentes|                 48|                       613|    

In [33]:
product_cleaned_df=product_cleaned_df.withColumn('product_weight_category', 
                                                 when (col('product_weight_g') < 500,'lightweight')
                                                 .when(col('product_weight_g'). between(500,2000), 'medium')
                                                 .otherwise('Heavy'))

product_cleaned_df.show(5)

[Stage 48:>                                                         (0 + 1) / 1]

+--------------------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+-----------------------+
|          product_id|product_category_name|product_name_lenght|product_description_lenght|product_photos_qty|product_weight_g|product_length_cm|product_height_cm|product_width_cm|product_weight_category|
+--------------------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+-----------------------+
|00066f42aeeb9f300...|           perfumaria|                 53|                       596|                 6|             300|               20|               16|              16|            lightweight|
|00088930e925c41fd...|           automotivo|                 56|                       752|                 4|            1225|               55|               10|              26|

                                                                                

In [39]:
#to know whether the package is small, medium, large. I am keeping the volume column as well to understand how much the volume is
product_cleaned_df = product_cleaned_df.withColumn(
    "volume_cm3",
    col("product_length_cm") * col("product_height_cm") * col("product_width_cm")
)

product_cleaned_df.show(5)

+--------------------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+-----------------------+----------+
|          product_id|product_category_name|product_name_lenght|product_description_lenght|product_photos_qty|product_weight_g|product_length_cm|product_height_cm|product_width_cm|product_weight_category|volume_cm3|
+--------------------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+-----------------------+----------+
|00066f42aeeb9f300...|           perfumaria|                 53|                       596|                 6|             300|               20|               16|              16|            lightweight|      5120|
|00088930e925c41fd...|           automotivo|                 56|                       752|                 4|            1225|         

In [41]:
#thiese thresholds are hardcoded we can do quantiles if we want symettrical data
product_cleaned_df=product_cleaned_df.withColumn("shipment_size", 
                              when(col('volume_cm3')<1000, "Small")
                              .when(col('volume_cm3').between(1000,10000), "Medium")
                              .otherwise('Large')
                             )

product_cleaned_df.show(5)

+--------------------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+-----------------------+----------+-------------+
|          product_id|product_category_name|product_name_lenght|product_description_lenght|product_photos_qty|product_weight_g|product_length_cm|product_height_cm|product_width_cm|product_weight_category|volume_cm3|shipment_size|
+--------------------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+-----------------------+----------+-------------+
|00066f42aeeb9f300...|           perfumaria|                 53|                       596|                 6|             300|               20|               16|              16|            lightweight|      5120|       Medium|
|00088930e925c41fd...|           automotivo|                 56|                

In [45]:
#Total Revenue per seller

order_items_df_cleaned.groupBy('seller_id').agg(sum(col('price')).alias('total_revenue_per_seller')).orderBy('total_revenue_per_seller', ascending=False).show(5)



+--------------------+------------------------+
|           seller_id|total_revenue_per_seller|
+--------------------+------------------------+
|4869f7a5dfa277a7d...|      228572.63999999908|
|4a3ca9315b744ce9f...|       200472.9199999981|
|7c67e1448b00f6e96...|      187923.89000000118|
|fa1c13f2614d7b5c4...|       162793.4399999999|
|da8622b14eb17ae28...|       160196.9699999994|
+--------------------+------------------------+
only showing top 5 rows



                                                                                

In [47]:
!hadoop fs -mkdir /data/olist_processed

In [49]:
order_with_details.write.mode('overwrite').parquet('/data/olist_processed/cleaned_data.parquet')

                                                                                

In [50]:
!hadoop fs -ls /data/olist_processed/

Found 1 items
drwxr-xr-x   - root hadoop          0 2025-09-26 21:14 /data/olist_processed/cleaned_data.parquet


In [51]:
order_with_details.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- order_id: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_purchase_timestamp: date (nullable = true)
 |-- order_approved_at: timestamp (nullable = true)
 |-- order_delivered_carrier_date: timestamp (nullable = true)
 |-- order_delivered_customer_date: timestamp (nullable = true)
 |-- order_estimated_delivery_date: timestamp (nullable = true)
 |-- order_item_id: integer (nullable = true)
 |-- product_id: string (nullable = true)
 |-- seller_id: string (nullable = true)
 |-- shipping_limit_date: timestamp (nullable = true)
 |-- price: double (nullable = true)
 |-- freight_value: double (nullable = true)
 |-- payment_sequential: integer (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- payment_installments: integer (nullable = true)
 |-- payment_value: double (nullable = true)
 |-- payment_value_imputed: double (nullable = true)
 |-- customer_unique_id: string (nullable = true)
 |-- custo

In [52]:
#if we have lot of data it will take days to run to save that parquet file but we can use hive table as well as we have lot of columns in order_with_details schema i am doing for product


product_cleaned_df.write.mode('overwrite').parquet('/data/olist_processed/cleaned_product_data.parquet')


                                                                                

In [53]:
product_cleaned_df.printSchema()

root
 |-- product_id: string (nullable = true)
 |-- product_category_name: string (nullable = true)
 |-- product_name_lenght: integer (nullable = true)
 |-- product_description_lenght: integer (nullable = true)
 |-- product_photos_qty: integer (nullable = true)
 |-- product_weight_g: integer (nullable = true)
 |-- product_length_cm: integer (nullable = true)
 |-- product_height_cm: integer (nullable = true)
 |-- product_width_cm: integer (nullable = true)
 |-- product_weight_category: string (nullable = false)
 |-- volume_cm3: integer (nullable = true)
 |-- shipment_size: string (nullable = false)



In [None]:
#we should run this in terminal. pen terminal, then type hive, a shell will open there u can run this command
create EXTERNAL  TABLE cleaned_products(
    product STRING,
    product_category_name STRING,
    product_name_lenght INT,
    product_description_lenght STRING,
    product_photos_qty INT,
    product_weight_g INT,
    product_length_cm INT,
    product_height_cm INT,
    product_width_cm INT,
    product_weight_category STRING,
    volume_cm3 INT,
    shipment_size STRING

)
stored as parquet
Location '/data/olist_processed/cleaned_product_data.parqute';