In [13]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col, lit

In [14]:
# Reuse the active Spark session if one exists, otherwise start a new one.
session_builder = SparkSession.builder.appName('Hadoop and Spark Samples')
spark = session_builder.getOrCreate()

+ Read a CSV file, display its contents, and write it as Parquet

In [15]:
# Load the raw sales orders. The first row of the file supplies the column
# names; no schema is given or inferred here.
# A forward slash is used as the separator: the previous '\s' was an invalid
# string escape (SyntaxWarning on recent Python versions) and Windows-only.
df_sales_order = spark.read.csv('Exercise Files/sales_orders.csv', header=True)

# Preview the first five rows.
df_sales_order.show(5)

+---+--------+--------+----------+--------+-----+---------------+
| ID|Customer| Product|      Date|Quantity| Rate|           Tags|
+---+--------+--------+----------+--------+-----+---------------+
|  1|   Apple|Keyboard|2019/11/21|       5|31.15|Discount:Urgent|
|  2|LinkedIn| Headset|2019/11/25|       5| 36.9|  Urgent:Pickup|
|  3|Facebook|Keyboard|2019/11/24|       5|49.89|           NULL|
|  4|  Google|  Webcam|2019/11/07|       4|34.21|       Discount|
|  5|LinkedIn|  Webcam|2019/11/21|       3|48.69|         Pickup|
+---+--------+--------+----------+--------+-----+---------------+
only showing top 5 rows



### Handling parquet files

+ Write the dataframe as a parquet file

In [16]:
# Persist the sales orders as Snappy-compressed Parquet, replacing any
# previous output at the same location.
# - The output format is selected with .format(); passing 'format' as an
#   option does not choose a data source.
# - The Parquet codec option key is 'compression' (not 'compress').
# - Forward slashes avoid the invalid '\p' / '\s' string escapes.
(
    df_sales_order.write
    .format('parquet')
    .option('compression', 'snappy')
    .mode('overwrite')
    .save('output/parquet/sales_order')
)

+ Read parquet file into dataframe

In [17]:
# Load the Parquet output written by the previous step back into a dataframe.
# Forward slashes replace the backslashes, which formed invalid string
# escapes ('\p', '\s') and only worked as separators on Windows.
df_parquet = spark.read.parquet('output/parquet/sales_order')

# Preview the first five rows to confirm the round trip.
df_parquet.show(5)

+---+--------+--------+----------+--------+-----+---------------+
| ID|Customer| Product|      Date|Quantity| Rate|           Tags|
+---+--------+--------+----------+--------+-----+---------------+
|  1|   Apple|Keyboard|2019/11/21|       5|31.15|Discount:Urgent|
|  2|LinkedIn| Headset|2019/11/25|       5| 36.9|  Urgent:Pickup|
|  3|Facebook|Keyboard|2019/11/24|       5|49.89|           NULL|
|  4|  Google|  Webcam|2019/11/07|       4|34.21|       Discount|
|  5|LinkedIn|  Webcam|2019/11/21|       3|48.69|         Pickup|
+---+--------+--------+----------+--------+-----+---------------+
only showing top 5 rows



In [18]:
# Print only the physical plan for the Parquet scan (the non-extended form).
df_parquet.explain(extended=False)

== Physical Plan ==
*(1) ColumnarToRow
+- FileScan parquet [ID#344,Customer#345,Product#346,Date#347,Quantity#348,Rate#349,Tags#350] Batched: true, DataFilters: [], Format: Parquet, Location: InMemoryFileIndex(1 paths)[file:/c:/Users/bidem/Projects/LinkedIn/bda_hadoop_spark/output/parquet..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<ID:string,Customer:string,Product:string,Date:string,Quantity:string,Rate:string,Tags:string>




In [19]:
# Build the sales summary: add a per-order Total, then aggregate Rate (mean)
# and Quantity (sum) for each Customer / Product / Tags / Total combination.
#
# NOTE: this cell rebinds `df_parquet` to the aggregated result, so running it
# a second time aggregates the already-aggregated frame. Re-run the Parquet
# read cell first when you need to start over.
df_parquet = (
    df_parquet
    # A bare 'decimal' is decimal(10,0) and would drop the fractional part of
    # Rate * Quantity; keep two decimal places instead.
    .withColumn('Total', (col('Rate') * col('Quantity')).cast('decimal(10,2)'))
    .groupBy('Customer', 'Product', 'Tags', 'Total')
    # Alias each aggregate explicitly rather than renaming positionally after
    # a dict-based agg(), whose output column order is not guaranteed.
    .agg(
        F.avg('Rate').alias('Rate'),
        F.sum('Quantity').alias('Quantity'),
    )
    # Customer ascending, Product descending.
    .orderBy('Customer', 'Product', ascending=[True, False])
)

In [20]:
# Export the summary as CSV with a header row, replacing any previous export.
# Forward slashes replace the backslashes, which formed invalid string
# escapes ('\c', '\s') and only worked as separators on Windows.
df_parquet.write.csv('output/csv/sales_order_summary', mode='overwrite', header=True)

### Handling partitioned files

+ Write files by partitions

In [21]:
# Write the sales orders as Snappy-compressed Parquet with one sub-directory
# per product, replacing any previous output at the same location.
# - The output format is selected with .format(); passing 'format' as an
#   option does not choose a data source.
# - The Parquet codec option key is 'compression' (not 'compress').
# - The partition column is spelled as in the dataframe header ('Product').
# - Forward slashes avoid the invalid '\p' / '\s' string escapes.
(
    df_sales_order.write
    .format('parquet')
    .option('compression', 'snappy')
    .mode('overwrite')
    .partitionBy('Product')
    .save('output/parquet/sales_order_product_partition')
)

In [22]:
# Row count of the dataframe that was written out by partition.
total_records = df_sales_order.count()
print(f'Total records in partitoned file : {total_records}')

Total records in partitoned file : 100


+ Read a single partition directly from its directory

In [23]:
# Count the rows of one partition by pointing the reader at its directory.
# Forward slashes replace the backslashes, which formed invalid string
# escapes ('\p', '\s') and only worked as separators on Windows.
# NOTE(review): the directory segment is kept as 'product=headset'; on a
# case-sensitive filesystem it must match the directory name on disk exactly
# -- confirm the casing there.
spark.read.parquet('output/parquet/sales_order_product_partition/product=headset').count()
# To inspect the physical plan instead of counting:
#spark.read.parquet('output/parquet/sales_order_product_partition/product=headset').explain()

23

+ Read all partitions from disk

In [24]:
# Count the rows across every partition directory under the partitioned output.
# Forward slashes replace the backslashes, which formed invalid string
# escapes ('\p', '\s', '\*') and only worked as separators on Windows.
spark.read.parquet('output/parquet/sales_order_product_partition/*').count()

100