In [1]:
from pyspark.sql import SparkSession
import getpass
username = getpass.getuser()
spark= SparkSession.\
    builder.\
    config('spark.ui.port','0').\
    config("spark.sql.warehouse.dir",f"/user/{username}/warehouse").\
    enableHiveSupport().\
    master('yarn').\
    getOrCreate()

In [2]:
spark

In [3]:
retail_df = spark.read \
.format("csv") \
.option("header","true") \
.option("inferSchema","true") \
.load("retail")

In [4]:
retail_df.show(2)

+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|12/1/2010 8:26|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
only showing top 2 rows



In [23]:
retail_df.write \
.mode("overwrite") \
.option("path","retailsimple") \
.save()

In [24]:
!hadoop fs -ls retailsimple

Found 3 items
-rw-r--r--   3 itv007180 supergroup          0 2023-08-21 15:23 retailsimple/_SUCCESS
-rw-r--r--   3 itv007180 supergroup    1985251 2023-08-21 15:23 retailsimple/part-00000-d30d5175-e88b-4ca5-a739-4bf3ba7f49d1-c000.snappy.parquet
-rw-r--r--   3 itv007180 supergroup    1566498 2023-08-21 15:23 retailsimple/part-00001-d30d5175-e88b-4ca5-a739-4bf3ba7f49d1-c000.snappy.parquet


In [25]:
retail_df.write \
.partitionBy("Country") \
.mode("overwrite") \
.option("path","retailwithpartition").save()

In [26]:
!hadoop fs -ls retailwithpartition

Found 39 items
drwxr-xr-x   - itv007180 supergroup          0 2023-08-21 15:23 retailwithpartition/Country=Australia
drwxr-xr-x   - itv007180 supergroup          0 2023-08-21 15:23 retailwithpartition/Country=Austria
drwxr-xr-x   - itv007180 supergroup          0 2023-08-21 15:23 retailwithpartition/Country=Bahrain
drwxr-xr-x   - itv007180 supergroup          0 2023-08-21 15:23 retailwithpartition/Country=Belgium
drwxr-xr-x   - itv007180 supergroup          0 2023-08-21 15:23 retailwithpartition/Country=Brazil
drwxr-xr-x   - itv007180 supergroup          0 2023-08-21 15:23 retailwithpartition/Country=Canada
drwxr-xr-x   - itv007180 supergroup          0 2023-08-21 15:23 retailwithpartition/Country=Channel Islands
drwxr-xr-x   - itv007180 supergroup          0 2023-08-21 15:23 retailwithpartition/Country=Cyprus
drwxr-xr-x   - itv007180 supergroup          0 2023-08-21 15:23 retailwithpartition/Country=Czech Republic
drwxr-xr-x   - itv007180 supergroup          0 2023-08-21 15:23 retailw

In [12]:
!hadoop fs -ls retailwithpartition/Country=Spain

Found 2 items
-rw-r--r--   3 itv007180 supergroup      25283 2023-08-21 15:15 retailwithpartition/Country=Spain/part-00000-53190ccd-73c5-4462-bbfa-78647f7b836a.c000.snappy.parquet
-rw-r--r--   3 itv007180 supergroup      23284 2023-08-21 15:15 retailwithpartition/Country=Spain/part-00001-53190ccd-73c5-4462-bbfa-78647f7b836a.c000.snappy.parquet


In [19]:
retail_df.write \
.format("csv") \
.mode("overwrite") \
.partitionBy("Country") \
.option("path","retailwithpartitioncsv").save()

In [21]:
!hadoop fs -ls retailwithpartitioncsv

Found 39 items
drwxr-xr-x   - itv007180 supergroup          0 2023-08-21 15:20 retailwithpartitioncsv/Country=Australia
drwxr-xr-x   - itv007180 supergroup          0 2023-08-21 15:20 retailwithpartitioncsv/Country=Austria
drwxr-xr-x   - itv007180 supergroup          0 2023-08-21 15:20 retailwithpartitioncsv/Country=Bahrain
drwxr-xr-x   - itv007180 supergroup          0 2023-08-21 15:20 retailwithpartitioncsv/Country=Belgium
drwxr-xr-x   - itv007180 supergroup          0 2023-08-21 15:20 retailwithpartitioncsv/Country=Brazil
drwxr-xr-x   - itv007180 supergroup          0 2023-08-21 15:20 retailwithpartitioncsv/Country=Canada
drwxr-xr-x   - itv007180 supergroup          0 2023-08-21 15:20 retailwithpartitioncsv/Country=Channel Islands
drwxr-xr-x   - itv007180 supergroup          0 2023-08-21 15:20 retailwithpartitioncsv/Country=Cyprus
drwxr-xr-x   - itv007180 supergroup          0 2023-08-21 15:20 retailwithpartitioncsv/Country=Czech Republic
drwxr-xr-x   - itv007180 supergroup         

In [27]:
!hadoop fs -ls retailwithpartitioncsv/Country=Spain | head

Found 2 items
-rw-r--r--   3 itv007180 supergroup      93977 2023-08-21 15:20 retailwithpartitioncsv/Country=Spain/part-00000-7e7cba55-6443-42f2-a3b5-596872cf8a53.c000.csv
-rw-r--r--   3 itv007180 supergroup      82433 2023-08-21 15:20 retailwithpartitioncsv/Country=Spain/part-00001-7e7cba55-6443-42f2-a3b5-596872cf8a53.c000.csv


In [38]:
retail_df.write \
.mode("overwrite") \
.format("csv") \
.bucketBy(5,"CustomerID") \
.option("path","retailwithbucket") \
.saveAsTable("abhi_db.retailwithbucket")

In [39]:
!hadoop fs -ls retailwithbucket

Found 11 items
-rw-r--r--   3 itv007180 supergroup          0 2023-08-21 15:33 retailwithbucket/_SUCCESS
-rw-r--r--   3 itv007180 supergroup    3806529 2023-08-21 15:33 retailwithbucket/part-00000-88e7a9e0-400a-407f-9152-a18c1dc7f780_00000.c000.csv
-rw-r--r--   3 itv007180 supergroup    2959854 2023-08-21 15:33 retailwithbucket/part-00000-88e7a9e0-400a-407f-9152-a18c1dc7f780_00001.c000.csv
-rw-r--r--   3 itv007180 supergroup   10352587 2023-08-21 15:33 retailwithbucket/part-00000-88e7a9e0-400a-407f-9152-a18c1dc7f780_00002.c000.csv
-rw-r--r--   3 itv007180 supergroup    3879280 2023-08-21 15:33 retailwithbucket/part-00000-88e7a9e0-400a-407f-9152-a18c1dc7f780_00003.c000.csv
-rw-r--r--   3 itv007180 supergroup    3714590 2023-08-21 15:33 retailwithbucket/part-00000-88e7a9e0-400a-407f-9152-a18c1dc7f780_00004.c000.csv
-rw-r--r--   3 itv007180 supergroup    3913879 2023-08-21 15:33 retailwithbucket/part-00001-88e7a9e0-400a-407f-9152-a18c1dc7f780_00000.c000.csv
-rw-r--r--   3 itv007180 superg

In [41]:
!hadoop fs -cat retailwithbucket/part-00000-88e7a9e0-400a-407f-9152-a18c1dc7f780_00000.c000.csv | head

536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850,United Kingdom
536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850,United Kingdom
536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850,United Kingdom
536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850,United Kingdom
536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850,United Kingdom
536365,22752,SET 7 BABUSHKA NESTING BOXES,2,12/1/2010 8:26,7.65,17850,United Kingdom
536365,21730,GLASS STAR FROSTED T-LIGHT HOLDER,6,12/1/2010 8:26,4.25,17850,United Kingdom
536366,22633,HAND WARMER UNION JACK,6,12/1/2010 8:28,1.85,17850,United Kingdom
536366,22632,HAND WARMER RED POLKA DOT,6,12/1/2010 8:28,1.85,17850,United Kingdom
536367,84879,ASSORTED COLOUR BIRD ORNAMENT,32,12/1/2010 8:34,1.69,13047,United Kingdom
cat: Unable to write to output stream.


In [43]:
retail_df.write \
.mode("overwrite") \
.format("csv") \
.bucketBy(5,"CustomerID") \
.sortBy("CustomerID") \
.option("path","retailwithbucket") \
.saveAsTable("abhi_db.retailwithbucketsort")

In [44]:
!hadoop fs -ls retailwithbucket

Found 11 items
-rw-r--r--   3 itv007180 supergroup          0 2023-08-21 15:34 retailwithbucket/_SUCCESS
-rw-r--r--   3 itv007180 supergroup    3806529 2023-08-21 15:34 retailwithbucket/part-00000-32782cfb-d727-48b2-b11e-75e2f2e50fb4_00000.c000.csv
-rw-r--r--   3 itv007180 supergroup    2959854 2023-08-21 15:34 retailwithbucket/part-00000-32782cfb-d727-48b2-b11e-75e2f2e50fb4_00001.c000.csv
-rw-r--r--   3 itv007180 supergroup   10352587 2023-08-21 15:34 retailwithbucket/part-00000-32782cfb-d727-48b2-b11e-75e2f2e50fb4_00002.c000.csv
-rw-r--r--   3 itv007180 supergroup    3879280 2023-08-21 15:34 retailwithbucket/part-00000-32782cfb-d727-48b2-b11e-75e2f2e50fb4_00003.c000.csv
-rw-r--r--   3 itv007180 supergroup    3714590 2023-08-21 15:34 retailwithbucket/part-00000-32782cfb-d727-48b2-b11e-75e2f2e50fb4_00004.c000.csv
-rw-r--r--   3 itv007180 supergroup    3913879 2023-08-21 15:34 retailwithbucket/part-00001-32782cfb-d727-48b2-b11e-75e2f2e50fb4_00000.c000.csv
-rw-r--r--   3 itv007180 superg

In [46]:
!hadoop fs -cat retailwithbucket/part-00000-32782cfb-d727-48b2-b11e-75e2f2e50fb4_00000.c000.csv | head

554132,21094,SET/6 RED SPOTTY PAPER PLATES,12,5/23/2011 9:43,0.85,12360,Austria
554132,21086,SET/6 RED SPOTTY PAPER CUPS,12,5/23/2011 9:43,0.65,12360,Austria
554132,21080,SET/20 RED RETROSPOT PAPER NAPKINS,12,5/23/2011 9:43,0.85,12360,Austria
554132,21989,PACK OF 20 SKULL PAPER NAPKINS,12,5/23/2011 9:43,0.85,12360,Austria
554132,23007,SPACEBOY BABY GIFT SET,6,5/23/2011 9:43,14.95,12360,Austria
554132,23010,CIRCUS PARADE BABY GIFT SET,6,5/23/2011 9:43,14.95,12360,Austria
554132,23008,DOLLY GIRL BABY GIFT SET,6,5/23/2011 9:43,14.95,12360,Austria
554132,84674,FLYING PIG WATERING CAN,6,5/23/2011 9:43,2.95,12360,Austria
554132,22433,WATERING CAN GREEN DINOSAUR,6,5/23/2011 9:43,1.95,12360,Austria
554132,22431,WATERING CAN BLUE ELEPHANT,6,5/23/2011 9:43,1.95,12360,Austria
cat: Unable to write to output stream.
