In [1]:
from pyspark.sql import SparkSession
import getpass
# Login name of the current OS user; it selects the per-user warehouse directory below.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled session running on YARN.
# NOTE(review): spark.ui.port=0 presumably lets Spark bind its UI to any free port - confirm.
spark = (
    SparkSession.builder
    .config('spark.ui.port', '0')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master('yarn')
    .getOrCreate()
)

In [2]:
# Bare expression: the notebook displays the session object (no output was captured in this export).
spark

In [4]:
# List only the databases whose name contains 'itv007008'.
# Left as the cell's last expression so the notebook renders the result.
(
    spark.sql("show databases")
    .filter("namespace like '%itv007008%'")
)

namespace
itv007008_retail


In [5]:
# Switch the current database so the queries below can use unqualified table names.
spark.sql("use itv007008_retail")

In [6]:
# List the tables of the current database.
spark.sql("show tables").show()

+----------------+----------+-----------+
|        database| tableName|isTemporary|
+----------------+----------+-----------+
|itv007008_retail|    orders|      false|
|itv007008_retail|orders_ext|      false|
+----------------+----------+-----------+



In [13]:
# Column definitions followed by detailed table information; truncate=False keeps long values unclipped.
spark.sql("describe extended orders_ext").show(truncate=False)

+----------------------------+----------------------------------------------------------------+-------+
|col_name                    |data_type                                                       |comment|
+----------------------------+----------------------------------------------------------------+-------+
|order_id                    |int                                                             |null   |
|order_date                  |string                                                          |null   |
|customer_id                 |int                                                             |null   |
|order_status                |string                                                          |null   |
|                            |                                                                |       |
|# Detailed Table Information|                                                                |       |
|Database                    |itv007008_retail                  

In [11]:
# Read the table through a SQL query and print its leading rows.
spark.sql("select * from orders_ext").show()

+--------+--------------------+-----------+---------------+
|order_id|          order_date|customer_id|   order_status|
+--------+--------------------+-----------+---------------+
|       1|2013-07-25 00:00:...|      11599|         CLOSED|
|       2|2013-07-25 00:00:...|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|      12111|       COMPLETE|
|       4|2013-07-25 00:00:...|       8827|         CLOSED|
|       5|2013-07-25 00:00:...|      11318|       COMPLETE|
|       6|2013-07-25 00:00:...|       7130|       COMPLETE|
|       7|2013-07-25 00:00:...|       4530|       COMPLETE|
|       8|2013-07-25 00:00:...|       2911|     PROCESSING|
|       9|2013-07-25 00:00:...|       5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:...|       5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:...|        918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:...|       1837|         CLOSED|
|      13|2013-07-25 00:00:...|       9149|PENDING_PAYMENT|
|      14|2013-07-25 00:00:...|       98

In [14]:
# Same table accessed through the DataFrame API instead of a SQL string.
spark.table("orders_ext").show()

+--------+--------------------+-----------+---------------+
|order_id|          order_date|customer_id|   order_status|
+--------+--------------------+-----------+---------------+
|       1|2013-07-25 00:00:...|      11599|         CLOSED|
|       2|2013-07-25 00:00:...|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|      12111|       COMPLETE|
|       4|2013-07-25 00:00:...|       8827|         CLOSED|
|       5|2013-07-25 00:00:...|      11318|       COMPLETE|
|       6|2013-07-25 00:00:...|       7130|       COMPLETE|
|       7|2013-07-25 00:00:...|       4530|       COMPLETE|
|       8|2013-07-25 00:00:...|       2911|     PROCESSING|
|       9|2013-07-25 00:00:...|       5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:...|       5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:...|        918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:...|       1837|         CLOSED|
|      13|2013-07-25 00:00:...|       9149|PENDING_PAYMENT|
|      14|2013-07-25 00:00:...|       98

In [15]:
# Single-column DataFrame named `id` holding 0..4 (the end value 5 is excluded).
spark.range(5)

id
0
1
2
3
4


In [17]:
# range(start, end, step): yields ids 0, 3, 6 - the end value 8 is excluded.
spark.range(0,8,3)

id
0
3
6


In [19]:
# Version string of the Spark runtime behind this session.
spark.version

'3.0.1'

In [4]:
# Three sample orders as (order_id, order_date, customer_id, order_status) tuples.
orders_list = [
    (1, '2013-07-25 00:00:00.0', 11599, 'CLOSED'),
    (2, '2013-07-25 00:00:00.0', 256, 'PENDING_PAYMENT'),
    (3, '2013-07-25 00:00:00.0', 12111, 'COMPLETE'),
]
# No schema is passed, so Spark assigns default column names (_1.._4) and infers the types.
orders_raw_df = spark.createDataFrame(orders_list)

In [5]:
# Columns appear under the default names _1.._4 because no schema was given.
orders_raw_df.show()

+---+--------------------+-----+---------------+
| _1|                  _2|   _3|             _4|
+---+--------------------+-----+---------------+
|  1|2013-07-25 00:00:...|11599|         CLOSED|
|  2|2013-07-25 00:00:...|  256|PENDING_PAYMENT|
|  3|2013-07-25 00:00:...|12111|       COMPLETE|
+---+--------------------+-----+---------------+



In [6]:
# Inferred types: the Python ints became long, the strs became string.
orders_raw_df.printSchema()

root
 |-- _1: long (nullable = true)
 |-- _2: string (nullable = true)
 |-- _3: long (nullable = true)
 |-- _4: string (nullable = true)



In [7]:
# Rebuild the DataFrame with an explicit DDL schema: it names the columns
# and pins customer_id to int instead of the inferred long.
orders_raw_df = spark.createDataFrame(
    orders_list,
    schema="order_id long,order_date string, customer_id int, order_status string",
)

In [8]:
# Columns now carry the names declared in the schema string.
orders_raw_df.show()

+--------+--------------------+-----------+---------------+
|order_id|          order_date|customer_id|   order_status|
+--------+--------------------+-----------+---------------+
|       1|2013-07-25 00:00:...|      11599|         CLOSED|
|       2|2013-07-25 00:00:...|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|      12111|       COMPLETE|
+--------+--------------------+-----------+---------------+



In [9]:
# customer_id is integer here because the schema string declared it as int.
orders_raw_df.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- order_date: string (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



In [10]:
### toDF transformation ###
# toDF() only renames the columns; the types are still inferred,
# so customer_id stays long rather than int.
orders_raw_df = (
    spark.createDataFrame(orders_list)
    .toDF('order_id', 'order_date', 'customer_id', 'order_status')
)

In [11]:
# Column names now come from toDF(); the data itself is unchanged.
orders_raw_df.show()

+--------+--------------------+-----------+---------------+
|order_id|          order_date|customer_id|   order_status|
+--------+--------------------+-----------+---------------+
|       1|2013-07-25 00:00:...|      11599|         CLOSED|
|       2|2013-07-25 00:00:...|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|      12111|       COMPLETE|
+--------+--------------------+-----------+---------------+



In [12]:
# Import only the function that is used: a wildcard import of pyspark.sql.functions
# would shadow Python built-ins such as sum, min, max and round in the notebook namespace.
from pyspark.sql.functions import to_timestamp

# Add order_date_new, a timestamp column parsed from the string column order_date;
# the original string column is kept alongside it.
new_df = orders_raw_df.withColumn("order_date_new", to_timestamp("order_date"))

In [13]:
# order_date_new is a timestamp; customer_id is long because this DataFrame's types were inferred.
new_df.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- order_date: string (nullable = true)
 |-- customer_id: long (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_date_new: timestamp (nullable = true)



In [14]:
# The parsed timestamp is shown next to the original string column.
new_df.show()

+--------+--------------------+-----------+---------------+-------------------+
|order_id|          order_date|customer_id|   order_status|     order_date_new|
+--------+--------------------+-----------+---------------+-------------------+
|       1|2013-07-25 00:00:...|      11599|         CLOSED|2013-07-25 00:00:00|
|       2|2013-07-25 00:00:...|        256|PENDING_PAYMENT|2013-07-25 00:00:00|
|       3|2013-07-25 00:00:...|      12111|       COMPLETE|2013-07-25 00:00:00|
+--------+--------------------+-----------+---------------+-------------------+



In [3]:
### Convert an RDD to a DataFrame ###
# Start from the raw text file: every RDD element is one line of the orders file.
orders_rdd = spark.sparkContext.textFile(
    "/public/trendytech/retail_db/orders/part-00000"
)

In [4]:
# Peek at the first five raw lines: comma-separated order_id, order_date, customer_id, order_status.
orders_rdd.take(5)

['1,2013-07-25 00:00:00.0,11599,CLOSED',
 '2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT',
 '3,2013-07-25 00:00:00.0,12111,COMPLETE',
 '4,2013-07-25 00:00:00.0,8827,CLOSED',
 '5,2013-07-25 00:00:00.0,11318,COMPLETE']

In [5]:
def _parse_order_line(line):
    """Turn one comma-separated order line into a typed 4-tuple.

    The line is split on commas once (rather than once per field). The first
    and third fields (order_id, customer_id) are converted to int; the second
    and fourth (order_date, order_status) are kept as strings.

    Raises IndexError if the line has fewer than four fields and ValueError
    if an id field is not a valid integer.
    """
    fields = line.split(",")
    return (int(fields[0]), fields[1], int(fields[2]), fields[3])


# Parse every raw line into an (order_id, order_date, customer_id, order_status) tuple.
new_orders_rdd = orders_rdd.map(_parse_order_line)

In [6]:
# First five parsed records: the ids are now ints, the other two fields remain strings.
new_orders_rdd.take(5)

[(1, '2013-07-25 00:00:00.0', 11599, 'CLOSED'),
 (2, '2013-07-25 00:00:00.0', 256, 'PENDING_PAYMENT'),
 (3, '2013-07-25 00:00:00.0', 12111, 'COMPLETE'),
 (4, '2013-07-25 00:00:00.0', 8827, 'CLOSED'),
 (5, '2013-07-25 00:00:00.0', 11318, 'COMPLETE')]

In [7]:
# Build a DataFrame from the parsed RDD, naming and typing the columns with a DDL schema string.
new_df = spark.createDataFrame(
    new_orders_rdd,
    schema="order_id long, order_date string, customer_id int, order_status string",
)

In [8]:
# Print the leading rows of the DataFrame built from the RDD.
new_df.show()

+--------+--------------------+-----------+---------------+
|order_id|          order_date|customer_id|   order_status|
+--------+--------------------+-----------+---------------+
|       1|2013-07-25 00:00:...|      11599|         CLOSED|
|       2|2013-07-25 00:00:...|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|      12111|       COMPLETE|
|       4|2013-07-25 00:00:...|       8827|         CLOSED|
|       5|2013-07-25 00:00:...|      11318|       COMPLETE|
|       6|2013-07-25 00:00:...|       7130|       COMPLETE|
|       7|2013-07-25 00:00:...|       4530|       COMPLETE|
|       8|2013-07-25 00:00:...|       2911|     PROCESSING|
|       9|2013-07-25 00:00:...|       5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:...|       5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:...|        918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:...|       1837|         CLOSED|
|      13|2013-07-25 00:00:...|       9149|PENDING_PAYMENT|
|      14|2013-07-25 00:00:...|       98

In [9]:
# Alternative route: call toDF() directly on the RDD, passing the same DDL schema string.
todf_example = new_orders_rdd.toDF(
    "order_id long, order_date string, customer_id int, order_status string"
)

In [10]:
# Print the leading rows of the DataFrame produced by RDD.toDF().
todf_example.show()

+--------+--------------------+-----------+---------------+
|order_id|          order_date|customer_id|   order_status|
+--------+--------------------+-----------+---------------+
|       1|2013-07-25 00:00:...|      11599|         CLOSED|
|       2|2013-07-25 00:00:...|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|      12111|       COMPLETE|
|       4|2013-07-25 00:00:...|       8827|         CLOSED|
|       5|2013-07-25 00:00:...|      11318|       COMPLETE|
|       6|2013-07-25 00:00:...|       7130|       COMPLETE|
|       7|2013-07-25 00:00:...|       4530|       COMPLETE|
|       8|2013-07-25 00:00:...|       2911|     PROCESSING|
|       9|2013-07-25 00:00:...|       5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:...|       5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:...|        918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:...|       1837|         CLOSED|
|      13|2013-07-25 00:00:...|       9149|PENDING_PAYMENT|
|      14|2013-07-25 00:00:...|       98

In [11]:
# The schema follows the DDL string passed to toDF() (customer_id is integer).
todf_example.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- order_date: string (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



In [12]:
### Nested schema (DDL string) ###
# fullname is a struct column with two string fields, firstname and lastname.
ddlSchema = (
    "customer_id long, "
    "fullname struct<firstname:string,lastname:string>,"
    "city string"
)

In [13]:
# Load the nested customer JSON data, applying the DDL schema.
customer_df = spark.read.json(
    "/public/trendytech/datasets/customer_nested",
    schema=ddlSchema,
)

In [14]:
# The nested fullname struct is rendered as [firstname, lastname].
customer_df.show()

+-----------+----------------+---------+
|customer_id|        fullname|     city|
+-----------+----------------+---------+
|          2|    [ram, kumar]|hyderabad|
|          3|[vijay, shankar]|     pune|
|          1| [sumit, mittal]|bangalore|
+-----------+----------------+---------+



In [16]:
### Defining the schema via StructType / StructField ###
# Explicit imports (instead of a wildcard) make it clear where each type comes from
# and keep the notebook namespace small.
from pyspark.sql.types import LongType, StringType, StructField, StructType

# Programmatic equivalent of the nested schema: customer_id (long),
# fullname (struct of firstname/lastname strings) and city (string).
customer_schema_struct = StructType([
    StructField("customer_id", LongType()),
    StructField(
        "fullname",
        StructType([
            StructField("firstname", StringType()),
            StructField("lastname", StringType()),
        ]),
    ),
    StructField("city", StringType()),
])

In [19]:
# Reload the same JSON data, this time with the StructType-based schema.
customer_df = spark.read.json(
    "/public/trendytech/datasets/customer_nested",
    schema=customer_schema_struct,
)

In [20]:
# Print the rows loaded with the StructType schema; fullname is rendered as [firstname, lastname].
customer_df.show()

+-----------+----------------+---------+
|customer_id|        fullname|     city|
+-----------+----------------+---------+
|          2|    [ram, kumar]|hyderabad|
|          3|[vijay, shankar]|     pune|
|          1| [sumit, mittal]|bangalore|
+-----------+----------------+---------+



In [21]:
# fullname is reported as a struct with firstname and lastname string fields.
customer_df.printSchema()

root
 |-- customer_id: long (nullable = true)
 |-- fullname: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- city: string (nullable = true)

