In [1]:
from pyspark.sql import SparkSession
# Obtain the Spark session used by every cell below (application name: 'instance').
spark = (
    SparkSession.builder
    .appName('instance')
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/23 11:54:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Validate datasets for reading from files using Spark API

In [29]:
# Schema as a DDL-formatted string: one "name TYPE" pair per column.
schema = """
    order_id INT,
    order_date TIMESTAMP,
    order_customer_id INT,
    order_status STRING
"""

# Read every file under the orders folder with the explicit schema.
orders = (
    spark.read
    .schema(schema)
    .csv('/Users/adhoc/git/retail_db/orders/*')
)

# Preview the rows, then check the resulting column types in two ways.
orders.show()
orders.printSchema()
orders.dtypes

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|
|       6|2013-07-25 00:00:00|             7130|       COMPLETE|
|       7|2013-07-25 00:00:00|             4530|       COMPLETE|
|       8|2013-07-25 00:00:00|             2911|     PROCESSING|
|       9|2013-07-25 00:00:00|             5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:00|             5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:00|              918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:00|             1837|         CLOSED|
|      13|2013-07-25 00:0

[('order_id', 'int'),
 ('order_date', 'timestamp'),
 ('order_customer_id', 'int'),
 ('order_status', 'string')]

## Convert data to json and parquet using Spark API

In [64]:
username = 'adhoc'

# The source folder and both converted copies live under the same retail_db clone.
input_dir, output_parquet_dir, output_json_dir = (
    f'/Users/{username}/git/retail_db/{subdir}'
    for subdir in ('orders', 'parquet/orders', 'json/orders')
)
input_dir, output_parquet_dir, output_json_dir

('/Users/adhoc/git/retail_db/orders',
 '/Users/adhoc/git/retail_db/parquet/orders',
 '/Users/adhoc/git/retail_db/json/orders')

In [30]:
# Load the comma-separated orders with the explicit schema.
df = spark.read.schema(schema).csv(input_dir)
# One-off conversion, kept for reference: uncomment to (re)write the orders,
# coalesced to a single partition, as parquet and json under the output folders.
#df.coalesce(1).write.parquet(output_parquet_dir, mode='overwrite')
#df.coalesce(1).write.json(output_json_dir, mode='overwrite')

In [28]:
# Read back the parquet copy of the orders. No schema is passed: for parquet
# the column names and types come from the metadata stored with the files.
# Uses output_parquet_dir; `output_dir` is not defined yet at this point of a
# top-to-bottom run and later refers to the pipe-separated CSV folder.
orders = spark.read.parquet(output_parquet_dir)
orders.show()
orders.dtypes

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|
|       6|2013-07-25 00:00:00|             7130|       COMPLETE|
|       7|2013-07-25 00:00:00|             4530|       COMPLETE|
|       8|2013-07-25 00:00:00|             2911|     PROCESSING|
|       9|2013-07-25 00:00:00|             5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:00|             5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:00|              918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:00|             1837|         CLOSED|
|      13|2013-07-25 00:0

[('order_id', 'int'),
 ('order_date', 'timestamp'),
 ('order_customer_id', 'int'),
 ('order_status', 'string')]

## Convert comma separated files to pipe separated files using Spark API

In [29]:
input_dir = '/Users/adhoc/git/retail_db/orders'
output_dir = '/Users/adhoc/git/retail_db/csv/pipe/orders'

# Load the comma-separated orders with the explicit schema ...
df = (
    spark.read
    .schema(schema)
    .csv(input_dir)
)

# ... and write them back, coalesced to one partition, using '|' as separator.
(
    df.coalesce(1)
    .write
    .mode('overwrite')
    .csv(output_dir, sep='|')
)

In [31]:
# Reading without sep='|' falls back to the ',' separator and the columns are
# imported as NULLs:
#spark.read.schema(schema).csv(output_dir).show()
# With the matching separator the pipe-separated files parse correctly.
spark.read.schema(schema).csv(output_dir, sep='|').show()

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|
|       6|2013-07-25 00:00:00|             7130|       COMPLETE|
|       7|2013-07-25 00:00:00|             4530|       COMPLETE|
|       8|2013-07-25 00:00:00|             2911|     PROCESSING|
|       9|2013-07-25 00:00:00|             5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:00|             5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:00|              918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:00|             1837|         CLOSED|
|      13|2013-07-25 00:0

## Overview of Reading Data Files into Spark Data Frames
* reading files using direct APIs such as `json`, `csv` under `spark.read`
* reading files using `format` and `load` under `spark.read`
* specifying options as keyword arguments as well as through functions such as `option` and `options`
* supported file formats: `csv`, `text`, `json`, `parquet`, `orc`; for `avro` and `xml` additional modules are required
* for certification the important formats are `csv`, `json`, `parquet`
* reading compressed files  

## Steps to follow to read data from files into Spark Data Frame
* Check if the files are compressed (`gz`, `bzip2`, `snappy`, etc.) - `gz` and `snappy` are the most common ones
* Understand the file format (`txt`, `json`, `avro`, `parquet`, `orc`, etc.) - sometimes file extension is not present
* For `text` or `csv` determine the right separator
* Use appropriate API under `spark.read`

In [3]:
input_dir = '/Users/adhoc/git/retail_db/orders'
# Show the raw lines untruncated (one `value` column per line) to inspect the
# separator and layout before choosing a reader.
spark.read.text(input_dir).show(truncate=False)

+---------------------------------------------+
|value                                        |
+---------------------------------------------+
|1,2013-07-25 00:00:00.0,11599,CLOSED         |
|2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT  |
|3,2013-07-25 00:00:00.0,12111,COMPLETE       |
|4,2013-07-25 00:00:00.0,8827,CLOSED          |
|5,2013-07-25 00:00:00.0,11318,COMPLETE       |
|6,2013-07-25 00:00:00.0,7130,COMPLETE        |
|7,2013-07-25 00:00:00.0,4530,COMPLETE        |
|8,2013-07-25 00:00:00.0,2911,PROCESSING      |
|9,2013-07-25 00:00:00.0,5657,PENDING_PAYMENT |
|10,2013-07-25 00:00:00.0,5648,PENDING_PAYMENT|
|11,2013-07-25 00:00:00.0,918,PAYMENT_REVIEW  |
|12,2013-07-25 00:00:00.0,1837,CLOSED         |
|13,2013-07-25 00:00:00.0,9149,PENDING_PAYMENT|
|14,2013-07-25 00:00:00.0,9842,PROCESSING     |
|15,2013-07-25 00:00:00.0,2568,COMPLETE       |
|16,2013-07-25 00:00:00.0,7276,PENDING_PAYMENT|
|17,2013-07-25 00:00:00.0,2667,COMPLETE       |
|18,2013-07-25 00:00:00.0,1205,CLOSED   

## Reading data from CSV files into Spark Data Frame
* There are multiple approaches which can be used
* Approach 1: spark.read.csv(`path_to_folder`)
* Approach 2: spark.read.format('csv').load(`path_to_file`)
* The schema can be specified explicitly using a `string` or a `StructType`
* The delimiter (separator) can be set explicitly; the default one is `,`
* If the files have a header, the Data Frame with schema can be created using `header` and `inferSchema` - it will pick up column names from the header while the data types will be inferred based on the data
* If the files do not have a header, the column names can be provided using `toDF` and the data types inferred using the `inferSchema` option

In [6]:
input_dir = '/Users/adhoc/git/retail_db/orders'
# No schema and no header: the columns get generated names (_c0, _c1, ...)
# and every column is typed as string, as the dtypes output shows.
spark.read.csv(input_dir).show()
spark.read.csv(input_dir).dtypes

+---+--------------------+-----+---------------+
|_c0|                 _c1|  _c2|            _c3|
+---+--------------------+-----+---------------+
|  1|2013-07-25 00:00:...|11599|         CLOSED|
|  2|2013-07-25 00:00:...|  256|PENDING_PAYMENT|
|  3|2013-07-25 00:00:...|12111|       COMPLETE|
|  4|2013-07-25 00:00:...| 8827|         CLOSED|
|  5|2013-07-25 00:00:...|11318|       COMPLETE|
|  6|2013-07-25 00:00:...| 7130|       COMPLETE|
|  7|2013-07-25 00:00:...| 4530|       COMPLETE|
|  8|2013-07-25 00:00:...| 2911|     PROCESSING|
|  9|2013-07-25 00:00:...| 5657|PENDING_PAYMENT|
| 10|2013-07-25 00:00:...| 5648|PENDING_PAYMENT|
| 11|2013-07-25 00:00:...|  918| PAYMENT_REVIEW|
| 12|2013-07-25 00:00:...| 1837|         CLOSED|
| 13|2013-07-25 00:00:...| 9149|PENDING_PAYMENT|
| 14|2013-07-25 00:00:...| 9842|     PROCESSING|
| 15|2013-07-25 00:00:...| 2568|       COMPLETE|
| 16|2013-07-25 00:00:...| 7276|PENDING_PAYMENT|
| 17|2013-07-25 00:00:...| 2667|       COMPLETE|
| 18|2013-07-25 00:0

[('_c0', 'string'), ('_c1', 'string'), ('_c2', 'string'), ('_c3', 'string')]

## Specifying schema while reading csv files into Spark Data Frame

In [10]:
input_dir = '/Users/adhoc/git/retail_db/orders'
# Schema as a DDL-formatted string: one "name TYPE" pair per column.
schema = """
    order_id INT,
    order_date TIMESTAMP,
    order_customer_id INT,
    order_status STRING
"""
# Three ways of handing the same schema to the csv reader:
# 1) schema() on the reader, 2) `schema` keyword of csv(), 3) `schema` keyword of load().
spark.read.schema(schema).csv(input_dir).show()
spark.read.csv(input_dir, schema=schema).show()
spark.read.format('csv').load(input_dir, schema=schema).show()
# Confirm the column types follow the schema string.
spark.read.schema(schema).csv(input_dir).dtypes

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|
|       6|2013-07-25 00:00:00|             7130|       COMPLETE|
|       7|2013-07-25 00:00:00|             4530|       COMPLETE|
|       8|2013-07-25 00:00:00|             2911|     PROCESSING|
|       9|2013-07-25 00:00:00|             5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:00|             5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:00|              918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:00|             1837|         CLOSED|
|      13|2013-07-25 00:0

[('order_id', 'int'),
 ('order_date', 'timestamp'),
 ('order_customer_id', 'int'),
 ('order_status', 'string')]

In [15]:
from pyspark.sql.types import (
    IntegerType,
    StringType,
    StructField,
    StructType,
    TimestampType,
)

# The same four-column layout, this time built from typed StructField objects.
schema = StructType([
    StructField(column_name, column_type)
    for column_name, column_type in (
        ('order_id', IntegerType()),
        ('order_date', TimestampType()),
        ('order_customer_id', IntegerType()),
        ('order_status', StringType()),
    )
])

# A StructType is accepted both as the `schema` keyword and through schema().
spark.read.csv(
    input_dir,
    schema=schema,
).show()
(
    spark.read
    .schema(schema)
    .csv(input_dir)
    .show()
)

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|
|       6|2013-07-25 00:00:00|             7130|       COMPLETE|
|       7|2013-07-25 00:00:00|             4530|       COMPLETE|
|       8|2013-07-25 00:00:00|             2911|     PROCESSING|
|       9|2013-07-25 00:00:00|             5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:00|             5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:00|              918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:00|             1837|         CLOSED|
|      13|2013-07-25 00:0

## Using `csv`, `toDF` and `inferSchema` to create Spark Data Frame

In [25]:
input_dir = '/Users/adhoc/git/retail_db/orders'
columns = ['order_id', 'order_date', 'order_customer_id', 'order_status']

# The files have no header, so the column names are supplied with toDF and the
# types are inferred. Inference reads the data when the DataFrame is created,
# so build it once and reuse it for both the preview and the dtypes check.
orders_inferred = (
    spark.read
    .option('inferSchema', True)
    .csv(input_dir)
    .toDF(*columns)
)
orders_inferred.show(truncate=False)
orders_inferred.dtypes

+--------+-------------------+-----------------+---------------+
|order_id|order_date         |order_customer_id|order_status   |
+--------+-------------------+-----------------+---------------+
|1       |2013-07-25 00:00:00|11599            |CLOSED         |
|2       |2013-07-25 00:00:00|256              |PENDING_PAYMENT|
|3       |2013-07-25 00:00:00|12111            |COMPLETE       |
|4       |2013-07-25 00:00:00|8827             |CLOSED         |
|5       |2013-07-25 00:00:00|11318            |COMPLETE       |
|6       |2013-07-25 00:00:00|7130             |COMPLETE       |
|7       |2013-07-25 00:00:00|4530             |COMPLETE       |
|8       |2013-07-25 00:00:00|2911             |PROCESSING     |
|9       |2013-07-25 00:00:00|5657             |PENDING_PAYMENT|
|10      |2013-07-25 00:00:00|5648             |PENDING_PAYMENT|
|11      |2013-07-25 00:00:00|918              |PAYMENT_REVIEW |
|12      |2013-07-25 00:00:00|1837             |CLOSED         |
|13      |2013-07-25 00:0

In [26]:
# Same result with inferSchema passed as a keyword argument of csv().
# Inference reads the data when the DataFrame is created, so build it once
# and reuse it for both the preview and the dtypes check.
orders_inferred = spark.read.csv(input_dir, inferSchema=True).toDF(*columns)
orders_inferred.show(truncate=False)
orders_inferred.dtypes

+--------+-------------------+-----------------+---------------+
|order_id|order_date         |order_customer_id|order_status   |
+--------+-------------------+-----------------+---------------+
|1       |2013-07-25 00:00:00|11599            |CLOSED         |
|2       |2013-07-25 00:00:00|256              |PENDING_PAYMENT|
|3       |2013-07-25 00:00:00|12111            |COMPLETE       |
|4       |2013-07-25 00:00:00|8827             |CLOSED         |
|5       |2013-07-25 00:00:00|11318            |COMPLETE       |
|6       |2013-07-25 00:00:00|7130             |COMPLETE       |
|7       |2013-07-25 00:00:00|4530             |COMPLETE       |
|8       |2013-07-25 00:00:00|2911             |PROCESSING     |
|9       |2013-07-25 00:00:00|5657             |PENDING_PAYMENT|
|10      |2013-07-25 00:00:00|5648             |PENDING_PAYMENT|
|11      |2013-07-25 00:00:00|918              |PAYMENT_REVIEW |
|12      |2013-07-25 00:00:00|1837             |CLOSED         |
|13      |2013-07-25 00:0

[('order_id', 'int'),
 ('order_date', 'timestamp'),
 ('order_customer_id', 'int'),
 ('order_status', 'string')]

## Specifying delimiter while creating Spark Data Frame using csv API

In [51]:
input_dir = '/Users/adhoc/git/retail_db/csv/pipe/orders'
schema = """
    order_id INT,
    order_date TIMESTAMP,
    order_customer_id INT,
    order_status STRING
"""

# Schema and delimiter both passed as keyword arguments of the csv API.
spark.read.csv(
    input_dir,
    schema=schema,
    sep='|',
).show()
spark.read.csv(
    input_dir,
    schema=schema,
    sep='|',
).dtypes

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|
|       6|2013-07-25 00:00:00|             7130|       COMPLETE|
|       7|2013-07-25 00:00:00|             4530|       COMPLETE|
|       8|2013-07-25 00:00:00|             2911|     PROCESSING|
|       9|2013-07-25 00:00:00|             5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:00|             5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:00|              918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:00|             1837|         CLOSED|
|      13|2013-07-25 00:0

[('order_id', 'int'),
 ('order_date', 'timestamp'),
 ('order_customer_id', 'int'),
 ('order_status', 'string')]

In [52]:
# Schema set through schema() on the reader; delimiter passed as the `sep` keyword of csv().
spark.read.schema(schema).csv(input_dir, sep='|').show()
spark.read.schema(schema).csv(input_dir, sep='|').dtypes

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|
|       6|2013-07-25 00:00:00|             7130|       COMPLETE|
|       7|2013-07-25 00:00:00|             4530|       COMPLETE|
|       8|2013-07-25 00:00:00|             2911|     PROCESSING|
|       9|2013-07-25 00:00:00|             5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:00|             5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:00|              918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:00|             1837|         CLOSED|
|      13|2013-07-25 00:0

[('order_id', 'int'),
 ('order_date', 'timestamp'),
 ('order_customer_id', 'int'),
 ('order_status', 'string')]

In [53]:
# Schema set through schema(); delimiter set through option('sep', ...) on the reader.
spark.read.schema(schema).option('sep', '|').csv(input_dir).show()
spark.read.schema(schema).option('sep', '|').csv(input_dir).dtypes

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|
|       6|2013-07-25 00:00:00|             7130|       COMPLETE|
|       7|2013-07-25 00:00:00|             4530|       COMPLETE|
|       8|2013-07-25 00:00:00|             2911|     PROCESSING|
|       9|2013-07-25 00:00:00|             5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:00|             5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:00|              918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:00|             1837|         CLOSED|
|      13|2013-07-25 00:0

[('order_id', 'int'),
 ('order_date', 'timestamp'),
 ('order_customer_id', 'int'),
 ('order_status', 'string')]

## Using options while reading CSV files into Spark Data Frame
While creating Spark Data Frame the options can be passed using:
* keyword arguments as part of APIs - as part of `load` or direct API (e.g. `csv`)
* `spark.read.option`
* `spark.read.options`

If a key passed to `option` is incorrect, it will be ignored.
The available options depend on the file format.

In [50]:
input_dir = '/Users/adhoc/git/retail_db/csv/pipe/orders'

# The separator defaults to ',', so each pipe-separated line is loaded whole
# into a single column (_c0) instead of being split into fields.
spark.read.csv(input_dir).show(truncate=False)

+-----------------------------------------------------+
|_c0                                                  |
+-----------------------------------------------------+
|1|2013-07-25T00:00:00.000+02:00|11599|CLOSED         |
|2|2013-07-25T00:00:00.000+02:00|256|PENDING_PAYMENT  |
|3|2013-07-25T00:00:00.000+02:00|12111|COMPLETE       |
|4|2013-07-25T00:00:00.000+02:00|8827|CLOSED          |
|5|2013-07-25T00:00:00.000+02:00|11318|COMPLETE       |
|6|2013-07-25T00:00:00.000+02:00|7130|COMPLETE        |
|7|2013-07-25T00:00:00.000+02:00|4530|COMPLETE        |
|8|2013-07-25T00:00:00.000+02:00|2911|PROCESSING      |
|9|2013-07-25T00:00:00.000+02:00|5657|PENDING_PAYMENT |
|10|2013-07-25T00:00:00.000+02:00|5648|PENDING_PAYMENT|
|11|2013-07-25T00:00:00.000+02:00|918|PAYMENT_REVIEW  |
|12|2013-07-25T00:00:00.000+02:00|1837|CLOSED         |
|13|2013-07-25T00:00:00.000+02:00|9149|PENDING_PAYMENT|
|14|2013-07-25T00:00:00.000+02:00|9842|PROCESSING     |
|15|2013-07-25T00:00:00.000+02:00|2568|COMPLETE 

In [6]:
# IPython help lookup: prints the signature of the csv reader.
# Commonly used keyword arguments:
# schema, sep, quote, header, mode (to deal with corrupted records),
# inferSchema, ignoring spaces, null values, multiLine, etc.
spark.read.csv?

[0;31mSignature:[0m
[0mspark[0m[0;34m.[0m[0mread[0m[0;34m.[0m[0mcsv[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mpath[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mschema[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mpyspark[0m[0;34m.[0m[0msql[0m[0;34m.[0m[0mtypes[0m[0;34m.[0m[0mStructType[0m[0;34m,[0m [0mstr[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msep[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mstr[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mencoding[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mstr[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mquote[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mstr[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m

In [42]:
# Options passed as keyword arguments of the direct csv API.
orders = (
    spark.read
    .csv(input_dir, sep='|', header=None, inferSchema=True)
    .toDF('order_id', 'order_date', 'order_customer_id', 'order_status')
)
orders.show()

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|
|       6|2013-07-25 00:00:00|             7130|       COMPLETE|
|       7|2013-07-25 00:00:00|             4530|       COMPLETE|
|       8|2013-07-25 00:00:00|             2911|     PROCESSING|
|       9|2013-07-25 00:00:00|             5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:00|             5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:00|              918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:00|             1837|         CLOSED|
|      13|2013-07-25 00:0

In [12]:
# Options passed as keyword arguments of load() after selecting the csv format.
orders = (
    spark.read
    .format('csv')
    .load(input_dir, sep='|', header=None, inferSchema=True)
    .toDF('order_id', 'order_date', 'order_customer_id', 'order_status')
)
orders.show()

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|
|       6|2013-07-25 00:00:00|             7130|       COMPLETE|
|       7|2013-07-25 00:00:00|             4530|       COMPLETE|
|       8|2013-07-25 00:00:00|             2911|     PROCESSING|
|       9|2013-07-25 00:00:00|             5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:00|             5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:00|              918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:00|             1837|         CLOSED|
|      13|2013-07-25 00:0

In [18]:
# Options set one at a time through option() before loading.
orders = (
    spark.read
    .option('sep', '|')
    .option('header', None)
    .option('inferSchema', True)
    .format('csv')
    .load(input_dir)
    .toDF('order_id', 'order_date', 'order_customer_id', 'order_status')
)
orders.show()
orders.dtypes

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|
|       6|2013-07-25 00:00:00|             7130|       COMPLETE|
|       7|2013-07-25 00:00:00|             4530|       COMPLETE|
|       8|2013-07-25 00:00:00|             2911|     PROCESSING|
|       9|2013-07-25 00:00:00|             5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:00|             5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:00|              918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:00|             1837|         CLOSED|
|      13|2013-07-25 00:0

[('order_id', 'int'),
 ('order_date', 'timestamp'),
 ('order_customer_id', 'int'),
 ('order_status', 'string')]

In [21]:
# All options set in a single options() call using keyword arguments.
orders = (
    spark.read
    .options(sep='|', header=None, inferSchema=True)
    .format('csv')
    .load(input_dir)
    .toDF('order_id', 'order_date', 'order_customer_id', 'order_status')
)
orders.show()

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|
|       6|2013-07-25 00:00:00|             7130|       COMPLETE|
|       7|2013-07-25 00:00:00|             4530|       COMPLETE|
|       8|2013-07-25 00:00:00|             2911|     PROCESSING|
|       9|2013-07-25 00:00:00|             5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:00|             5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:00|              918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:00|             1837|         CLOSED|
|      13|2013-07-25 00:0

In [51]:
# Keep the reader options in a dict and unpack it into options() as kwargs.
options = dict(sep='|', header=None, inferSchema=True)

orders = (
    spark.read
    .options(**options)
    .format('csv')
    .load(input_dir)
    .toDF('order_id', 'order_date', 'order_customer_id', 'order_status')
)
orders.show()

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|
|       6|2013-07-25 00:00:00|             7130|       COMPLETE|
|       7|2013-07-25 00:00:00|             4530|       COMPLETE|
|       8|2013-07-25 00:00:00|             2911|     PROCESSING|
|       9|2013-07-25 00:00:00|             5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:00|             5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:00|              918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:00|             1837|         CLOSED|
|      13|2013-07-25 00:0

In [52]:
# Same options dict; the format is given as a keyword of load() instead of format().
orders = (
    spark.read
    .options(**options)
    .load(input_dir, format='csv')
    .toDF('order_id', 'order_date', 'order_customer_id', 'order_status')
)
orders.show()

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|
|       6|2013-07-25 00:00:00|             7130|       COMPLETE|
|       7|2013-07-25 00:00:00|             4530|       COMPLETE|
|       8|2013-07-25 00:00:00|             2911|     PROCESSING|
|       9|2013-07-25 00:00:00|             5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:00|             5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:00|              918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:00|             1837|         CLOSED|
|      13|2013-07-25 00:0

## Reading json files into Spark Data Frames

In [53]:
input_dir = '/Users/adhoc/git/retail_db/json/orders'
# No schema supplied: column names and types are inferred from the JSON data
# (the dtypes show the ids as bigint and order_date left as string).
df = spark.read.json(input_dir)
df.show()
df.dtypes

+-----------------+--------------------+--------+---------------+
|order_customer_id|          order_date|order_id|   order_status|
+-----------------+--------------------+--------+---------------+
|            11599|2013-07-25T00:00:...|       1|         CLOSED|
|              256|2013-07-25T00:00:...|       2|PENDING_PAYMENT|
|            12111|2013-07-25T00:00:...|       3|       COMPLETE|
|             8827|2013-07-25T00:00:...|       4|         CLOSED|
|            11318|2013-07-25T00:00:...|       5|       COMPLETE|
|             7130|2013-07-25T00:00:...|       6|       COMPLETE|
|             4530|2013-07-25T00:00:...|       7|       COMPLETE|
|             2911|2013-07-25T00:00:...|       8|     PROCESSING|
|             5657|2013-07-25T00:00:...|       9|PENDING_PAYMENT|
|             5648|2013-07-25T00:00:...|      10|PENDING_PAYMENT|
|              918|2013-07-25T00:00:...|      11| PAYMENT_REVIEW|
|             1837|2013-07-25T00:00:...|      12|         CLOSED|
|         

[('order_customer_id', 'bigint'),
 ('order_date', 'string'),
 ('order_id', 'bigint'),
 ('order_status', 'string')]

In [46]:
# The same JSON read expressed through the generic format()/load() API.
df = spark.read.format('json').load(input_dir)
df.show()

+-----------------+--------------------+--------+---------------+
|order_customer_id|          order_date|order_id|   order_status|
+-----------------+--------------------+--------+---------------+
|            11599|2013-07-25T00:00:...|       1|         CLOSED|
|              256|2013-07-25T00:00:...|       2|PENDING_PAYMENT|
|            12111|2013-07-25T00:00:...|       3|       COMPLETE|
|             8827|2013-07-25T00:00:...|       4|         CLOSED|
|            11318|2013-07-25T00:00:...|       5|       COMPLETE|
|             7130|2013-07-25T00:00:...|       6|       COMPLETE|
|             4530|2013-07-25T00:00:...|       7|       COMPLETE|
|             2911|2013-07-25T00:00:...|       8|     PROCESSING|
|             5657|2013-07-25T00:00:...|       9|PENDING_PAYMENT|
|             5648|2013-07-25T00:00:...|      10|PENDING_PAYMENT|
|              918|2013-07-25T00:00:...|      11| PAYMENT_REVIEW|
|             1837|2013-07-25T00:00:...|      12|         CLOSED|
|         

In [47]:
# List the files that back this DataFrame.
df.inputFiles()

['file:///Users/adhoc/git/retail_db/json/orders/part-00000-72a3482e-2063-43c8-9677-b7913ee1571e-c000.json']

## Specifying schema while reading json files into Spark Data Frame

In [58]:
input_dir = '/Users/adhoc/git/retail_db/json/orders'
schema = """
    order_id INT,
    order_date TIMESTAMP,
    order_customer_id INT,
    order_status STRING
"""

# Explicit DDL schema applied through schema() before reading the JSON files.
df = (
    spark.read
    .schema(schema)
    .json(input_dir)
)
df.show()
df.dtypes

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|
|       6|2013-07-25 00:00:00|             7130|       COMPLETE|
|       7|2013-07-25 00:00:00|             4530|       COMPLETE|
|       8|2013-07-25 00:00:00|             2911|     PROCESSING|
|       9|2013-07-25 00:00:00|             5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:00|             5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:00|              918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:00|             1837|         CLOSED|
|      13|2013-07-25 00:0

[('order_id', 'int'),
 ('order_date', 'timestamp'),
 ('order_customer_id', 'int'),
 ('order_status', 'string')]

In [60]:
# Schema passed as the `schema` keyword of json() instead of through schema().
df = spark.read.json(input_dir, schema=schema)
df.show()

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|
|       6|2013-07-25 00:00:00|             7130|       COMPLETE|
|       7|2013-07-25 00:00:00|             4530|       COMPLETE|
|       8|2013-07-25 00:00:00|             2911|     PROCESSING|
|       9|2013-07-25 00:00:00|             5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:00|             5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:00|              918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:00|             1837|         CLOSED|
|      13|2013-07-25 00:0

In [63]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType

# Programmatic schema: one StructField per (column name, data type) pair.
schema = StructType([
    StructField(column, data_type)
    for column, data_type in (
        ('order_id', IntegerType()),
        ('order_date', TimestampType()),
        ('order_customer_id', IntegerType()),
        ('order_status', StringType()),
    )
])

# Preview the rows, then report the column types obtained with the StructType schema.
spark.read.schema(schema).json(input_dir).show()
spark.read.schema(schema).json(input_dir).dtypes

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|
|       6|2013-07-25 00:00:00|             7130|       COMPLETE|
|       7|2013-07-25 00:00:00|             4530|       COMPLETE|
|       8|2013-07-25 00:00:00|             2911|     PROCESSING|
|       9|2013-07-25 00:00:00|             5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:00|             5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:00|              918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:00|             1837|         CLOSED|
|      13|2013-07-25 00:0

[('order_id', 'int'),
 ('order_date', 'timestamp'),
 ('order_customer_id', 'int'),
 ('order_status', 'string')]

## Side effects of inferring schema while creating Spark Data Frame
* `inferSchema` is applied to the entire data set
* if the data volume is huge, the time spent determining the data types can be significant
* when the schema is specified explicitly, the data is not read to determine the data types while creating the Spark Data Frame
* the schema can be explicitly specified as a `String` or as a `StructType`
* schema inference is used by default for `json`, `parquet`, and `orc`. For Parquet and ORC, column names and data types are inferred using the metadata associated with the files
* inferring the schema for CSV creates Data Frames with system-generated column names. If `inferSchema` is used, the column types are determined as well. If the files contain a header, the column names can be derived from it; if not, the column names can be provided using the `toDF` function.

## Reading parquet files into Spark Data Frame

In [70]:
# Directory holding the Parquet copy of the orders data.
input_dir = '/Users/adhoc/git/retail_db/parquet/orders'

# No schema is passed here: the reader works out column names and types itself.
df = spark.read.parquet(input_dir)
df.show()

# Show which files back the Data Frame together with the resulting column types.
(df.inputFiles(), df.dtypes)

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|
|       6|2013-07-25 00:00:00|             7130|       COMPLETE|
|       7|2013-07-25 00:00:00|             4530|       COMPLETE|
|       8|2013-07-25 00:00:00|             2911|     PROCESSING|
|       9|2013-07-25 00:00:00|             5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:00|             5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:00|              918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:00|             1837|         CLOSED|
|      13|2013-07-25 00:0

(['file:///Users/adhoc/git/retail_db/parquet/orders/part-00000-fd4e1ff5-59c5-448c-a014-abcb999bf248-c000.snappy.parquet'],
 [('order_id', 'int'),
  ('order_date', 'timestamp'),
  ('order_customer_id', 'int'),
  ('order_status', 'string')])

In [71]:
# Same read as spark.read.parquet(input_dir), expressed through the generic
# format(...).load(...) reader API.
(
    spark.read
    .format('parquet')
    .load(input_dir)
    .show()
)

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|
|       6|2013-07-25 00:00:00|             7130|       COMPLETE|
|       7|2013-07-25 00:00:00|             4530|       COMPLETE|
|       8|2013-07-25 00:00:00|             2911|     PROCESSING|
|       9|2013-07-25 00:00:00|             5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:00|             5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:00|              918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:00|             1837|         CLOSED|
|      13|2013-07-25 00:0

## Explicitly specifying schema while reading parquet files into Spark Data Frame 

In [74]:
# Directory holding the Parquet copy of the orders data.
input_dir = '/Users/adhoc/git/retail_db/parquet/orders'

# Schema expressed as a DDL-formatted string: "<column> <TYPE>" pairs.
schema = """
    order_id INT,
    order_date TIMESTAMP,
    order_customer_id INT,
    order_status STRING
"""

# Attach the schema with .schema() before calling parquet(), then preview the rows.
(
    spark.read
    .schema(schema)
    .parquet(input_dir)
    .show()
)

# Column types produced by the explicit schema.
(
    spark.read
    .schema(schema)
    .parquet(input_dir)
    .dtypes
)

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|
|       6|2013-07-25 00:00:00|             7130|       COMPLETE|
|       7|2013-07-25 00:00:00|             4530|       COMPLETE|
|       8|2013-07-25 00:00:00|             2911|     PROCESSING|
|       9|2013-07-25 00:00:00|             5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:00|             5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:00|              918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:00|             1837|         CLOSED|
|      13|2013-07-25 00:0

[('order_id', 'int'),
 ('order_date', 'timestamp'),
 ('order_customer_id', 'int'),
 ('order_status', 'string')]

In [76]:
# NOTE: DataFrameReader.parquet(*paths, **options) only forwards its known
# Parquet options (mergeSchema, pathGlobFilter, ...). A `schema=` keyword passed
# to parquet() is silently dropped, so the schema would still be taken from the
# Parquet file metadata. DataFrameReader.load() accepts `schema` as a real
# parameter, so this keyword-argument form actually applies the explicit schema.
orders_parquet_kw = spark.read.load(input_dir, format='parquet', schema=schema)

# Build the Data Frame once and reuse it for both the preview and the type listing.
orders_parquet_kw.show()
orders_parquet_kw.dtypes

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|
|       6|2013-07-25 00:00:00|             7130|       COMPLETE|
|       7|2013-07-25 00:00:00|             4530|       COMPLETE|
|       8|2013-07-25 00:00:00|             2911|     PROCESSING|
|       9|2013-07-25 00:00:00|             5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:00|             5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:00|              918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:00|             1837|         CLOSED|
|      13|2013-07-25 00:0

[('order_id', 'int'),
 ('order_date', 'timestamp'),
 ('order_customer_id', 'int'),
 ('order_status', 'string')]

In [78]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType

# Programmatic (StructType) schema for the orders data.
schema = StructType([
    StructField('order_id', IntegerType()),
    StructField('order_date', TimestampType()),
    StructField('order_customer_id', IntegerType()),
    StructField('order_status', StringType()),
])

# NOTE: a `schema=` keyword passed to DataFrameReader.parquet() is silently
# ignored (parquet() only forwards its known Parquet options), so the StructType
# must be attached with .schema() for it to take effect.
orders_parquet_struct = spark.read.schema(schema).parquet(input_dir)

# Build the Data Frame once and reuse it for both the preview and the type listing.
orders_parquet_struct.show()
orders_parquet_struct.dtypes

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|
|       6|2013-07-25 00:00:00|             7130|       COMPLETE|
|       7|2013-07-25 00:00:00|             4530|       COMPLETE|
|       8|2013-07-25 00:00:00|             2911|     PROCESSING|
|       9|2013-07-25 00:00:00|             5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:00|             5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:00|              918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:00|             1837|         CLOSED|
|      13|2013-07-25 00:0

[('order_id', 'int'),
 ('order_date', 'timestamp'),
 ('order_customer_id', 'int'),
 ('order_status', 'string')]