# Part 1: DataFrame

In [1]:
import os
from pyspark.sql import SparkSession, SQLContext
from pyspark import SparkContext, SparkConf

In [2]:
from pyspark.sql.functions import explode
from pyspark.sql.functions import countDistinct, avg, stddev
from pyspark.sql.functions import dayofmonth, dayofyear, year, month, hour, weekofyear, date_format
from pyspark.sql.functions import col as func_col

In [3]:
# Name under which this application is registered with Spark.
app_name = 'dataFrame'
# getOrCreate() hands back the existing session if one is already running,
# so `spark` behaves as a singleton.
spark = (
    SparkSession.builder
    .appName(app_name)
    .getOrCreate()
)

## Datasets

In [4]:
# Build the dataset paths with os.path.join so the separators are right on every
# OS. The previous literals hard-coded Windows backslashes, and the CSV one only
# worked because Python keeps unrecognised escapes such as '\p', '\d' and '\A'
# verbatim (while warning about an invalid escape sequence).
data_dir = os.path.join('..', 'pyspark-training', 'data')
json_file_path = os.path.join(data_dir, 'Employees.json')
csv_file_path = os.path.join(data_dir, 'AppleStore.csv')

In [6]:
# Load the employee records from the JSON file into a DataFrame.
json_df = spark.read.format('json').load(json_file_path)

In [7]:
# Print the first five rows as a text table.
json_df.show(n=5, truncate=True)

+---+-------+------+
| ID|   Name|Salary|
+---+-------+------+
|  1|   John| 20000|
|  2|  Rohit| 15000|
|  3|  Parth| 14600|
|  4|Rishabh| 20500|
|  5|  Daisy| 34000|
+---+-------+------+
only showing top 5 rows



In [8]:
# Unlike show(), head(n) hands the first n records back as a list of Row objects.
json_df.head(n=5)

[Row(ID=1, Name='John', Salary=20000),
 Row(ID=2, Name='Rohit', Salary=15000),
 Row(ID=3, Name='Parth', Salary=14600),
 Row(ID=4, Name='Rishabh', Salary=20500),
 Row(ID=5, Name='Daisy', Salary=34000)]

In [9]:
# Print the schema as a tree: column name, type and nullability.
json_df.printSchema()

root
 |-- ID: long (nullable = true)
 |-- Name: string (nullable = true)
 |-- Salary: long (nullable = true)



In [10]:
# Column names as a plain Python list.
json_df.columns

['ID', 'Name', 'Salary']

In [11]:
# describe() only returns another DataFrame (its summary columns are strings);
# nothing is printed until an action such as show() is called on it.
json_df.describe()

DataFrame[summary: string, ID: string, Salary: string]

In [12]:
# show() is the action that actually prints the summary statistics.
json_df.describe().show()

+-------+----------------+------------------+
|summary|              ID|            Salary|
+-------+----------------+------------------+
|  count|              15|                15|
|   mean|             8.0|           25940.0|
| stddev|4.47213595499958|15075.183012393012|
|    min|               1|             14600|
|    max|              15|             70000|
+-------+----------------+------------------+



## DataFrame Basics

In [13]:
# Indexing by name yields a Column expression, not the column's values.
json_df['Salary']

Column<b'Salary'>

In [14]:
# Confirm the result of indexing is a pyspark Column object.
type(json_df['Salary'])

pyspark.sql.column.Column

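Unlike a pandas DataFrame, where `df['col']` returns the data, indexing a PySpark DataFrame only returns a `Column` expression; use `select()` (followed by an action such as `show()`) to actually get the data.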

In [15]:
# select() projects the requested column; show() prints its first rows.
json_df.select('Salary').show(n=5)

+------+
|Salary|
+------+
| 20000|
| 15000|
| 14600|
| 20500|
| 34000|
+------+
only showing top 5 rows



In [16]:
# Several columns can be projected at once.
json_df.select('ID', 'Salary').show(n=5)

+---+------+
| ID|Salary|
+---+------+
|  1| 20000|
|  2| 15000|
|  3| 14600|
|  4| 20500|
|  5| 34000|
+---+------+
only showing top 5 rows



In [17]:
# Alias column names: a derived expression becomes `new_id`, Salary is shown as "Base Salary".
json_df.select(
    (json_df['ID'] + 1000).alias('new_id'),
    json_df['Salary'].alias('Base Salary'),
).show(3)

+------+-----------+
|new_id|Base Salary|
+------+-----------+
|  1001|      20000|
|  1002|      15000|
|  1003|      14600|
+------+-----------+
only showing top 3 rows



### Adding new column
This takes a different approach than a pandas DataFrame: `withColumn()` returns a new DataFrame instead of assigning in place.

In [18]:
# withColumn() returns a new DataFrame carrying the extra column; the result is
# only displayed here, not assigned.
bonus = json_df['Salary'] * 0.05
json_df.withColumn('Bonus', bonus).show(3)

+---+-----+------+------+
| ID| Name|Salary| Bonus|
+---+-----+------+------+
|  1| John| 20000|1000.0|
|  2|Rohit| 15000| 750.0|
|  3|Parth| 14600| 730.0|
+---+-----+------+------+
only showing top 3 rows



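**Note:** You may execute `withColumn` multiple times, as it simply overwrites a column of the same name every time without any error. Also, the changes made here are not permanent: `withColumn` returns a new DataFrame and leaves the original DataFrame unchanged unless the result is assigned.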

In [19]:
# json_df still has its original columns: 'Bonus' exists only on the DataFrame
# that withColumn() returned.
print(json_df.columns)

['ID', 'Name', 'Salary']


### Renaming Columns

In [20]:
# Rename a column in the returned DataFrame; the result is only displayed, not assigned.
json_df.withColumnRenamed(existing='Salary', new='New Salary').show(3)

+---+-----+----------+
| ID| Name|New Salary|
+---+-----+----------+
|  1| John|     20000|
|  2|Rohit|     15000|
|  3|Parth|     14600|
+---+-----+----------+
only showing top 3 rows



### Using SQL
To use SQL queries over dataframe, you need to register it as a temporary view.

In [21]:
# Expose the DataFrame to SQL under the view name 'Emp' (replacing any earlier view of that name).
json_df.createOrReplaceTempView('Emp')
# spark.sql() returns a regular DataFrame.
sql_df = spark.sql('select * from Emp')
sql_df.show(n=3)

+---+-----+------+
| ID| Name|Salary|
+---+-----+------+
|  1| John| 20000|
|  2|Rohit| 15000|
|  3|Parth| 14600|
+---+-----+------+
only showing top 3 rows



In [22]:
# The query mixes `salary` and `Salary`; both resolve to the same column here.
# Note the Bonus values print as 750.00 rather than the 750.0 produced by the
# DataFrame-API version — presumably because the SQL literal 0.05 is typed as a
# decimal rather than a double (TODO confirm).
result = spark.sql('select salary, salary * 0.05 as Bonus from Emp where Salary <= 15000')
result.show()

+------+------+
|salary| Bonus|
+------+------+
| 15000|750.00|
| 14600|730.00|
+------+------+



## DataFrame Operations

In [23]:
# header=True takes the column names from the first line of the file;
# inferSchema=True lets Spark detect the column types.
# (These are CSV reader options; they are not used when reading JSON.)
appStore_df = spark.read.csv(csv_file_path, header=True, inferSchema=True)
appStore_df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- id: integer (nullable = true)
 |-- track_name: string (nullable = true)
 |-- size_bytes: long (nullable = true)
 |-- currency: string (nullable = true)
 |-- price: double (nullable = true)
 |-- rating_count_tot: integer (nullable = true)
 |-- rating_count_ver: integer (nullable = true)
 |-- user_rating: double (nullable = true)
 |-- user_rating_ver: double (nullable = true)
 |-- ver: string (nullable = true)
 |-- cont_rating: string (nullable = true)
 |-- prime_genre: string (nullable = true)
 |-- sup_devices.num: integer (nullable = true)
 |-- ipadSc_urls.num: integer (nullable = true)
 |-- lang.num: integer (nullable = true)
 |-- vpp_lic: integer (nullable = true)



In [24]:
# Peek at the first three records as Row objects (all 17 columns).
appStore_df.head(3)

[Row(_c0=1, id=281656475, track_name='PAC-MAN Premium', size_bytes=100788224, currency='USD', price=3.99, rating_count_tot=21292, rating_count_ver=26, user_rating=4.0, user_rating_ver=4.5, ver='6.3.5', cont_rating='4+', prime_genre='Games', sup_devices.num=38, ipadSc_urls.num=5, lang.num=10, vpp_lic=1),
 Row(_c0=2, id=281796108, track_name='Evernote - stay organized', size_bytes=158578688, currency='USD', price=0.0, rating_count_tot=161065, rating_count_ver=26, user_rating=4.0, user_rating_ver=3.5, ver='8.2.2', cont_rating='4+', prime_genre='Productivity', sup_devices.num=37, ipadSc_urls.num=5, lang.num=23, vpp_lic=1),
 Row(_c0=3, id=281940292, track_name='WeatherBug - Local Weather, Radar, Maps, Alerts', size_bytes=100524032, currency='USD', price=0.0, rating_count_tot=188583, rating_count_ver=2822, user_rating=3.5, user_rating_ver=4.5, ver='5.0.0', cont_rating='4+', prime_genre='Weather', sup_devices.num=37, ipadSc_urls.num=5, lang.num=3, vpp_lic=1)]

In [25]:
# Narrow the wide table down to a few columns of interest before displaying it.
appStore_df.select('id', 'track_name', 'price', 'user_rating').show(n=3)

+---------+--------------------+-----+-----------+
|       id|          track_name|price|user_rating|
+---------+--------------------+-----+-----------+
|281656475|     PAC-MAN Premium| 3.99|        4.0|
|281796108|Evernote - stay o...|  0.0|        4.0|
|281940292|WeatherBug - Loca...|  0.0|        3.5|
+---------+--------------------+-----+-----------+
only showing top 3 rows



In [26]:
# Total number of rows in the dataset.
appStore_df.count()

7197

In [27]:
# The filter condition can be given as a SQL expression string; where() is an alias of filter().
appStore_df.where('price > 9.99').select('track_name', 'price').show()

+--------------------+------+
|          track_name| price|
+--------------------+------+
|iReal Pro - Music...| 12.99|
|                 大辞林| 21.99|
|Proloquo2Go - Sym...|249.99|
|      NAVIGON Europe| 74.99|
|    Gaia GPS Classic| 19.99|
|AnkiMobile Flashc...| 24.99|
|Pocket Anatomy - ...| 13.99|
|iStatVball 2 iPad...| 19.99|
|   FINAL FANTASY III| 14.99|
|      Site Audit Pro| 16.99|
|FINAL FANTASY III...| 16.99|
|プチ・ロワイヤル仏和辞典（第4版）...| 47.99|
|    FL Studio Mobile| 13.99|
|          FiLMiC Pro| 14.99|
|Human Anatomy Atl...| 24.99|
|FINAL FANTASY TAC...| 13.99|
|         STEINS;GATE| 24.99|
|              Notion| 14.99|
|iDoceo - teacher'...| 11.99|
|Articulation Stat...| 59.99|
+--------------------+------+
only showing top 20 rows



### Complex filtering using Python operators for comparison
The syntax looks very similar to SQL operators, except that we need to reference each column through the DataFrame, using the format: df['column name'].

In [28]:
# Build each condition as a Column expression, then combine them with &.
is_expensive = appStore_df['price'] > 9.99
is_top_rated = appStore_df['user_rating'] == 5
appStore_df.filter(is_expensive & is_top_rated).select(['track_name', 'price']).show()

+-----------------+-----+
|       track_name|price|
+-----------------+-----+
|   ウィズダム英和・和英辞典 2|23.99|
|         Model 15|29.99|
| SkySafari 5 Plus|14.99|
|AUM - Audio Mixer|18.99|
+-----------------+-----+



#### Conditional Operators
- | => or
- & => and
- ~ => not (equivalent to `not` in Python)

The result after filtering can be saved using collect(). It will be saved as a list.

In [29]:
# collect() brings the filtered rows back as a plain Python list of Row objects.
high_tot_ratings = (
    appStore_df
    .where('rating_count_tot > 100000')
    .select('track_name', 'rating_count_tot')
    .collect()
)
type(high_tot_ratings)

list

Rows can be converted to a dictionary

In [30]:
# Take the first collected Row and view it as a {column name: value} dict.
row = high_tot_ratings[0]
row.asDict()

{'rating_count_tot': 161065, 'track_name': 'Evernote - stay organized'}

In [31]:
# A Row can be iterated like a tuple: print each field value on its own line.
for value in row:
    print(value)

Evernote - stay organized
161065


## GroupBy, Agg

groupBy() returns a GroupedData object, of which various methods can be invoked.

In [32]:
# groupBy() alone computes nothing; it yields a GroupedData handle that an
# aggregation (mean, agg, ...) must be called on.
appStore_df.groupBy('prime_genre')

<pyspark.sql.group.GroupedData at 0x20d5cf4bd68>

In [33]:
# Per-genre averages of two numeric columns; output columns are named avg(<column>).
(
    appStore_df
    .groupBy('prime_genre')
    .mean('user_rating', 'rating_count_tot')
    .show(n=5)
)

+-------------+------------------+---------------------+
|  prime_genre|  avg(user_rating)|avg(rating_count_tot)|
+-------------+------------------+---------------------+
|    Education| 3.376379690949227|   2239.2295805739514|
|   Navigation|2.6847826086956523|    11853.95652173913|
|Entertainment|3.2467289719626167|    7533.678504672897|
|       Sports| 2.982456140350877|   14026.929824561403|
| Food & Drink|3.1825396825396823|   13938.619047619048|
+-------------+------------------+---------------------+
only showing top 5 rows



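To alias an aggregate column, reference it with **functions.col** (imported as func_col above) and call `alias()` on that column; calling `alias()` on the aggregated DataFrame itself does not rename the column.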

In [34]:
# Pitfall: alias() is called on the aggregated DataFrame here, not on a Column,
# so the column header stays `avg(user_rating)` as the output shows.
appStore_df.groupBy('prime_genre').mean('user_rating').alias('avg rating').show(5)

+-------------+------------------+
|  prime_genre|  avg(user_rating)|
+-------------+------------------+
|    Education| 3.376379690949227|
|   Navigation|2.6847826086956523|
|Entertainment|3.2467289719626167|
|       Sports| 2.982456140350877|
| Food & Drink|3.1825396825396823|
+-------------+------------------+
only showing top 5 rows



In [35]:
# Reference the generated `avg(user_rating)` column through func_col so it can be aliased.
avg_by_genre = appStore_df.groupBy('prime_genre').mean('user_rating')
avg_by_genre.select(
    'prime_genre',
    func_col('avg(user_rating)').alias('avg_rating'),
).show(5)

+-------------+------------------+
|  prime_genre|        avg_rating|
+-------------+------------------+
|    Education| 3.376379690949227|
|   Navigation|2.6847826086956523|
|Entertainment|3.2467289719626167|
|       Sports| 2.982456140350877|
| Food & Drink|3.1825396825396823|
+-------------+------------------+
only showing top 5 rows



Alternatively, **withColumnRenamed** can be used to rename an aggregate measure and generate alias.

In [36]:
# Same result as aliasing: rename the generated aggregate column after the fact.
(
    appStore_df
    .groupBy('prime_genre')
    .mean('user_rating')
    .withColumnRenamed('avg(user_rating)', 'avg_rating')
    .show(3)
)

+-------------+------------------+
|  prime_genre|        avg_rating|
+-------------+------------------+
|    Education| 3.376379690949227|
|   Navigation|2.6847826086956523|
|Entertainment|3.2467289719626167|
+-------------+------------------+
only showing top 3 rows



#### Using agg() to apply different functions across columns

In [37]:
# Dict form of agg(): keys are column names, values are aggregate function names.
# The result columns get generated names (max(price), avg(user_rating)) and, as
# the output shows, do not necessarily follow the order of the dict.
appStore_df.groupBy('prime_genre').agg({'user_rating':'mean', 'price':'max'}).show()

+-----------------+----------+------------------+
|      prime_genre|max(price)|  avg(user_rating)|
+-----------------+----------+------------------+
|        Education|    299.99| 3.376379690949227|
|       Navigation|     74.99|2.6847826086956523|
|    Entertainment|      9.99|3.2467289719626167|
|           Sports|     19.99| 2.982456140350877|
|     Food & Drink|     27.99|3.1825396825396823|
|    Photo & Video|     22.99|3.8008595988538683|
|           Travel|      9.99| 3.376543209876543|
|          Finance|      5.99|2.4326923076923075|
|Social Networking|      9.99|2.9850299401197606|
|             Book|     27.99|2.4776785714285716|
|         Shopping|      1.99| 3.540983606557377|
|        Reference|     47.99|          3.453125|
| Health & Fitness|      9.99|               3.7|
|        Utilities|     24.99| 3.278225806451613|
|     Productivity|     99.99|  4.00561797752809|
|            Games|     29.99|3.6850077679958573|
|            Music|     49.99|3.9782608695652173|


Multiple aggregations using **agg** on the same column along with **alias** on aggregate measures

In [38]:
# max/min are imported under aliases so they do not shadow Python's built-in max()/min().
# NOTE(review): `count` is not used in this cell, and `countDistinct` is already
# imported near the top of the notebook.
from pyspark.sql.functions import max as smax, min as smin, count, countDistinct
# Passing Column expressions (instead of a dict) allows several aggregations on
# the same column, each with its own alias.
appStore_df.groupBy('prime_genre').agg(smax('price').alias('highest_price'),
                                       smin('price').alias('lowest_price'),
                                       countDistinct('price').alias('distinct_prices')).show(5)

+-------------+-------------+------------+---------------+
|  prime_genre|highest_price|lowest_price|distinct_prices|
+-------------+-------------+------------+---------------+
|    Education|       299.99|         0.0|             19|
|   Navigation|        74.99|         0.0|             10|
|Entertainment|         9.99|         0.0|              9|
|       Sports|        19.99|         0.0|              9|
| Food & Drink|        27.99|         0.0|             10|
+-------------+-------------+------------+---------------+
only showing top 5 rows



Apply aggregation functions directly on GroupedData without using "agg"

In [39]:
# GroupedData offers shortcut aggregations such as mean(), so agg() is not required.
genre_groups = appStore_df.groupBy('prime_genre')
genre_groups.mean('price', 'user_rating').show(n=5)

+-------------+------------------+------------------+
|  prime_genre|        avg(price)|  avg(user_rating)|
+-------------+------------------+------------------+
|    Education| 4.028233995584995| 3.376379690949227|
|   Navigation|4.1247826086956545|2.6847826086956523|
|Entertainment|0.8897009345794415|3.2467289719626167|
|       Sports|0.9530701754385958| 2.982456140350877|
| Food & Drink|1.5523809523809518|3.1825396825396823|
+-------------+------------------+------------------+
only showing top 5 rows



### Spark SQL Functions
Various functions can be imported from pyspark.sql.functions, for example:

```python
df.select(dayofmonth(df['Date'])).show()
```
Also: 
```python
month(), year(), countDistinct(), avg(), stddev()
```

### Order by

In [40]:
# Ascending order (the default for orderBy)
json_df.orderBy('Salary', ascending=True).show(5)

+---+--------+------+
| ID|    Name|Salary|
+---+--------+------+
|  3|   Parth| 14600|
|  2|   Rohit| 15000|
| 12|    Riya| 17000|
| 13|   Krish| 17000|
| 14|Akanksha| 20000|
+---+--------+------+
only showing top 5 rows



In [41]:
# Descending order
json_df.orderBy('Salary', ascending=False).show(5)

+---+--------+------+
| ID|    Name|Salary|
+---+--------+------+
|  9|   Varun| 70000|
|  7|Sushmita| 50000|
|  5|   Daisy| 34000|
| 11| Johnson| 25500|
|  6|   Annie| 23000|
+---+--------+------+
only showing top 5 rows



### User Defined Functions

In [42]:
from pyspark.sql import SQLContext
# SQLContext is the legacy SQL entry point; it is built on top of a
# SparkContext, which the active SparkSession exposes as `sparkContext`.
# NOTE(review): SQLContext is already imported in the first cell, and since
# Spark 2.0 the SparkSession itself offers UDF registration (spark.udf) —
# confirm whether this separate context is still needed.
sc = spark.sparkContext
sql_ctx = SQLContext(sc)

In [43]:
# Register a SQL-callable UDF named `uname` that upper-cases a string.
# spark.udf.register is the SparkSession replacement for the deprecated
# SQLContext.registerFunction. The None guard makes the UDF return NULL for a
# NULL input instead of raising AttributeError. The return value is assigned
# so the cell does not echo a function repr.
uname_udf = spark.udf.register('uname', lambda val: val.upper() if val is not None else None)

In [44]:
#spark.sql('select uname(Name), salary from Emp').show() 

## Nested JSON

In [45]:
# Load the nested JSON sample. The path is assembled with os.path.join so it is
# valid on non-Windows systems too (the old literal hard-coded backslashes).
nested = spark.read.json(os.path.join('..', 'pyspark-training', 'data', 'nested.json'))

In [46]:
# Inspect the inferred schema; nested types are printed as indented sub-trees.
nested.printSchema()

root
 |-- age: long (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- phones: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- reporting: struct (nullable = true)
 |    |-- manager: string (nullable = true)
 |    |-- rm: string (nullable = true)
 |-- salary: long (nullable = true)



In [47]:
# explode() emits one output row per element of the `phones` array; with no
# alias given, the generated column is named `col`.
phones = nested.select(explode(nested['phones']), nested['id'])
phones.show()

+-----+---+
|  col| id|
+-----+---+
|12345|  1|
|56789|  1|
+-----+---+



In [48]:
# Flatten the `reporting` struct: alias it to `tmp`, then expand `tmp.*` so each
# struct field (manager, rm) becomes a top-level column next to `id`.
reporting_aliased = nested.select(nested['reporting'].alias('tmp'), nested['id'])
report = reporting_aliased.select(['tmp.*', 'id'])
report.show()

+-------+-----+---+
|manager|   rm| id|
+-------+-----+---+
|  Steve|Brian|  1|
+-------+-----+---+



# Part 2: Schema Management
A schema helps to convert an RDD to a DataFrame.
## Inferring the Schema Using Reflection
A schema is the layout of the dataset in the form of attributes in the dataset and the datatype associated to that attribute. A schema provides the flexibility to switch between RDD and data frame.

In [49]:
# Import data types
from pyspark.sql.types import *
from pyspark.sql import Row

Load data and convert each line to a Row to supply schema to RDDs.

In [52]:
app_name = 'RDD Hands-on'
conf = SparkConf().setAppName(app_name)
# NOTE(review): a SparkContext presumably already exists at this point (the
# SparkSession built earlier owns one), in which case getOrCreate() returns that
# context and this conf / app name is not applied — confirm.
sc = SparkContext.getOrCreate(conf=conf)

In [55]:
# Read the CSV as plain text lines and split each line into its comma-separated fields.
# os.path.join keeps the relative path valid on non-Windows systems as well
# (the old literal hard-coded backslashes).
lines = sc.textFile(os.path.join('..', 'pyspark-training', 'data', 'rdd_sample.csv'))
parts = lines.map(lambda line: line.split(','))
parts.collect()

[['Amy', '32'],
 ['Roger', '36'],
 ['Jake', '27'],
 ['Brian', '21'],
 ['Jen', '30']]

In [56]:
# Wrap every record in a Row with named fields; the age is converted to int so
# it is carried as a number rather than as text.
people = parts.map(lambda rec: Row(name=rec[0], age=int(rec[1])))
people.collect()

[Row(age=32, name='Amy'),
 Row(age=36, name='Roger'),
 Row(age=27, name='Jake'),
 Row(age=21, name='Brian'),
 Row(age=30, name='Jen')]

Infer schema and register the DataFrame as a table

In [57]:
# No schema argument is given: the DataFrame's columns come from the fields of
# the Row objects in the RDD.
schemaPeople = spark.createDataFrame(people)
schemaPeople.show()

+---+-----+
|age| name|
+---+-----+
| 32|  Amy|
| 36|Roger|
| 27| Jake|
| 21|Brian|
| 30|  Jen|
+---+-----+



In [58]:
# Register the DataFrame as the temp view `people` and query it through SQL.
schemaPeople.createOrReplaceTempView('people')
spark.sql('select * from people').show()

+---+-----+
|age| name|
+---+-----+
| 32|  Amy|
| 36|Roger|
| 27| Jake|
| 21|Brian|
| 30|  Jen|
+---+-----+



## Programmatically Specifying the Schema
Convert each line to a tuple.

In [59]:
# Plain tuples this time; both fields stay strings (the age is only stripped of
# surrounding whitespace).
people = parts.map(lambda rec: (rec[0], rec[1].strip()))
people.collect()

[('Amy', '32'),
 ('Roger', '36'),
 ('Jake', '27'),
 ('Brian', '21'),
 ('Jen', '30')]

Specify schema encoded in a string

In [60]:
# Space-separated column names, in the same order as the tuple fields.
schemaString = 'name age'

Struct each field as string type; different types can be defined specifically to each column.

In [61]:
# One nullable StringType field per name listed in schemaString.
fields = []
for column_name in schemaString.split():
    fields.append(StructField(column_name, StringType(), nullable=True))
schema = StructType(fields)

Pass the tuples and apply the schema to the RDD to create a DataFrame

In [62]:
# Use the SparkSession created at the top of the notebook: no `sqlContext`
# variable is ever defined here (the SQLContext above is named `sql_ctx`), so
# the old call raised NameError on a fresh kernel.
schemaPeople = spark.createDataFrame(people, schema)
schemaPeople.show()

+-----+---+
| name|age|
+-----+---+
|  Amy| 32|
|Roger| 36|
| Jake| 27|
|Brian| 21|
|  Jen| 30|
+-----+---+



| Inferring Schema | Specifying Schema |
| --- | --- |
| Convert RDD to list of **Rows** and **infer schema** | Convert RDD to list of **tuples** |
| Need not prepare schema | **Define schema string** and prepare **StructType** |
| Call **createDataFrame** without schema | call **createDataFrame** with schema |

## Hive Metastore
Spark SQL uses a Hive metastore to manage the metadata of persistent relational entities (e.g. databases, tables, columns, partitions) in a relational database (for fast access).

A Hive metastore warehouse (aka spark-warehouse) is the directory where Spark SQL persists tables whereas a Hive metastore (aka metastore_db) is a relational database to manage the metadata of the persistent relational entities, e.g. databases, tables, columns, partitions.

By default, Spark SQL uses the embedded deployment mode of a Hive metastore with an Apache Derby database.

When a SparkSession is created with Hive support, the external catalog (aka metastore) is HiveExternalCatalog. HiveExternalCatalog uses the spark.sql.warehouse.dir directory for the location of the databases.

The benefits of using an external Hive metastore:
- Allows multiple Spark applications (sessions) to access it concurrently
- Allows a single Spark application to use table statistics without running "ANALYZE TABLE" on every execution

***Spark SQL uses the Hive-specific configuration properties that further fine-tune the Hive integration, e.g. spark.sql.hive.metastore.version or spark.sql.hive.metastore.jars.***

**spark.sql.warehouse.dir** is a static configuration property that sets Hive's hive.metastore.warehouse.dir property, i.e. the location of the Hive local/embedded metastore database (using Derby).