In [1]:
from pyspark.sql import SparkSession

In [2]:
# Build the SparkSession used by the rest of the notebook; getOrCreate()
# hands back an already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("Analyzing airline data")
    .getOrCreate()
)

In [3]:
from pyspark.sql.types import Row
from datetime import datetime

In [4]:
# Build a small RDD of two student records. Each Row carries scalar fields
# (id, name, active), a list (clubs), a dict (subjects) and a datetime
# (enrolled) so the later SQL cells can index into nested values.
#
# The SparkContext is taken from the `spark` session created above rather than
# from the implicit `sc` global: `sc` is never defined in this notebook, so a
# fresh-kernel "Restart & Run All" would fail with a NameError unless the
# kernel happened to be launched by the pyspark shell.
record = spark.sparkContext.parallelize(
    [
        Row(
            id=1,
            name="Jill",
            active=True,
            clubs=['chess', 'hockey'],
            subjects={"math": 80, 'english': 56},
            enrolled=datetime(2014, 8, 1, 14, 1, 5),
        ),
        Row(
            id=2,
            name="George",
            active=False,
            clubs=['chess', 'soccer'],
            subjects={"math": 60, 'english': 96},
            enrolled=datetime(2015, 3, 21, 8, 2, 5),
        ),
    ]
)

In [5]:
# Convert the RDD of Row objects into a DataFrame (columns come from the Row
# field names) and print its contents as a text table.
record_df = record.toDF()
record_df.show()

+------+---------------+-------------------+---+------+--------------------+
|active|          clubs|           enrolled| id|  name|            subjects|
+------+---------------+-------------------+---+------+--------------------+
|  true|[chess, hockey]|2014-08-01 14:01:05|  1|  Jill|[english -> 56, m...|
| false|[chess, soccer]|2015-03-21 08:02:05|  2|George|[english -> 96, m...|
+------+---------------+-------------------+---+------+--------------------+



# Register the DataFrame as a temporary view (SQL table)

In [6]:
# Expose the DataFrame to SQL under the name "records"; the "OrReplace"
# variant keeps this cell safe to re-run.
record_df.createOrReplaceTempView("records")

In [7]:
# Query the "records" view with SQL and display every row.
# The query goes through the `spark` session created above: `sqlContext` is
# never defined in this notebook (it only exists as a pyspark-shell global)
# and SparkSession.sql is the current entry point for SQL queries.
all_records_df = spark.sql('SELECT * FROM records')
all_records_df.show()

+------+---------------+-------------------+---+------+--------------------+
|active|          clubs|           enrolled| id|  name|            subjects|
+------+---------------+-------------------+---+------+--------------------+
|  true|[chess, hockey]|2014-08-01 14:01:05|  1|  Jill|[english -> 56, m...|
| false|[chess, soccer]|2015-03-21 08:02:05|  2|George|[english -> 96, m...|
+------+---------------+-------------------+---+------+--------------------+



In [10]:
# Nested values can be addressed directly in SQL: clubs[0] takes the first
# element of the list column and subjects["english"] looks up a key in the
# dict column.
# Uses the `spark` session instead of the undefined `sqlContext` global.
spark.sql('SELECT id, clubs[0], subjects["english"] FROM records').show()

+---+--------+-----------------+
| id|clubs[0]|subjects[english]|
+---+--------+-----------------+
|  1|   chess|               56|
|  2|   chess|               96|
+---+--------+-----------------+



In [17]:
# Boolean columns can be used in expressions: project the negation of `active`.
# Uses the `spark` session instead of the undefined `sqlContext` global.
spark.sql('select id, NOT active from records').show()

+---+------------+
| id|(NOT active)|
+---+------------+
|  1|       false|
|  2|        true|
+---+------------+



In [18]:
# A boolean column can serve as the WHERE predicate on its own: keep only the
# rows whose `active` flag is true.
# Uses the `spark` session instead of the undefined `sqlContext` global.
spark.sql('select * from records where active').show()

+------+---------------+-------------------+---+----+--------------------+
|active|          clubs|           enrolled| id|name|            subjects|
+------+---------------+-------------------+---+----+--------------------+
|  true|[chess, hockey]|2014-08-01 14:01:05|  1|Jill|[english -> 56, m...|
+------+---------------+-------------------+---+----+--------------------+



In [21]:
# Filter on a value stored inside the dict column: keep rows whose "english"
# entry in `subjects` is greater than 80.
# Uses the `spark` session instead of the undefined `sqlContext` global.
spark.sql('select * from records where subjects["english"] > 80').show()

+------+---------------+-------------------+---+------+--------------------+
|active|          clubs|           enrolled| id|  name|            subjects|
+------+---------------+-------------------+---+------+--------------------+
| false|[chess, soccer]|2015-03-21 08:02:05|  2|George|[english -> 96, m...|
+------+---------------+-------------------+---+------+--------------------+



# Register the table so it is accessible across all Spark sessions (within the same Spark application)

In [22]:
# Register the DataFrame as a global temporary view named "global_records".
# createOrReplaceGlobalTempView is used instead of createGlobalTempView so the
# cell is idempotent: the plain variant raises if the view already exists,
# which happens as soon as this cell is run a second time against the same
# Spark application. This also matches the createOrReplaceTempView call used
# for the session-scoped view earlier in the notebook.
record_df.createOrReplaceGlobalTempView("global_records")

In [23]:
# Global temporary views are addressed through the `global_temp` database
# prefix; keep only the rows whose `active` flag is true.
# Uses the `spark` session instead of the undefined `sqlContext` global.
spark.sql('select * from global_temp.global_records where active').show()

+------+---------------+-------------------+---+----+--------------------+
|active|          clubs|           enrolled| id|name|            subjects|
+------+---------------+-------------------+---+----+--------------------+
|  true|[chess, hockey]|2014-08-01 14:01:05|  1|Jill|[english -> 56, m...|
+------+---------------+-------------------+---+----+--------------------+

