In [1]:
from pyspark.sql import SparkSession

# Obtain the SparkSession that the rest of the notebook uses, via getOrCreate().
# NOTE(review): "spark.some.config.option" / "some-value" looks like a
# placeholder setting rather than a real Spark option — confirm it is needed.
spark = (
    SparkSession.builder
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [2]:
# Location of the sample JSON input.
# NOTE(review): this is an absolute, machine-specific path; change this one
# constant to point at your own copy of people.json before running.
PEOPLE_JSON_PATH = 'file:/E:/code/git-2018/ETL-Workflow/ETL-Examples/src/main/python/resources/people.json'

# Load the JSON file into a DataFrame
df = spark.read.json(PEOPLE_JSON_PATH)
# Displays the content of the DataFrame to stdout
df.show()
# Print the schema in a tree format
df.printSchema()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [3]:
# Get the DataFrame's column names through its `columns` attribute and print
# them; the result is kept in `df_columns`.
df_columns = df.columns
print(df_columns)

['age', 'name']


In [4]:
# Get each column's data type through the `dtypes` attribute and print the
# result; the printed value is a list of (column name, type name) pairs.
df_types = df.dtypes
print(df_types)

[('age', 'bigint'), ('name', 'string')]


In [5]:
# Select only the "name" column
df.select("name").show()

# Select everybody, but increment the age by 1 (a null age stays null)
df.select(df['name'], df['age'] + 1).show()

# Select people older than 21 (rows whose age is null do not match)
df.filter(df['age'] > 21).show()

# Count people by age
df.groupBy("age").count().show()

# Register the DataFrame as a SQL temporary view
df.createOrReplaceTempView("people")

sqlDF = spark.sql('SELECT * FROM people')
sqlDF.show()

# Register the DataFrame as a global temporary view.
# createOrReplaceGlobalTempView keeps this cell re-runnable: plain
# createGlobalTempView raises once a global view named "people" already
# exists in the running Spark application.
df.createOrReplaceGlobalTempView("people")

# Global temporary view is tied to a system preserved database `global_temp`
spark.sql("SELECT * FROM global_temp.people").show()

# Global temporary view is cross-session
spark.newSession().sql("SELECT * FROM global_temp.people").show()

+-------+
|   name|
+-------+
|Michael|
|   Andy|
| Justin|
+-------+

+-------+---------+
|   name|(age + 1)|
+-------+---------+
|Michael|     null|
|   Andy|       31|
| Justin|       20|
+-------+---------+

+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+

+----+-----+
| age|count|
+----+-----+
|  19|    1|
|null|    1|
|  30|    1|
+----+-----+

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [10]:
# Print the built-in help text for the session's `sparkContext` attribute.
# NOTE(review): the output saved with this cell is an AttributeError about a
# 'sqlcontext' attribute, which this line never accesses — the saved output
# looks stale; re-run the notebook from a fresh kernel to refresh it.
help(spark.sparkContext)

AttributeError: 'SparkSession' object has no attribute 'sqlcontext'