# Using Spark Schemas for creating Spark DataFrames

In [1]:
from os import environ
# Import the SparkSession object
from pyspark.sql.session import SparkSession

# Define a schema using StructType, StructField, and other dataTypes
from pyspark.sql.types import (StructType, StructField,  
                               StringType, IntegerType)

In [2]:
# Start (or reuse) the Spark session used by every later cell.
spark = SparkSession.builder.appName('schema_changes').getOrCreate()

# DATA_LAKE is expected to hold the local directory containing the input files.
# Paths are built as `file_path + "<file name>"`, so make sure the directory
# part ends with a separator; otherwise a value such as "/data/lake" would
# produce ".../lakepeople.json". Values already ending in "/" are unchanged.
data_lake_dir = environ['DATA_LAKE']
if data_lake_dir and not data_lake_dir.endswith('/'):
    data_lake_dir += '/'
file_path = "file:///" + data_lake_dir

# No schema is supplied here, so Spark infers the column types from the JSON.
df = spark.read.json(file_path + "people.json")

### Print schema details

In [3]:
# Show the schema Spark inferred for `df` (it was read without an explicit schema).
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



### Create the list of schema fields: one `StructField` object per column

In [4]:
# One StructField per column, given as (name, dataType, nullable).
# `age` is declared as IntegerType here, whereas schema inference on the raw
# JSON reported it as long.
data_schema = [
    StructField('age', IntegerType(), True),
    StructField('name', StringType(), True),
]

### Create the final schema by wrapping the field list in a `StructType`

In [5]:
# Wrap the list of column definitions in a StructType to form the full schema.
final_struct = StructType(fields=data_schema)

### Create the DataFrame by passing the `StructType` schema to the reader

In [7]:
# Read the same JSON file again, this time with the explicit schema attached
# to the reader so that column types are not inferred.
people_reader = spark.read.schema(final_struct)
new_df = people_reader.json(file_path + "people.json")

### Display the schema as defined

In [8]:
# Print the schema of `new_df`, which was read with the explicit `final_struct` schema.
new_df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- name: string (nullable = true)



In [9]:
# Build the summary-statistics DataFrame for new_df, then print it as a table.
new_df_summary = new_df.describe()
new_df_summary.show()

+-------+------------------+-------+
|summary|               age|   name|
+-------+------------------+-------+
|  count|                 2|      3|
|   mean|              24.5|   null|
| stddev|7.7781745930520225|   null|
|    min|                19|   Andy|
|    max|                30|Michael|
+-------+------------------+-------+



In [10]:
# Stop the SparkSession created at the top of the notebook; no Spark calls follow.
spark.stop()