## Catalyst Optimizer

https://app.pluralsight.com/player?course=spark-2-getting-started&author=janani-ravi&name=9f634640-7cae-43f5-b952-e5c85d915cb5&clip=3&mode=live

# Inferred and Explicit Schemas

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Get the active SparkSession, or start one, under a descriptive app name.
spark = (
    SparkSession.builder
    .appName("Inferred and explicit schemas")
    .getOrCreate()
)

In [4]:
from pyspark.sql.types import Row

In [5]:
# Take the SparkContext from the session created above instead of relying on
# a pre-defined `sc` global, which only exists in some shells/kernels.
sc = spark.sparkContext
# Load the students file as an RDD with one element per line of text.
lines = sc.textFile("/data/pluralsight_spark2/02/demos/datasets/students.txt")

In [6]:
# Fetch the RDD's elements as a Python list to inspect the raw lines.
lines.collect()

['Emily,44,55,78', 'Andy,47,34,89', 'Rick,55,78,55', 'Aaron,66,34,98']

In [8]:
# Break each comma-separated line into a list of string fields.
parts = lines.map(lambda line: line.split(","))
# Display the split records.
parts.collect()

[['Emily', '44', '55', '78'],
 ['Andy', '47', '34', '89'],
 ['Rick', '55', '78', '55'],
 ['Aaron', '66', '34', '98']]

In [11]:
# Turn each split record into a Row with named fields: the first field stays a
# string (name) and the remaining three scores are converted to ints.
students = parts.map(
    lambda rec: Row(
        name=rec[0],
        math=int(rec[1]),
        english=int(rec[2]),
        science=int(rec[3]),
    )
)

In [12]:
# Fetch the Row objects as a Python list to inspect them.
students.collect()

[Row(english=55, math=44, name='Emily', science=78),
 Row(english=34, math=47, name='Andy', science=89),
 Row(english=78, math=55, name='Rick', science=55),
 Row(english=34, math=66, name='Aaron', science=98)]

In [13]:
# Build a DataFrame from the RDD of Rows; no schema is passed, so the column
# names and types are inferred from the Row objects.
schemaStudents = spark.createDataFrame(students)
# Register the DataFrame as a temp view named "students" so it can be queried
# with spark.sql.
schemaStudents.createOrReplaceTempView("students")

In [15]:
# List the DataFrame's column names.
schemaStudents.columns

['english', 'math', 'name', 'science']

# Uses reflection to infer the schema

In [16]:
# Show the DataFrame's schema (field names, types and nullability).
schemaStudents.schema

StructType(List(StructField(english,LongType,true),StructField(math,LongType,true),StructField(name,StringType,true),StructField(science,LongType,true)))

In [19]:
# Query the "students" temp view with Spark SQL and print the result table.
spark.sql("select name, english, math, science from students").show()

+-----+-------+----+-------+
| name|english|math|science|
+-----+-------+----+-------+
|Emily|     55|  44|     78|
| Andy|     34|  47|     89|
| Rick|     78|  55|     55|
|Aaron|     34|  66|     98|
+-----+-------+----+-------+



# Explicitly defined schema

In [20]:
# Re-inspect the split records; every field is still a string at this point.
parts.collect()

[['Emily', '44', '55', '78'],
 ['Andy', '47', '34', '89'],
 ['Rick', '55', '78', '55'],
 ['Aaron', '66', '34', '98']]

In [21]:
# Space-separated column names.
# NOTE(review): none of the cells shown here reference this variable; the
# field list in the next cell does not use it — confirm whether it is needed.
schemastring = "name math english science"

In [28]:
from pyspark.sql.types import StructType, StructField, StringType, LongType

# Explicit column definitions: `name` is a string and the three score columns
# are longs; the third argument (True) is passed for every field.
fields = [StructField('name', StringType(), True)] + [
    StructField(subject, LongType(), True)
    for subject in ('math', 'english', 'science')
]

In [29]:
# Wrap the field list in a StructType to form the schema object.
schema = StructType(fields)

In [30]:
# Every field in `parts` is still a string, but the schema declares the three
# score columns as LongType. Cast them first; otherwise schema verification
# fails as soon as the DataFrame is evaluated by an action.
typedParts = parts.map(lambda p: (p[0], int(p[1]), int(p[2]), int(p[3])))
# Apply the explicit schema; tuple order matches the schema's field order
# (name, math, english, science).
schemaStudents = spark.createDataFrame(typedParts, schema)
# Re-register the view so SQL against "students" reads this explicitly typed
# DataFrame rather than the one registered earlier with an inferred schema.
schemaStudents.createOrReplaceTempView("students")

In [31]:
# List the column names of the DataFrame built with the explicit schema.
schemaStudents.columns

['name', 'math', 'english', 'science']

In [32]:
# Show the schema of the DataFrame built with the explicit schema.
schemaStudents.schema

StructType(List(StructField(name,StringType,true),StructField(math,LongType,true),StructField(english,LongType,true),StructField(science,LongType,true)))

In [35]:
# Query the "students" temp view with Spark SQL and print the result table.
spark.sql("select name, english, math, science from students").show()

+-----+-------+----+-------+
| name|english|math|science|
+-----+-------+----+-------+
|Emily|     55|  44|     78|
| Andy|     34|  47|     89|
| Rick|     78|  55|     55|
|Aaron|     34|  66|     98|
+-----+-------+----+-------+

