In [1]:
from pyspark.sql import SparkSession

In [2]:
# Obtain the session used by every cell below; getOrCreate() hands back an
# already-running session if one exists instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("Inferred and explicit schemas")
    .getOrCreate()
)

In [3]:
from pyspark.sql.types import Row

In [4]:
# Read the students file as an RDD with one element per text line.
# The SparkContext is taken from the `spark` session created above rather than
# from a bare `sc` global: no visible cell defines `sc`, so relying on it only
# works when the kernel happens to pre-inject it.
lines = spark.sparkContext.textFile("../datasets/students.txt")

In [5]:
# Materialize the RDD as a Python list to inspect the raw comma-separated lines.
lines.collect()

['Emily,44,55,78', 'Andy,47,34,89', 'Rick,55,78,55', 'Aaron,66,34,98']

In [7]:
# Break every comma-separated line into its list of string fields, then
# display the result to confirm the split.
parts = lines.map(lambda line: line.split(","))
parts.collect()

[['Emily', '44', '55', '78'],
 ['Andy', '47', '34', '89'],
 ['Rick', '55', '78', '55'],
 ['Aaron', '66', '34', '98']]

In [8]:
# Convert each [name, math, english, science] string list into a Row.
# The three score fields are cast from str to int; the name stays a string.
students = parts.map(
    lambda rec: Row(
        name=rec[0],
        math=int(rec[1]),
        english=int(rec[2]),
        science=int(rec[3]),
    )
)

In [9]:
# Inspect the Row objects: scores should now appear as ints, not strings.
students.collect()

[Row(name='Emily', math=44, english=55, science=78),
 Row(name='Andy', math=47, english=34, science=89),
 Row(name='Rick', math=55, english=78, science=55),
 Row(name='Aaron', math=66, english=34, science=98)]

In [10]:
# Build a DataFrame from the RDD of Rows. No schema is passed here, so the
# column names and types come from the Row objects themselves.
schemaStudents = spark.createDataFrame(students)
# Register the DataFrame under the name "students" so spark.sql can query it.
schemaStudents.createOrReplaceTempView("students")

In [11]:
# Column names picked up from the Row keyword arguments.
schemaStudents.columns

['name', 'math', 'english', 'science']

In [12]:
# Full schema of the DataFrame: field names, data types and nullability.
schemaStudents.schema

StructType(List(StructField(name,StringType,true),StructField(math,LongType,true),StructField(english,LongType,true),StructField(science,LongType,true)))

In [13]:
# Query the temp view registered under the name "students" and print the rows.
spark.sql("SELECT * FROM students").show()

+-----+----+-------+-------+
| name|math|english|science|
+-----+----+-------+-------+
|Emily|  44|     55|     78|
| Andy|  47|     34|     89|
| Rick|  55|     78|     55|
|Aaron|  66|     34|     98|
+-----+----+-------+-------+



In [14]:
# Re-display the split records; every field, including the scores, is still a str here.
parts.collect()

[['Emily', '44', '55', '78'],
 ['Andy', '47', '34', '89'],
 ['Rick', '55', '78', '55'],
 ['Aaron', '66', '34', '98']]

In [15]:
# Space-separated column names for the explicit schema.
# NOTE(review): no visible cell reads this variable; the field list below is
# built from literal names. Confirm whether it is still needed.
schemaString = "name math english science"

In [16]:
 from pyspark.sql.types import StructType, StructField, StringType, LongType

# Explicit column definitions: a nullable string column for the name,
# followed by one nullable LongType column per subject score.
fields = [
    StructField('name', StringType(), True),
    *(StructField(subject, LongType(), True) for subject in ('math', 'english', 'science')),
]

In [17]:
# Wrap the field list in a StructType so it can be passed as an explicit schema.
schema = StructType(fields)

In [18]:
# `parts` holds every field as a str, but `schema` declares the three score
# columns as LongType. Passing `parts` directly builds a DataFrame whose first
# action fails schema verification (LongType cannot accept a str), so the
# scores are cast to int first.
typedParts = parts.map(lambda p: (p[0], int(p[1]), int(p[2]), int(p[3])))

# Build the DataFrame with the explicit schema instead of relying on inference.
schemaStudents = spark.createDataFrame(typedParts, schema)

# Re-register the "students" view so later SQL queries read this DataFrame
# rather than the one registered earlier from the inferred schema.
schemaStudents.createOrReplaceTempView("students")

In [19]:
# Column names of the DataFrame built with the explicit schema.
schemaStudents.columns

['name', 'math', 'english', 'science']

In [20]:
# Schema of the explicit-schema DataFrame; it mirrors the StructType passed in.
schemaStudents.schema

StructType(List(StructField(name,StringType,true),StructField(math,LongType,true),StructField(english,LongType,true),StructField(science,LongType,true)))

In [21]:
# Query whichever DataFrame is currently registered as the "students" temp view
# and print its rows.
spark.sql("SELECT * FROM students").show()

+-----+----+-------+-------+
| name|math|english|science|
+-----+----+-------+-------+
|Emily|  44|     55|     78|
| Andy|  47|     34|     89|
| Rick|  55|     78|     55|
|Aaron|  66|     34|     98|
+-----+----+-------+-------+

