In [1]:
import getpass
import os
import posixpath

from pyspark import SparkContext, SparkConf
from pyspark.sql import Row, SparkSession, SQLContext

# Import data types
from pyspark.sql.types import *

In [2]:
# Derive the lab user id from the working directory: the last path component
# that starts with "edureka" (case-insensitive) wins.
cwd = os.getcwd()
edureka_parts = [part for part in cwd.split('/') if part.lower().startswith('edureka')]

# When no component matches, fall back to the OS login name instead of leaving
# `user_id` undefined (NameError on a fresh kernel, or a silently stale value
# on a re-run).
user_id = (edureka_parts[-1] if edureka_parts else getpass.getuser()).title()

# Used as the Spark application name for this notebook.
app_name = '{0} : Schema Mgmt'.format(user_id)
app_name

'Edureka_121039 : Schema Mgmt'

In [3]:
# Start (or attach to) the Spark session for this notebook, then expose the
# context objects used by the later cells.
spark = (
    SparkSession.builder
    .appName(app_name)
    .getOrCreate()
)
sparkContext = spark.sparkContext
sqlContext = SQLContext(sparkContext)

In [4]:
def get_hdfs_filepath(file_name, user=None):
    """Build the HDFS path of ``file_name`` under a user's ``/user/<id>`` directory.

    Args:
        file_name: Path relative to the user's HDFS directory. An absolute
            path (leading ``/``) is returned unchanged.
        user: User id whose directory is used; it is lower-cased before use.
            Defaults to the notebook-level ``user_id``.

    Returns:
        The joined path, always using ``/`` as the separator.
    """
    if user is None:
        user = user_id
    my_hdfs = '/user/{0}'.format(user.lower())
    # HDFS paths are POSIX-style regardless of the client OS, so join with
    # posixpath rather than os.path (which would use "\\" on Windows).
    return posixpath.join(my_hdfs, file_name)

### People Dataset

In [5]:
# HDFS location of the people dataset loaded by the cells below.
PEOPLE_TXT = get_hdfs_filepath(file_name='people.txt')

### Inferring the Schema Using Reflection
Load the people file and split each line into comma-separated fields.

In [6]:
# One RDD element per line of the file, then one list of fields per line.
lines = sparkContext.textFile(PEOPLE_TXT)
parts = lines.map(
    lambda line: line.split(",")
)

Convert each line to a Row.

In [7]:
# The name stays a string; the age field is parsed to an int.
people = parts.map(
    lambda fields: Row(name=fields[0], age=int(fields[1]))
)
people.collect()

[Row(age=30, name=u'Brian'),
 Row(age=32, name=u'Adam'),
 Row(age=23, name=u'Steve'),
 Row(age=25, name=u'Mohan'),
 Row(age=27, name=u'Kevin'),
 Row(age=40, name=u'Sundar')]

Infer the schema and register the DataFrame as a temporary view.

In [8]:
# No schema is passed here, so it is inferred from the Row objects.
schemaPeople = spark.createDataFrame(data=people)
schemaPeople.createOrReplaceTempView(name="people")

In [9]:
# Query the "people" temporary view with plain SQL.
results = spark.sql(
    sqlQuery="SELECT name FROM people"
)

In [10]:
# Drop back to an RDD of Rows, format each name, then collect to the driver.
peopleNames = (
    results.rdd
    .map(lambda row: "Name: " + row.name)
    .collect()
)

In [11]:
# Print one formatted name per line.
for formatted_name in peopleNames:
    print(formatted_name)

Name: Brian
Name: Adam
Name: Steve
Name: Mohan
Name: Kevin
Name: Sundar


### Programmatically Specifying the Schema

Convert each line to a tuple.

In [17]:
# Plain tuples this time: both fields stay strings, and the age only has
# surrounding whitespace trimmed.
people = parts.map(
    lambda fields: (fields[0], fields[1].strip())
)
people.collect()

[(u'Brian', u'30'),
 (u'Adam', u'32'),
 (u'Steve', u'23'),
 (u'Mohan', u'25'),
 (u'Kevin', u'27'),
 (u'Sundar', u'40')]

Specify schema encoded in a string.

In [14]:
# Whitespace-separated column names, in the same order as the tuple fields.
schemaString = "name age"

In [15]:
# Declare every column named in schemaString as a nullable string.
fields = [
    StructField(column_name, StringType(), nullable=True)
    for column_name in schemaString.split()
]
schema = StructType(fields)

What types are available in PySpark? Refer to [this list](http://spark.apache.org/docs/2.1.1/api/python/_modules/pyspark/sql/types.html "PySpark SQL Types") for details.

Apply the schema to the RDD.

In [16]:
# Go through the SparkSession entry point, as the schema-inference example
# does, rather than the legacy SQLContext wrapper; the explicit schema is
# applied to the RDD of tuples.
schemaPeople = spark.createDataFrame(people, schema)

Create a temporary view from the DataFrame.

In [17]:
# Re-register the view name "people" so it now points at the explicitly typed DataFrame.
schemaPeople.createOrReplaceTempView(name="people")

Run SQL over DataFrame registered as a table.

In [18]:
# Same query as before, now answered from the explicitly typed view.
results = spark.sql(
    sqlQuery="SELECT name FROM people"
)

In [19]:
# Format each returned Row's name and collect the strings to the driver.
peopleNames = (
    results.rdd
    .map(lambda row: "Name: " + row.name)
    .collect()
)

In [20]:
# Print one formatted name per line.
for formatted_name in peopleNames:
    print(formatted_name)

Name: Brian
Name: Adam
Name: Steve
Name: Mohan
Name: Kevin
Name: Sundar


Inferring Schema | Specifying Schema
:--- | :---
Map RDD to an RDD of **Rows** | Map RDD to an RDD of **tuples**
Need not prepare schema | Define schema string and prepare **StructType**
Call `createDataFrame` **without schema** | Call `createDataFrame` **with schema**

In [26]:
# Terminate the PySpark session created at the top of the notebook.
spark.stop()