In [1]:
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.functions import lit
from pyspark.sql.types import StructType, StructField, StringType, FloatType, BooleanType
spark = SparkSession\
  .builder\
  .master("local[2]")\
  .appName("SDG_Chapter10")\
  .getOrCreate()

In [2]:
df = spark.read.json(
  "/home/jagadeesh/git/Spark-The-Definitive-Guide/data/flight-data/json/2015-summary.json")

In [3]:
df.show()

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|            Romania|   15|
|       United States|            Croatia|    1|
|       United States|            Ireland|  344|
|               Egypt|      United States|   15|
|       United States|              India|   62|
|       United States|          Singapore|    1|
|       United States|            Grenada|   62|
|          Costa Rica|      United States|  588|
|             Senegal|      United States|   40|
|             Moldova|      United States|    1|
|       United States|       Sint Maarten|  325|
|       United States|   Marshall Islands|   39|
|              Guyana|      United States|   64|
|               Malta|      United States|    1|
|            Anguilla|      United States|   41|
|             Bolivia|      United States|   30|
|       United States|           Paraguay|    6|
|             Algeri

In [4]:
df.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: long (nullable = true)



In [5]:
from pyspark.sql.types import LongType
flightSchema = StructType([
  StructField("dest", StringType(), True),
  StructField("origin", StringType(), True),
  StructField("count", LongType(), True),
])

In [6]:
df1 = spark.createDataFrame(df.rdd, schema=flightSchema)

In [7]:
df1.show()

+--------------------+----------------+-----+
|                dest|          origin|count|
+--------------------+----------------+-----+
|       United States|         Romania|   15|
|       United States|         Croatia|    1|
|       United States|         Ireland|  344|
|               Egypt|   United States|   15|
|       United States|           India|   62|
|       United States|       Singapore|    1|
|       United States|         Grenada|   62|
|          Costa Rica|   United States|  588|
|             Senegal|   United States|   40|
|             Moldova|   United States|    1|
|       United States|    Sint Maarten|  325|
|       United States|Marshall Islands|   39|
|              Guyana|   United States|   64|
|               Malta|   United States|    1|
|            Anguilla|   United States|   41|
|             Bolivia|   United States|   30|
|       United States|        Paraguay|    6|
|             Algeria|   United States|    4|
|Turks and Caicos ...|   United St

In [8]:
flightSchemaRectified = StructType([
  StructField("dest", StringType(), True),
  StructField("origin", StringType(), True),
  StructField("count", StringType(), True),
])

In [14]:
df1 = spark.createDataFrame(df.rdd, schema=flightSchemaRectified)

In [15]:
type(df1.columns)

list

In [16]:
df1.printSchema()

root
 |-- dest: string (nullable = true)
 |-- origin: string (nullable = true)
 |-- count: string (nullable = true)



In [11]:
df1.show()

+--------------------+----------------+-----+
|                dest|          origin|count|
+--------------------+----------------+-----+
|       United States|         Romania|   15|
|       United States|         Croatia|    1|
|       United States|         Ireland|  344|
|               Egypt|   United States|   15|
|       United States|           India|   62|
|       United States|       Singapore|    1|
|       United States|         Grenada|   62|
|          Costa Rica|   United States|  588|
|             Senegal|   United States|   40|
|             Moldova|   United States|    1|
|       United States|    Sint Maarten|  325|
|       United States|Marshall Islands|   39|
|              Guyana|   United States|   64|
|               Malta|   United States|    1|
|            Anguilla|   United States|   41|
|             Bolivia|   United States|   30|
|       United States|        Paraguay|    6|
|             Algeria|   United States|    4|
|Turks and Caicos ...|   United St

In [12]:
df.collect()[1]

Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Croatia', count=1)

In [13]:
df.rdd.collect()[1]

Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Croatia', count=1)

In [17]:
df2  = spark.createDataFrame(df.rdd.collect()[0],  StringType())

In [18]:
df2

DataFrame[value: string]

In [19]:
df2.show()

+-------------+
|        value|
+-------------+
|United States|
|      Romania|
|           15|
+-------------+



In [39]:
from pyspark.sql import Row
from pyspark.sql.types import MapType

In [33]:
a =  Row('Jagadeesh', 'Jithendar')

In [34]:
a

<Row(Jagadeesh, Jithendar)>

In [35]:
a[0]

'Jagadeesh'

In [36]:
a[1]

'Jithendar'

In [37]:
b = Row(['Jagadeesh', 'Jithendra'])

In [38]:
b

TypeError: sequence item 0: expected str instance, list found