In [0]:
df = spark.read.json('dbfs:/FileStore/flight_data/*.json')
print(df.schema)

StructType([StructField('DEST_COUNTRY_NAME', StringType(), True), StructField('ORIGIN_COUNTRY_NAME', StringType(), True), StructField('count', LongType(), True)])


In [0]:
from pyspark.sql.types import StructType, StructField, StringType, LongType
myManualSchema = StructType([StructField('DEST_COUNTRY_NAME',StringType(),True), StructField('ORIGIN_COUNTRY_NAME', StringType(), True), StructField('count', LongType(), True, metadata={'count':'contains the number of flights'})])

In [0]:
df = spark.read.format("json").schema(myManualSchema)\
.load("dbfs:/FileStore/flight_data/*.json")

In [0]:
df.createOrReplaceTempView('dfTable')

In [0]:
df.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: long (nullable = true)



In [0]:
df.show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
|    United States|            Ireland|  344|
|            Egypt|      United States|   15|
|    United States|              India|   62|
+-----------------+-------------------+-----+
only showing top 5 rows



In [0]:
from pyspark.sql.functions import col
df["count"] + 2

Out[7]: Column<'(count + 2)'>

In [0]:
from pyspark.sql.functions import expr
df.selectExpr("*","(count + 2) as add2").show(5)

+-----------------+-------------------+-----+----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|add2|
+-----------------+-------------------+-----+----+
|    United States|            Romania|   15|  17|
|    United States|            Croatia|    1|   3|
|    United States|            Ireland|  344| 346|
|            Egypt|      United States|   15|  17|
|    United States|              India|   62|  64|
+-----------------+-------------------+-----+----+
only showing top 5 rows



In [0]:
df.columns

Out[9]: ['DEST_COUNTRY_NAME', 'ORIGIN_COUNTRY_NAME', 'count']

In [0]:
df.select("DEST_COUNTRY_NAME").show(2)


+-----------------+
|DEST_COUNTRY_NAME|
+-----------------+
|    United States|
|    United States|
+-----------------+
only showing top 2 rows



In [0]:
%sql
SELECT DEST_COUNTRY_NAME FROM dfTable LIMIT 2

DEST_COUNTRY_NAME
United States
United States


In [0]:
from pyspark.sql.functions import expr, col, column
df.select(
expr("DEST_COUNTRY_NAME"),
col("DEST_COUNTRY_NAME"),
column("DEST_COUNTRY_NAME"))\
.show(2)

+-----------------+-----------------+-----------------+
|DEST_COUNTRY_NAME|DEST_COUNTRY_NAME|DEST_COUNTRY_NAME|
+-----------------+-----------------+-----------------+
|    United States|    United States|    United States|
|    United States|    United States|    United States|
+-----------------+-----------------+-----------------+
only showing top 2 rows



In [0]:
df.select(expr("DEST_COUNTRY_NAME AS destination")).show(2)

+-------------+
|  destination|
+-------------+
|United States|
|United States|
+-------------+
only showing top 2 rows



In [0]:
df.select(expr("DEST_COUNTRY_NAME as destination").alias("DEST_COUNTRY_NAME"))\
.show(2)

+-----------------+
|DEST_COUNTRY_NAME|
+-----------------+
|    United States|
|    United States|
+-----------------+
only showing top 2 rows



In [0]:
df.selectExpr("DEST_COUNTRY_NAME as newColumnName", "DEST_COUNTRY_NAME").show(2)

+-------------+-----------------+
|newColumnName|DEST_COUNTRY_NAME|
+-------------+-----------------+
|United States|    United States|
|United States|    United States|
+-------------+-----------------+
only showing top 2 rows



In [0]:
df.selectExpr(
"*", # all original columns
"(DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME) as withinCountry")\
.show(2)

+-----------------+-------------------+-----+-------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|withinCountry|
+-----------------+-------------------+-----+-------------+
|    United States|            Romania|   15|        false|
|    United States|            Croatia|    1|        false|
+-----------------+-------------------+-----+-------------+
only showing top 2 rows



In [0]:
df.selectExpr("avg(count)", "count(distinct(DEST_COUNTRY_NAME))").show()

+------------------+---------------------------------+
|        avg(count)|count(DISTINCT DEST_COUNTRY_NAME)|
+------------------+---------------------------------+
|1718.3189081225032|                              167|
+------------------+---------------------------------+



In [0]:
%sql
SELECT *, (DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME) as withinCountry
FROM dfTable
LIMIT 2

DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count,withinCountry
United States,Romania,15,False
United States,Croatia,1,False


In [0]:
from pyspark.sql.functions import lit
df.select(expr('*'),lit(1).alias('one')).show(2)

+-----------------+-------------------+-----+---+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|one|
+-----------------+-------------------+-----+---+
|    United States|            Romania|   15|  1|
|    United States|            Croatia|    1|  1|
+-----------------+-------------------+-----+---+
only showing top 2 rows



In [0]:
%sql
SELECT *, 1 as One FROM dfTable LIMIT 2

DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count,One
United States,Romania,15,1
United States,Croatia,1,1


In [0]:
df.withColumn("numberOne", lit(1)).show(2)

+-----------------+-------------------+-----+---------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|numberOne|
+-----------------+-------------------+-----+---------+
|    United States|            Romania|   15|        1|
|    United States|            Croatia|    1|        1|
+-----------------+-------------------+-----+---------+
only showing top 2 rows



In [0]:
df.withColumn("withinCountry", expr("ORIGIN_COUNTRY_NAME == DEST_COUNTRY_NAME"))\
.show(2)

+-----------------+-------------------+-----+-------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|withinCountry|
+-----------------+-------------------+-----+-------------+
|    United States|            Romania|   15|        false|
|    United States|            Croatia|    1|        false|
+-----------------+-------------------+-----+-------------+
only showing top 2 rows



In [0]:
df.withColumnRenamed("DEST_COUNTRY_NAME", "dest").columns

Out[23]: ['dest', 'ORIGIN_COUNTRY_NAME', 'count']

In [0]:
dfWithLongColName = df.withColumn(
"This Long Column-Name",
expr("ORIGIN_COUNTRY_NAME"))

In [0]:
#Use ` to escape special characters
dfWithLongColName.selectExpr(
"`This Long Column-Name`",
"`This Long Column-Name` as `new col`")\
.show(2)

+---------------------+-------+
|This Long Column-Name|new col|
+---------------------+-------+
|              Romania|Romania|
|              Croatia|Croatia|
+---------------------+-------+
only showing top 2 rows



In [0]:
%sql
-- set spark.sql.caseSensitive true

In [0]:
#dropping columns can be done by using df.SELECT, also,
df.drop("ORIGIN_COUNTRY_NAME").columns

Out[26]: ['DEST_COUNTRY_NAME', 'count']

In [0]:
dfWithLongColName.drop("ORIGIN_COUNTRY_NAME", "DEST_COUNTRY_NAME")

Out[27]: DataFrame[count: bigint, This Long Column-Name: string]

In [0]:
#type casting
df.withColumn("count2", col("count").cast("long"))
#-- in SQL
#SELECT *, cast(count as long) AS count2 FROM dfTable

Out[28]: DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: bigint, count2: bigint]

In [0]:
df.where('count < 2').show(2)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Croatia|    1|
|    United States|          Singapore|    1|
+-----------------+-------------------+-----+
only showing top 2 rows



In [0]:
df.filter("count < 2").show(2)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Croatia|    1|
|    United States|          Singapore|    1|
+-----------------+-------------------+-----+
only showing top 2 rows



In [0]:
df.filter(col('count')<2).show(2)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Croatia|    1|
|    United States|          Singapore|    1|
+-----------------+-------------------+-----+
only showing top 2 rows



In [0]:
%sql
Select * from dfTable where count<2 limit 2

DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count
United States,Croatia,1
United States,Singapore,1


In [0]:
df.where(col("count") < 2).where(col("ORIGIN_COUNTRY_NAME") != "Croatia")\
.show(2)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|          Singapore|    1|
|          Moldova|      United States|    1|
+-----------------+-------------------+-----+
only showing top 2 rows



In [0]:
df.where("(count < 2) or (ORIGIN_COUNTRY_NAME != 'Croatia')").show(2)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
+-----------------+-------------------+-----+
only showing top 2 rows



In [0]:
%sql
SELECT * FROM dfTable WHERE count < 2 AND ORIGIN_COUNTRY_NAME != "Croatia"
LIMIT 2

DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count
United States,Singapore,1
Moldova,United States,1


In [0]:
df.count()

Out[36]: 1502

In [0]:
#df.distinct() can be used to remove duplicates from a DataFrame
df.distinct().count()

Out[37]: 1328

In [0]:
df.select("ORIGIN_COUNTRY_NAME", "DEST_COUNTRY_NAME").distinct().count()

Out[38]: 320

In [0]:
%sql
Select count(distinct(ORIGIN_COUNTRY_NAME)) from dfTable

count(DISTINCT ORIGIN_COUNTRY_NAME)
154


In [0]:
#Random Samples
seed = 5
withReplacement = False
fraction = 0.5
df.sample(withReplacement, fraction, seed).count()

Out[40]: 756

In [0]:
#Random splits for dividing into training, test and validation sets
testDf,trainDf = df.randomSplit([0.25, 0.75], 7)
testDf.count() > trainDf.count()

Out[41]: False

In [0]:
from pyspark.sql import Row
schema = df.schema
newRows = [
Row("New Country", "Other Country", 5),
Row("New Country 2", "Other Country 3", 1)
]
parallelizedRows = spark.sparkContext.parallelize(newRows)
newDF = spark.createDataFrame(parallelizedRows, schema)

In [0]:
df.union(newDF).where("(count = 1) or (count = 5)").where("(ORIGIN_COUNTRY_NAME = 'Other Country') or (ORIGIN_COUNTRY_NAME = 'Other Country 3')").show()

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|      New Country|      Other Country|    5|
|    New Country 2|    Other Country 3|    1|
+-----------------+-------------------+-----+



In [0]:
from pyspark.sql.functions import desc, asc, desc_nulls_first, desc_nulls_last, asc_nulls_first
df.orderBy(expr("count desc")).show(2)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|               Togo|    1|
|            Malta|      United States|    1|
+-----------------+-------------------+-----+
only showing top 2 rows



In [0]:
df.sort(col("count").desc()).show(2)

+-----------------+-------------------+------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME| count|
+-----------------+-------------------+------+
|    United States|      United States|370002|
|    United States|      United States|358354|
+-----------------+-------------------+------+
only showing top 2 rows



In [0]:
df.orderBy(col("count").desc()).show(2)

+-----------------+-------------------+------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME| count|
+-----------------+-------------------+------+
|    United States|      United States|370002|
|    United States|      United States|358354|
+-----------------+-------------------+------+
only showing top 2 rows



In [0]:
df.selectExpr("max(count)").show()

+----------+
|max(count)|
+----------+
|    370002|
+----------+



In [0]:
df.where("count = 370002").show()

+-----------------+-------------------+------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME| count|
+-----------------+-------------------+------+
|    United States|      United States|370002|
+-----------------+-------------------+------+



In [0]:
df.orderBy(col("count").desc(), col("DEST_COUNTRY_NAME").asc()).show(5)

+-----------------+-------------------+------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME| count|
+-----------------+-------------------+------+
|    United States|      United States|370002|
|    United States|      United States|358354|
|    United States|      United States|352742|
|    United States|      United States|348113|
|    United States|      United States|347452|
+-----------------+-------------------+------+
only showing top 5 rows



In [0]:
%sql
Select * FROM dfTable
ORDER by count desc,
DEST_COUNTRY_NAME asc
LIMIT 2

DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count
United States,United States,370002
United States,United States,358354


In [0]:
df.sortWithinPartitions(col("count").desc()).show(5)

+-----------------+-------------------+------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME| count|
+-----------------+-------------------+------+
|    United States|      United States|370002|
|    United States|      United States|348113|
|    United States|             Canada|  8483|
|           Canada|      United States|  8399|
|    United States|             Canada|  8305|
+-----------------+-------------------+------+
only showing top 5 rows



In [0]:
df.sort(col("count").desc()).show(5)

+-----------------+-------------------+------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME| count|
+-----------------+-------------------+------+
|    United States|      United States|370002|
|    United States|      United States|358354|
|    United States|      United States|352742|
|    United States|      United States|348113|
|    United States|      United States|347452|
+-----------------+-------------------+------+
only showing top 5 rows



In [0]:
df.limit(6).show()

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
|    United States|            Ireland|  344|
|            Egypt|      United States|   15|
|    United States|              India|   62|
|    United States|          Singapore|    1|
+-----------------+-------------------+-----+



In [0]:
df.rdd.getNumPartitions()

Out[73]: 3

In [0]:
x = df.repartition(5)

In [0]:
x.rdd.getNumPartitions()

Out[77]: 5

In [0]:
x = df.repartition(5,col("DEST_COUNTRY_NAME"))
x.rdd.getNumPartitions()

Out[79]: 5

In [0]:
x = df.repartition(5, col("DEST_COUNTRY_NAME")).coalesce(2)
x.rdd.getNumPartitions()

Out[80]: 2

In [0]:
'''collectDF = df.limit(10)
collectDF.take(5) # take works with an Integer count
collectDF.show() # this prints it out nicely
collectDF.show(5, False)
collectDF.collect()
collectDF.toLocalIterator()'''