## Content
1. Caching DataFrames

In [3]:
import findspark

# Let findspark locate the local Spark installation and make `pyspark`
# importable; this is run here, ahead of the `pyspark` imports in the next cell.
findspark.init()

<IPython.core.display.Javascript object>

In [4]:
from pyspark.sql import SparkSession
# NOTE(review): the two wildcard imports below pull every public name of these
# modules into the notebook namespace (pyspark.sql.functions exports names such
# as `sum`, `min` and `max`, which shadow the Python builtins). Consider
# importing only the names that are actually used.
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Reuse the already-running SparkSession if there is one; otherwise create a
# new session with default settings.
spark = SparkSession.builder.getOrCreate()
print(spark)

<pyspark.sql.session.SparkSession object at 0x000001F666C58580>


<IPython.core.display.Javascript object>

### Reading Data
You'll be working with a new dataset consisting of airline departure information. It may have repetitive data and will need to be de-duplicated.

In [12]:
# Time a count() on the de-duplicated DataFrame before and after it is cached.
# NOTE: when running locally on a small file the benefit of caching is not
# obvious -- timing noise can outweigh it (the recorded run below even shows
# the second count being slower).
import time

departures_df = spark.read.csv(
    "datasets/AA_DFW_2015_Departures_Short.csv", header=True, inferSchema=True
)

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() is wall-clock time and may jump or be coarse.
start_time = time.perf_counter()

# Add caching to the unique rows in departures_df. cache() only marks the
# DataFrame for caching; the data is materialised by the first action below.
departures_df = departures_df.distinct().cache()

# Count the unique rows in departures_df, noting how long the operation takes
# (this first count computes distinct() and populates the cache).
row_count = departures_df.count()
elapsed = time.perf_counter() - start_time
print("Counting %d rows took %f seconds" % (row_count, elapsed))

# Count the rows again, noting the variance in time of a cached DataFrame
start_time = time.perf_counter()
row_count = departures_df.count()
elapsed = time.perf_counter() - start_time
print("Counting %d rows again took %f seconds" % (row_count, elapsed))


Counting 157198 rows took 1.310269 seconds
Counting 157198 rows again took 1.774900 seconds


<IPython.core.display.Javascript object>

### Removing data from the cache using unpersist()

In [14]:
# Report the cache flag of departures_df, drop it from the cache, then report
# the flag again to confirm the change.
status_template = "Is departures_df cached?: %s"

print(status_template % departures_df.is_cached)
print("Removing departures_df from cache")

# unpersist() releases the cached copy; the DataFrame itself stays usable.
departures_df.unpersist()

print(status_template % departures_df.is_cached)


Is departures_df cached?: True
Removing departures_df from cache
Is departures_df cached?: False


<IPython.core.display.Javascript object>

### Quick pipelines

In [16]:
# Small pipeline: filter rows, tag each remaining row with an id, write JSON.
departures_df = (
    departures_df
    # Keep only rows whose column at positional index 3 is greater than 0.
    # NOTE(review): presumably index 3 is the flight-duration column -- confirm
    # against the CSV header and consider referring to it by name.
    .filter(departures_df[3] > 0)
    # Add an 'id' column populated by monotonically_increasing_id().
    .withColumn('id', monotonically_increasing_id())
)

# Write the result out in JSON format, replacing any previous output.
departures_df.write.json('output.json', mode='overwrite')


<IPython.core.display.Javascript object>