In [1]:
sc

<pyspark.context.SparkContext at 0x1094d5fd0>

In [2]:
# Set up the file paths for the data.
# The directory is written once so the notebook can be pointed at another
# location by editing a single line instead of three.
DATA_DIR = 'file:///Users/yiyingwang/desktop/ApacheSpark/udemy/data'
airlinesPath = DATA_DIR + '/airlines.csv'
airportsPath = DATA_DIR + '/airports.csv'
flightsPath = DATA_DIR + '/flights.csv'

In [None]:
airlines=sc.textFile(airlinesPath)

In [None]:
# airlines is an RDD
# (print(...) with a single argument behaves the same on Python 2 and Python 3)
print(airlines)

In [None]:
# use the collect operation
airlines.collect()

In [None]:
airlines.first()

In [None]:
airlines.take(10)

In [None]:
airlines.count()

In [None]:
# drop the header: keep only the lines that do not contain "Description"
airlinesWoHeader = airlines.filter(lambda line: not ("Description" in line))

In [None]:
# print(...) with a single argument behaves the same on Python 2 and Python 3
print(airlinesWoHeader)

In [None]:
airlinesWoHeader.take(10)

In [None]:
airlinesPared=airlinesWoHeader.map(lambda x: x.split(",")).take(10)

In [None]:
airlinesPared

In [None]:
airlines.map(len).take(10)

In [None]:
def notHeader(row):
    """Return True when `row` should be kept as data.

    A row is treated as the header when it contains the text
    "Description"; every other row is kept.
    """
    is_header = "Description" in row
    return not is_header

In [None]:
airlines.filter(notHeader).take(10)

In [None]:
# chain transformations together: drop the header, split each remaining
# line on commas, then look at the first 10 rows
(airlines
    .filter(notHeader)
    .map(lambda line: line.split(","))
    .take(10))


In [None]:
# use python libraries
import csv
try:
    from StringIO import StringIO  # Python 2
except ImportError:
    from io import StringIO  # Python 3: the StringIO module moved into io

def split(line):
    """Parse one line of CSV text into a list of field strings.

    The csv module is used instead of ``line.split(",")`` so that a quoted
    field containing a comma stays in one piece.

    Args:
        line: a single line of CSV text.

    Returns:
        The list of fields on that line, or ``[]`` when the line is empty.
    """
    reader = csv.reader(StringIO(line))
    # The next() builtin works on Python 2.6+ and 3, unlike reader.next().
    # The default keeps an empty line from raising StopIteration, which would
    # otherwise escape from this function when it is mapped over an RDD.
    return next(reader, [])

airlines.filter(notHeader).map(split).take(10)

In [3]:
flights=sc.textFile(flightsPath)

In [None]:
flights.count()

In [None]:
flights.take(10)
# flight date, airline code, flight num, source airport, destination airport, departure time, departure delay,
# arrival time, arrival delay, airtime, distance

In [None]:
# split every line on commas so that each row becomes a list of strings
flightsParsed = flights.map(lambda line: line.split(','))

In [4]:
# Represent each flight as a typed record so columns can be referenced by name:
#   1. manufacture a lightweight record class with the namedtuple factory,
#   2. convert the string fields to dates, times and floats,
#   3. turn every parsed row (a list of strings) into one such record.

from datetime import datetime
from collections import namedtuple

fields = ('date', 'airline', 'flightnum', 'origin', 'dest', 'dep', 'dep_delay',
          'arv', 'arv_delay', 'airtime', 'distance')
try:
    # verbose=True prints the source of the class that namedtuple generates.
    Flight = namedtuple('Flight', fields, verbose=True)
except TypeError:
    # Python 3.7 removed the `verbose` argument; build the same class quietly.
    Flight = namedtuple('Flight', fields)
DATE_FMT = "%Y-%m-%d"
TIME_FMT = "%H%M"


def parse(row):
    """Convert one split CSV row into a ``Flight`` record.

    The input list is left untouched; only its first 11 items are read and
    any extra items are ignored.

    Args:
        row: sequence of strings ordered as in ``fields`` (date, airline,
            flightnum, origin, dest, dep, dep_delay, arv, arv_delay,
            airtime, distance).

    Returns:
        A ``Flight`` whose ``date`` is a ``datetime.date``, whose ``dep`` and
        ``arv`` are ``datetime.time`` values, and whose ``dep_delay``,
        ``arv_delay``, ``airtime`` and ``distance`` are floats. The remaining
        fields keep their original string values.

    Raises:
        ValueError: a date does not match ``DATE_FMT``, a time does not match
            ``TIME_FMT``, or a numeric field is not a valid float.
        IndexError: the row has fewer than 11 items.
    """
    return Flight(
        date=datetime.strptime(row[0], DATE_FMT).date(),
        airline=row[1],
        flightnum=row[2],
        origin=row[3],
        dest=row[4],
        dep=datetime.strptime(row[5], TIME_FMT).time(),
        dep_delay=float(row[6]),
        arv=datetime.strptime(row[7], TIME_FMT).time(),
        arv_delay=float(row[8]),
        airtime=float(row[9]),
        distance=float(row[10]),
    )

class Flight(tuple):
    'Flight(date, airline, flightnum, origin, dest, dep, dep_delay, arv, arv_delay, airtime, distance)'

    __slots__ = ()

    _fields = ('date', 'airline', 'flightnum', 'origin', 'dest', 'dep', 'dep_delay', 'arv', 'arv_delay', 'airtime', 'distance')

    def __new__(_cls, date, airline, flightnum, origin, dest, dep, dep_delay, arv, arv_delay, airtime, distance):
        'Create new instance of Flight(date, airline, flightnum, origin, dest, dep, dep_delay, arv, arv_delay, airtime, distance)'
        return _tuple.__new__(_cls, (date, airline, flightnum, origin, dest, dep, dep_delay, arv, arv_delay, airtime, distance))

    @classmethod
    def _make(cls, iterable, new=tuple.__new__, len=len):
        'Make a new Flight object from a sequence or iterable'
        result = new(cls, iterable)
        if len(result) != 11:
            raise TypeError('Expected 11 arguments, got %d' % len(result))
        return result

    def __repr__(self):
        'Return a nicely

In [5]:
# process each row in the dataset: split the line on commas, then convert
# the resulting list with parse
flightsParsed = (
    flights
    .map(lambda line: line.split(","))
    .map(parse)  # a copy of the function is sent to each node
)
# functions shipped to the nodes like this are called closures; Spark is built
# on Scala, which supports the use of closures

In [None]:
flightsParsed.first()

In [None]:
# we can access the values in the Flight object using the field name
flightsParsed.map(lambda x: x.distance).first()

# Compute the average distance travelled by a flight

In [None]:
# compute the total distance travelled by all flights
# (reduce takes a function that acts on two elements and returns an object of
# the same type)
totalDistance = flightsParsed.map(lambda x: x.distance).reduce(lambda x,y: x+y)
# divide the total by the number of flights to get the average
avgDistance = totalDistance/flightsParsed.count()
# print(...) with a single argument behaves the same on Python 2 and Python 3
print(avgDistance)

# Compute the % of flights which had delays

In [None]:
# fraction (0 to 1) of flights whose departure delay is positive;
# float() keeps the division of the two counts from truncating on Python 2
flightsParsed.filter(lambda flight: flight.dep_delay > 0).count() / float(flightsParsed.count())

In [6]:
# most of the computations reuse the flightsParsed RDD, so mark it to be kept
# by Spark after it is first computed rather than rebuilt for every action
flightsParsed.persist()
# to release it again later: flightsParsed.unpersist()

PythonRDD[2] at RDD at PythonRDD.scala:43