# Analysing Air Traffic Data with Spark

#### Load Dataset

In [66]:
// Path to the raw flights extract (one comma-separated record per line).
val flightsCSV = "datasets/flights.csv"
// Read the file line by line and discard any line that mentions the
// FL_DATE column name, i.e. the header row.
val flightsDS = sc.textFile(flightsCSV).filter { line =>
  !line.contains("FL_DATE")
}

In [67]:
// Confirm the header row was filtered out by inspecting the first two records
flightsDS.take(2)

Array(2017-01-01,19805,14683,11057,"0819","1149",1095.00,, 2017-01-01,19805,14107,10423,"1025","1351",872.00,)

#### Create a Wrapper Class

In [76]:
import org.joda.time.format.DateTimeFormat

/**
  * One flight record from the flights CSV file.
  *
  * @param flightDate      calendar date of the flight
  * @param airlineId       numeric identifier of the airline
  * @param originAirportId numeric identifier of the departure airport
  * @param destAirportId   numeric identifier of the arrival airport
  * @param departuretime   departure time exactly as it appears in the file
  *                        (sample rows look like a quoted "hhmm" string -- TODO confirm format)
  * @param arrivalTime     arrival time exactly as it appears in the file (same format as departuretime)
  * @param distance        flight distance (reported in miles by the notebook)
  */
// The date type is fully qualified so the definition compiles without relying on an
// import from another cell. Case classes are already Serializable, so no explicit
// `extends java.io.Serializable` is required for use inside Spark closures.
case class USACarrier(
  flightDate: org.joda.time.LocalDate,
  airlineId: Int,
  originAirportId: Int,
  destAirportId: Int,
  departuretime: String,
  arrivalTime: String,
  distance: Double
)

/** Converts raw comma-separated lines of the flights file into [[USACarrier]] records. */
object USACarrierParser extends Serializable {

  /**
    * Parses a single data row.
    *
    * Expected column order: flight date (yyyy-MM-dd), airline id, origin airport id,
    * destination airport id, departure time, arrival time, distance. The time columns
    * are stored verbatim, including any quote characters present in the file.
    *
    * @param row one non-header line of the CSV file
    * @return the typed record for that line
    * @note a row with fewer than seven columns or with non-numeric id/distance values
    *       makes this method throw; no recovery is attempted here.
    */
  def parse(row: String): USACarrier = {
    // A trailing empty column (rows end with ",") is dropped by split, leaving seven fields.
    val fields = row.split(",")

    // "MM" is month-of-year. The lower-case "mm" used previously means minute-of-hour
    // in Joda-Time patterns, so the month was never read and silently defaulted to January.
    val dateFormat = DateTimeFormat.forPattern("yyyy-MM-dd")

    USACarrier(
      // Parse straight to a LocalDate so no time zone (and no DST gap at midnight) is involved.
      flightDate = dateFormat.parseLocalDate(fields(0)),
      airlineId = fields(1).toInt,
      originAirportId = fields(2).toInt,
      destAirportId = fields(3).toInt,
      departuretime = fields(4),
      arrivalTime = fields(5),
      distance = fields(6).toDouble
    )
  }
}


In [82]:
// Turn every raw CSV line into a typed USACarrier record.
val flightsParsed = flightsDS.map(row => USACarrierParser.parse(row))
// Look at the first parsed record as a quick sanity check.
flightsParsed.first()

USACarrier(2017-01-01,19805,14683,11057,"0819","1149",1095.0)

#### Calculate the Average Distance Travelled

In [84]:
// Distance of every flight, one Double per record.
val flightsDistance = flightsParsed.map(_.distance)
// Accumulate the running sum and the record count together in a single pass, so the
// input is read and parsed once instead of once for the sum and again for the count.
// Unlike reduce, aggregate starts from a zero value and therefore does not fail when
// the dataset is empty.
val (totalDistance, flightCount) = flightsDistance.aggregate((0.0, 0L))(
  (acc, miles) => (acc._1 + miles, acc._2 + 1L),
  (left, right) => (left._1 + right._1, left._2 + right._2)
)
// Floating-point division: an empty dataset yields NaN rather than an exception.
val averageDistance = totalDistance / flightCount

In [86]:
// Report the mean flight distance computed above (typo "tavelled" corrected).
print(s"Average distance travelled in January was $averageDistance miles")

Average distance tavelled in January was 852.125139716944 miles