In [0]:
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._

In [1]:
// Input locations for the cleaned datasets produced by the individual team members.
val ticketDataFilePath = "/user/avk3358_nyu_edu/project/data/ticket-df.parquet"
val matchDataFilePath = "/user/avk3358_nyu_edu/project/data/cleaned-event-df.parquet"
val airbnbDataDir = "/user/mjd9571_nyu_edu/project/airbnb_listings_cleaned.csv"
val flightDataDir = "/user/zc2398_nyu_edu/flight-data-cleaned/"
val hotelDataDir = "/user/yl12081_nyu_edu/hotel_data_clean_refined.parquet"

// Parquet inputs carry their own schema.
val matchDF = spark.read.format("parquet").load(matchDataFilePath)
val ticketDF = spark.read.format("parquet").load(ticketDataFilePath)
val flightDF = spark.read.format("parquet").load(flightDataDir)
val hotelDF = spark.read.format("parquet").load(hotelDataDir)

// The Airbnb listings are CSV with a header row.
// NOTE(review): no schema or inferSchema option is set, so the columns presumably load as
// strings; confirm before relying on date or numeric comparisons against this frame.
val airbnbDF = spark.read.format("csv").option("header", "true").load(airbnbDataDir)

In [2]:
/**
 * Selects the matches in which both requested teams appear and attaches the cheapest
 * ticket price found for each of them.
 *
 * @param teams    team names; only the first two entries are read
 * @param matchDF  match rows; columns read: match_id, date, time, city, teams (array)
 * @param ticketDF ticket rows; columns read: match_id, ticket_price
 * @return one row per (date, teams, match_id, time, city) with `lowest_ticket_price`
 */
def getMatchDataByTeams(teams: Array[String], matchDF: DataFrame, ticketDF: DataFrame): org.apache.spark.sql.DataFrame = {
    // A match qualifies only when its `teams` array contains both names.
    val bothTeamsListed =
      array_contains(col("teams"), teams(0)) && array_contains(col("teams"), teams(1))
    val matchesForTeams = matchDF.filter(bothTeamsListed)

    // Inner join: matches without any ticket rows are dropped.
    val ticketsWithMatchInfo = ticketDF
      .join(matchesForTeams, matchesForTeams("match_id") === ticketDF("match_id"), "inner")
      .select(
        ticketDF("match_id"),
        matchesForTeams("date"),
        matchesForTeams("time"),
        matchesForTeams("city"),
        matchesForTeams("teams"),
        ticketDF("ticket_price")
      )

    ticketsWithMatchInfo
      .groupBy("date", "teams", "match_id", "time", "city")
      .agg(min("ticket_price").as("lowest_ticket_price"))
}

In [3]:
/**
 * Replaces each row's `price` with the average price (rounded to 2 decimals) of its
 * (arrival_datetime, airline, origin) group, then collapses rows that are identical in
 * every column except `id`, keeping the largest id as `latest_id`.
 *
 * @param inDF     flight rows; must contain arrival_datetime, airline, origin, price and id
 * @param flightDF not read by this function
 * @return `inDF` without `id`, with the averaged `price` and a `latest_id` column
 */
def getAverageFlightPriceDF(
    inDF: org.apache.spark.sql.DataFrame,
    flightDF: org.apache.spark.sql.DataFrame
) = {
  val priceKeys = Seq("arrival_datetime", "airline", "origin")

  val meanPriceByKey = inDF
    .groupBy(priceKeys.head, priceKeys.tail: _*)
    .agg(round(avg("price"), 2).as("avg_price"))

  // Swap the per-row price for the group average.
  val repriced = inDF
    .join(meanPriceByKey, priceKeys, "left")
    .drop("price")
    .withColumnRenamed("avg_price", "price")

  // Group on every remaining column except `id` so that duplicates collapse to one row.
  val groupingColumns = repriced.columns.collect { case name if name != "id" => col(name) }
  repriced
    .groupBy(groupingColumns: _*)
    .agg(max("id").as("latest_id"))
}

In [4]:
/**
 * Builds round-trip flight options for each match: an inbound flight into the match city
 * paired with a return flight from the match city back to the inbound flight's origin.
 *
 * Flights are read from the notebook-level `flightDF`; it is not a parameter.
 *
 * @param matchData match rows with `match_id`, `time`, `city`, `teams` and
 *                  `lowest_ticket_price`; `city` is compared against lower-cased flight
 *                  cities, so it is expected to be lower-case already
 * @return one row per (inbound, return) pair with `arrival_time`, `departure_time`,
 *         `airline`, `airline_out`, `origin`, `city`, `match_id`, `teams`,
 *         `lowest_ticket_price`, `arrival_flight_id`, `departure_flight_id` and
 *         `flight_price` (sum of both averaged leg prices)
 */
def getFlightDataFromMatch(matchData: org.apache.spark.sql.DataFrame): DataFrame = {
  val matchTime = matchData("time")

  // Inbound leg: lands in the match city between 3 days and 5 hours before the match time.
  val inboundFlightRaw = flightDF
    .join(
      matchData,
      lower(flightDF("destination_city")) === matchData("city") &&
        flightDF("arrival_datetime") < matchTime.minus(expr("interval 5 hours")) &&
        flightDF("arrival_datetime") > matchTime.minus(expr("interval 3 days"))
    )
    .select(
      "arrival_datetime",
      "airline",
      "id",
      "price",
      "city",
      "match_id",
      "teams",
      "origin_city",
      "lowest_ticket_price"
    )
    .withColumnRenamed("origin_city", "origin")
  val inboundFlight = getAverageFlightPriceDF(inboundFlightRaw, flightDF)
    .withColumnRenamed("latest_id", "arrival_flight_id")
    .withColumnRenamed("price", "in_price")

  // Return leg: leaves from the match city.
  // NOTE(review): this window is evaluated on arrival_datetime, exactly like the inbound
  // leg. For a return flight departure_datetime looks like the intended column (a flight
  // that departs before the match can still arrive more than 5 hours after it) - confirm.
  val outboundFlightRaw = flightDF
    .join(
      matchData,
      lower(flightDF("origin_city")) === matchData("city") &&
        flightDF("arrival_datetime") > matchTime.plus(expr("interval 5 hours")) &&
        flightDF("arrival_datetime") < matchTime.plus(expr("interval 3 days"))
    )
    .select(
      "departure_datetime",
      "arrival_datetime",
      "airline",
      "id",
      "price",
      "match_id",
      "destination_city",
      "origin_city"
    )
    .withColumnRenamed("destination_city", "return_city")
    .withColumnRenamed("origin_city", "origin")

  // The join key is renamed on this side so the joined result does not end up with two
  // columns called `match_id` (ambiguous to select, and rejected by file writers).
  val outboundFlight = getAverageFlightPriceDF(outboundFlightRaw, flightDF)
    .withColumnRenamed("latest_id", "departure_flight_id")
    .drop("arrival_datetime", "origin")
    .withColumnRenamed("airline", "airline_out")
    .withColumnRenamed("price", "out_price")
    .withColumnRenamed("match_id", "out_match_id")

  // Pair legs of the same match where the return flight goes back to the inbound origin.
  // This is an equi-join, so it is declared "inner" rather than "cross".
  inboundFlight
    .join(
      outboundFlight,
      inboundFlight("match_id") === outboundFlight("out_match_id") &&
        inboundFlight("origin") === outboundFlight("return_city"),
      "inner"
    )
    .withColumn("flight_price", col("in_price") + col("out_price"))
    .drop("in_price", "out_price", "return_city", "out_match_id")
    .withColumnRenamed("arrival_datetime", "arrival_time")
    .withColumnRenamed("departure_datetime", "departure_time")
}

In [5]:
// TODELETE: scratch cell that materialises the intermediate match/flight frames.
// NOTE(review): `teams` is only declared in a later cell (In [11]), so this cell cannot
// run before that one has been executed.
val matchCleaned: DataFrame = getMatchDataByTeams(teams, matchDF, ticketDF)
val matchFlightsDF: DataFrame = getFlightDataFromMatch(matchCleaned)

In [6]:
// Print the column layout of the round-trip flight options.
matchFlightsDF.printSchema()

In [7]:
// Print the column layout of the hotel input data.
hotelDF.printSchema()

In [8]:
// Print the column layout of the flight + hotel itinerary.
// NOTE(review): `hotelOutput` is only assigned in a later cell (In [13]); run that cell first.
hotelOutput.printSchema()

In [9]:
/**
 * Joins flight options with hotel stays in the same city whose check-in is on the arrival
 * date or the day after, and whose check-out is on the departure date.
 *
 * The `arrival_date` / `departure_date` helper columns are exposed as `arrival_time` /
 * `departure_time`. When the joined frame already has a column with the target name (the
 * itinerary planner passes both the timestamp and the derived date), the helper column is
 * dropped instead of renamed, so the result never carries two columns with the same name.
 *
 * @param flight flight rows with `city`, `arrival_date` and `departure_date`
 * @param hotel  hotel rows with `city`, `check_in_date` and `check_out_date`
 * @return the inner join of both inputs; both `city` columns are kept
 */
def flightJoinWithHotel(flight: DataFrame, hotel: DataFrame): DataFrame = {
  // Case-insensitive city comparison.
  val sameCity = lower(col("df1.city")) === lower(col("df2.city"))
  // arrival_date <= check_in_date <= arrival_date + 1 day
  val checkInOnArrivalOrNextDay =
    col("df2.check_in_date").geq(col("df1.arrival_date")) &&
      col("df2.check_in_date").leq(date_add(col("df1.arrival_date"), 1))
  val checkOutOnDeparture = col("df2.check_out_date") === col("df1.departure_date")

  val joined = flight.alias("df1")
    .join(
      hotel.alias("df2"),
      sameCity && checkInOnArrivalOrNextDay && checkOutOnDeparture,
      "inner"
    )

  Seq("departure_date" -> "departure_time", "arrival_date" -> "arrival_time")
    .foldLeft(joined) { case (df, (dateCol, timeCol)) =>
      // Renaming onto an existing name would create a duplicate column.
      if (df.columns.contains(timeCol)) df.drop(dateCol)
      else df.withColumnRenamed(dateCol, timeCol)
    }
}

In [10]:
/**
 * Joins flight options with Airbnb listings in the same city that are checked into on the
 * arrival date and checked out of on the departure date or the day before it.
 *
 * Only the Airbnb city is lower-cased, so `flightDataDF("city")` is expected to be
 * lower-case already.
 *
 * @param flightDataDF flight rows with `city`, `arrival_date` and `departure_date`
 * @param airbnbDF     listing rows with `city`, `checkin_date` and `checkout_date`
 * @return the inner join of both inputs (default join type)
 */
def flightJoinWithAirbnb(flightDataDF: DataFrame, airbnbDF: DataFrame): DataFrame = {
    val sameCity = flightDataDF("city") === lower(airbnbDF("city"))
    val checkinOnArrival = flightDataDF("arrival_date") === airbnbDF("checkin_date")
    // Departure either on the checkout day or one day after it.
    val departsAtOrAfterCheckout =
        flightDataDF("departure_date") === airbnbDF("checkout_date") ||
        flightDataDF("departure_date") === date_add(airbnbDF("checkout_date"), 1)

    flightDataDF.join(airbnbDF, sameCity && checkinOnArrival && departsAtOrAfterCheckout)
}

// TODO: this definition was left without a right-hand side, which does not compile.
// It is commented out until the intended expression (presumably a flightJoinWithAirbnb
// call) is filled in.
// val flightAirbnbJoinDF =

In [11]:
// The two teams whose shared fixtures are looked up by the cells in this notebook.
val teams = Array("Borussia Dortmund", "Paris Saint-Germain")

// `getMatchesByTeams` is not defined anywhere in this notebook; the available helper is
// getMatchDataByTeams, which additionally needs ticketDF and adds `lowest_ticket_price`.
val filteredMatchesDF = getMatchDataByTeams(teams, matchDF, ticketDF)

z.show(filteredMatchesDF)

In [12]:
/**
 * End-to-end itinerary: matches between the two teams, round-trip flights around each
 * match, and hotels covering the stay.
 *
 * @param teams    team names passed to getMatchDataByTeams
 * @param matchDF  match rows
 * @param ticketDF ticket rows
 * @param flightDF accepted but not forwarded: getFlightDataFromMatch is called with the
 *                 match data only
 * @param hotelDF  hotel rows joined on city and stay dates
 * @return flight + hotel combinations with a fixed set of columns removed
 */
def planItineraryFlightsAndHotels(teams: Array[String], matchDF: DataFrame, ticketDF: DataFrame, flightDF: DataFrame, hotelDF: DataFrame): DataFrame = {
    val matchesWithTickets = getMatchDataByTeams(teams, matchDF, ticketDF)
    val flightOptions = getFlightDataFromMatch(matchesWithTickets)

    // The hotel join compares calendar dates, so derive them from the flight timestamps.
    val withTravelDates = flightOptions
      .withColumn("arrival_date", to_date(col("arrival_time")))
      .withColumn("departure_date", to_date(col("departure_time")))

    val columnsToDrop = Seq(
      "country",
      "hotel_address",
      "avg_review",
      "square_feet",
      "hotel_info_date",
      "hotel_stay_duration"
    )
    flightJoinWithHotel(withTravelDates, hotelDF).drop(columnsToDrop: _*)
}

In [13]:
// Build the flight + hotel itinerary for the selected teams and render it in the notebook.
val hotelOutput: DataFrame =
  planItineraryFlightsAndHotels(teams, matchDF, ticketDF, flightDF, hotelDF)
z.show(hotelOutput)

In [14]:
/**
 * End-to-end itinerary: matches between the two teams, round-trip flights around each
 * match, and Airbnb listings covering the stay.
 *
 * @param teams    team names passed to getMatchDataByTeams
 * @param matchDF  match rows
 * @param ticketDF ticket rows
 * @param flightDF accepted but not forwarded: getFlightDataFromMatch is called with the
 *                 match data only
 * @param airbnbDF listing rows joined on city and stay dates
 * @return flight + Airbnb combinations without the `listing` column
 */
def planItineraryFlightsAndAirbnb(teams: Array[String], matchDF: DataFrame, ticketDF: DataFrame, flightDF: DataFrame, airbnbDF: DataFrame): DataFrame = {
    val matchesWithTickets = getMatchDataByTeams(teams, matchDF, ticketDF)
    val flightOptions = getFlightDataFromMatch(matchesWithTickets)

    // The Airbnb join compares calendar dates, so derive them from the flight timestamps.
    val withTravelDates = flightOptions
      .withColumn("arrival_date", to_date(col("arrival_time")))
      .withColumn("departure_date", to_date(col("departure_time")))

    flightJoinWithAirbnb(withTravelDates, airbnbDF).drop("listing")
}

In [15]:
// Build the flight + Airbnb itinerary for the selected teams and render it in the notebook.
val airbnbOutput: DataFrame =
  planItineraryFlightsAndAirbnb(teams, matchDF, ticketDF, flightDF, airbnbDF)
z.show(airbnbOutput)

In [16]:
// Print the column layout of the flight + Airbnb itinerary.
airbnbOutput.printSchema()