In [0]:

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._

In [1]:
// Input locations and the DataFrames loaded from them.
// Each path is declared next to the read that consumes it.

// Match/event data (parquet).
val matchDataFilePath = "/user/avk3358_nyu_edu/project/data/cleaned-event-df.parquet"
val matchDF = spark.read.parquet(matchDataFilePath)

// Ticket listings (parquet).
val ticketDataFilePath = "/user/avk3358_nyu_edu/project/data/ticket-df.parquet"
val ticketDF = spark.read.parquet(ticketDataFilePath)

// Airbnb listings (CSV with a header row; no schema is supplied, so columns load as strings).
val airbnbDataFilePath = "/user/mjd9571_nyu_edu/project/airbnb_listings_cleaned.csv"
val airbnbDF = spark.read
  .option("header", "true")
  .csv(airbnbDataFilePath)

// Flight data (parquet directory).
val flightDataFilePath = "/user/zc2398_nyu_edu/flight-data-cleaned/"
val flightDF = spark.read.parquet(flightDataFilePath)

// Hotel data (parquet).
val hotelDataFilePath = "/user/yl12081_nyu_edu/hotel_data_clean_refined.parquet"
val hotelDF = spark.read.parquet(hotelDataFilePath)

In [2]:

/** Keeps only the matches whose `teams` array column contains both of the first
  * two entries of `teams`.
  *
  * @param teams     team names; only indices 0 and 1 are read
  * @param matchesDF matches with an array column named `teams`
  * @return the rows of `matchesDF` that list both teams
  */
def getMatchesByTeams(teams: Array[String], matchesDF: org.apache.spark.sql.DataFrame): org.apache.spark.sql.DataFrame = {
  val firstTeam = teams(0)
  val secondTeam = teams(1)

  val listsBothTeams =
    array_contains($"teams", firstTeam) && array_contains($"teams", secondTeam)

  matchesDF.where(listsBothTeams)
}

In [3]:

// The two teams whose fixtures the rest of the notebook works with.
val teams = Array("Real Madrid", "Bayern Munich")

// Matches whose `teams` column contains both selected teams.
val filteredMatchesDF = getMatchesByTeams(teams, matchDF)

// Render the filtered matches (`z` is presumably the Zeppelin context — confirm).
z.show(filteredMatchesDF)

In [4]:

// Ticket rows for the selected fixtures, keeping the match date, teams,
// match id and ticket price.
val joinedDF = {
  val onMatchId = filteredMatchesDF("match_id") === ticketDF("match_id")

  ticketDF
    .join(filteredMatchesDF, onMatchId, "inner")
    .select(
      filteredMatchesDF("date"),
      filteredMatchesDF("teams"),
      ticketDF("match_id"),
      ticketDF("ticket_price")
    )
}

// Number of ticket rows that matched a selected fixture.
joinedDF.count()

In [5]:

/** Builds one row per match between the two given teams, carrying the cheapest
  * ticket price found for that match.
  *
  * @param teams    team names; the first two entries are used to select matches
  * @param matchDF  matches with `match_id`, `date`, `time`, `city` and an array column `teams`
  * @param ticketDF tickets with `match_id` and `ticket_price`
  * @return columns `date`, `teams`, `match_id`, `time`, `city`, `lowest_ticket_price`;
  *         matches without any ticket row are absent because the join is inner
  */
def getMatchDataByTeams(teams: Array[String], matchDF: org.apache.spark.sql.DataFrame, ticketDF: org.apache.spark.sql.DataFrame): org.apache.spark.sql.DataFrame = {
  // Reuse the shared team filter instead of duplicating its predicate here,
  // so the two entry points cannot drift apart.
  val matchData = getMatchesByTeams(teams, matchDF)

  val ticketsWithMatch = ticketDF
    .join(matchData, matchData("match_id") === ticketDF("match_id"), "inner")
    .select(
      ticketDF("match_id"),
      matchData("date"),
      matchData("time"),
      matchData("city"),
      matchData("teams"),
      ticketDF("ticket_price")
    )

  // Collapse the ticket rows of each match down to the minimum price.
  ticketsWithMatch
    .groupBy($"date", $"teams", $"match_id", $"time", $"city")
    .agg(min("ticket_price").alias("lowest_ticket_price"))
}

In [6]:

// Per-match summary (date, time, city, teams, lowest ticket price) for the selected teams.
val matchCleaned = getMatchDataByTeams(teams, matchDF, ticketDF)
// Render the summary (`z` is presumably the Zeppelin context — confirm).
z.show(matchCleaned)

In [7]:


/** Replaces each flight's `price` with the average price (rounded to 2 decimals)
  * of all rows sharing its `arrival_datetime`, `airline` and `origin`, then
  * collapses rows that are identical in every column except `id`, keeping the
  * largest `id` as `latest_id`.
  *
  * @param inDF     flight rows with at least `arrival_datetime`, `airline`, `origin`, `price`, `id`
  * @param flightDF not read by this function
  * @return `inDF`'s columns with `price` averaged and `id` replaced by `latest_id`
  */
def getAverageFlightPriceDF(inDF:org.apache.spark.sql.DataFrame, flightDF:org.apache.spark.sql.DataFrame) = {
    val priceKeys = Seq("arrival_datetime", "airline", "origin")

    // Mean price per (arrival time, airline, origin).
    val meanPriceByKey = inDF
      .groupBy(priceKeys.head, priceKeys.tail: _*)
      .agg(round(avg("price"), 2).alias("avg_price"))

    // Swap every row's own price for the group mean.
    val withMeanPrice = inDF
      .join(meanPriceByKey, priceKeys, "left")
      .drop("price")
      .withColumnRenamed("avg_price", "price")

    // Group on everything except `id` so otherwise-identical rows merge into one.
    val groupingColumns = withMeanPrice.columns.filterNot(_ == "id").map(col)
    withMeanPrice
      .groupBy(groupingColumns: _*)
      .agg(max("id").alias("latest_id"))
}


/** Pairs inbound and return flights for every match in `matchData`.
  *
  * Inbound flights land in the match city between 3 days and 5 hours before the
  * match `time`. Return flights leave the match city at least 5 hours after the
  * match `time` and land less than 3 days after it. An inbound flight is paired
  * with a return flight of the same match whose destination equals the inbound
  * origin city. Prices on each leg are averaged by `getAverageFlightPriceDF`.
  *
  * Reads the notebook-level `flightDF`.
  *
  * @param matchData matches with `match_id`, `time`, `city`, `teams`, `lowest_ticket_price`
  * @return one row per inbound/return pairing with a single `match_id`, the
  *         inbound columns (`airline`, `city`, `teams`, `origin`,
  *         `lowest_ticket_price`, `arrival_flight_id`, `arrival_time`), the return
  *         columns (`airline_out`, `departure_flight_id`, `departure_time`), plus
  *         `flight_price` (sum of both legs), `arrival_date` and `departure_date`
  */
def getFlightDataFromMatch(matchData: org.apache.spark.sql.DataFrame): org.apache.spark.sql.DataFrame = {
    val kickoff = matchData("time")
    val minGap = expr("interval 5 hours")
    val maxGap = expr("interval 3 days")

    // Flights landing in the match city shortly before the match.
    val inboundFlightRaw = flightDF
      .join(
        matchData,
        lower(flightDF("destination_city")) === matchData("city") &&
          flightDF("arrival_datetime") < kickoff.minus(minGap) &&
          flightDF("arrival_datetime") > kickoff.minus(maxGap)
      )
      .select("arrival_datetime", "airline", "id", "price", "city", "match_id", "teams", "origin_city", "lowest_ticket_price")
      .withColumnRenamed("origin_city", "origin")

    val inboundFlight = getAverageFlightPriceDF(inboundFlightRaw, flightDF)
      .withColumnRenamed("latest_id", "arrival_flight_id")
      .withColumnRenamed("price", "in_price")

    // Flights leaving the match city after the match. The lower bound is on the
    // departure time: bounding the arrival time instead would admit flights that
    // take off before or during the match and merely land late.
    val outboundFlightRaw = flightDF
      .join(
        matchData,
        lower(flightDF("origin_city")) === matchData("city") &&
          flightDF("departure_datetime") > kickoff.plus(minGap) &&
          flightDF("arrival_datetime") < kickoff.plus(maxGap)
      )
      .select("departure_datetime", "arrival_datetime", "airline", "id", "price", "match_id", "destination_city", "origin_city")
      .withColumnRenamed("destination_city", "return_city")
      .withColumnRenamed("origin_city", "origin")

    val outboundFlight = getAverageFlightPriceDF(outboundFlightRaw, flightDF)
      .withColumnRenamed("latest_id", "departure_flight_id")
      .drop("arrival_datetime", "origin")
      .withColumnRenamed("airline", "airline_out")
      .withColumnRenamed("price", "out_price")

    // Alias both sides so the two `match_id` columns can be told apart by
    // qualifier; a Scala val name is not a DataFrame alias, so without these
    // aliases a qualified drop matches nothing and both columns survive.
    val inbound = inboundFlight.alias("inbound")
    val outbound = outboundFlight.alias("outbound")

    inbound
      .join(
        outbound,
        col("inbound.match_id") === col("outbound.match_id") &&
          col("inbound.origin") === col("outbound.return_city"),
        "inner"
      )
      // Drop the duplicate key straight after the join, while the alias still resolves.
      .drop(col("outbound.match_id"))
      .withColumn("flight_price", col("in_price") + col("out_price"))
      .drop("in_price", "out_price", "return_city")
      .withColumnRenamed("arrival_datetime", "arrival_time")
      .withColumnRenamed("departure_datetime", "departure_time")
      .withColumn("arrival_date", $"arrival_time".cast("date"))
      .withColumn("departure_date", $"departure_time".cast("date"))
}


In [8]:

// Inbound/return flight pairings for the selected matches.
val flightOutput = getFlightDataFromMatch(matchCleaned)
// Render the pairings (`z` is presumably the Zeppelin context — confirm).
z.show(flightOutput)

In [9]:
import org.apache.spark.sql.types.DoubleType
/** Attaches Airbnb listings to flight pairings.
  *
  * A listing matches when its lower-cased `city` equals the flight `city`, its
  * `checkin_date` equals the arrival date, and the departure date is either the
  * `checkout_date` or the day after it.
  *
  * @param flightOutput flight pairings with `city`, `arrival_date`, `departure_date`
  * @param airbnbDF     listings with `city`, `checkin_date`, `checkout_date`, `price`, `unique_id`
  * @return joined rows without the listing's `city`, `id`, `listing`, `desc`,
  *         `country`, `info_date` and `price` columns, with `hotel_price`
  *         (price as double), `check_in_date`, `check_out_date` and `hotel_id`
  */
def flightJoinWithAirbnb(flightOutput: DataFrame, airbnbDF: DataFrame): DataFrame = {
  val listings = airbnbDF.alias("a")

  val sameCity = flightOutput("city") === lower(airbnbDF("city"))
  val checkInOnArrival = flightOutput("arrival_date") === airbnbDF("checkin_date")
  val checkOutOnDeparture = flightOutput("departure_date") === airbnbDF("checkout_date")
  val checkOutDayBefore = flightOutput("departure_date") === date_add(airbnbDF("checkout_date"), 1)

  val joined = flightOutput.join(
    listings,
    sameCity && checkInOnArrival && (checkOutOnDeparture || checkOutDayBefore)
  )

  // Listing-side columns that are not carried forward.
  val listingColumnsToDrop = Seq("a.city", "a.id", "a.listing", "a.desc", "a.country", "a.info_date")
  val pruned = listingColumnsToDrop.foldLeft(joined) { (df, name) => df.drop(col(name)) }

  pruned
    .withColumn("hotel_price", col("price").cast("double"))
    .withColumnRenamed("checkin_date", "check_in_date")
    .withColumnRenamed("checkout_date", "check_out_date")
    .withColumnRenamed("unique_id", "hotel_id")
    .drop("price")
}

In [10]:
/** Attaches hotel offers to flight pairings.
  *
  * An offer matches when the cities are equal ignoring case, check-in falls on
  * the arrival date or the day after, and check-out equals the departure date.
  *
  * @param flight flight pairings with `city`, `arrival_date`, `departure_date`
  * @param hotel  hotel offers with `city`, `check_in_date`, `check_out_date`
  * @return inner-joined rows without the hotel's `city` column and without
  *         `country`, `hotel_address`, `avg_review`, `square_feet`,
  *         `hotel_info_date`, `hotel_stay_duration`, `hotel_name`
  */
def flightJoinWithHotel(flight: DataFrame, hotel: DataFrame): DataFrame = {
  val flights = flight.alias("df1")
  val hotels = hotel.alias("df2")

  val sameCity = lower(col("df1.city")) === lower(col("df2.city"))
  val checkInWithinOneDay =
    col("df2.check_in_date") >= col("df1.arrival_date") &&
      col("df2.check_in_date") <= date_add(col("df1.arrival_date"), 1)
  val checkOutOnDeparture = col("df2.check_out_date") === col("df1.departure_date")

  flights
    .join(hotels, sameCity && checkInWithinOneDay && checkOutOnDeparture, "inner")
    .drop(col("df2.city"))
    .drop("country", "hotel_address", "avg_review", "square_feet", "hotel_info_date", "hotel_stay_duration", "hotel_name")
}

In [11]:
// Flight pairings combined with matching Airbnb listings.
val airbnbOutput = flightJoinWithAirbnb(flightOutput, airbnbDF)
// Render the result (`z` is presumably the Zeppelin context — confirm).
z.show(airbnbOutput)

In [12]:
// Flight pairings combined with matching hotel offers from `hotelDF`.
val agodaOutput = flightJoinWithHotel(flightOutput, hotelDF)
// Render the result (`z` is presumably the Zeppelin context — confirm).
z.show(agodaOutput)