In [0]:
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._

// Storage paths of the cleaned input datasets consumed by this notebook.
val ticketDataFilePath = "/user/avk3358_nyu_edu/project/data/ticket-df.parquet"
val matchDataFilePath = "/user/avk3358_nyu_edu/project/data/cleaned-event-df.parquet"
val airbnbDataFilePath = "/user/mjd9571_nyu_edu/project/airbnb_listings_cleaned.csv"
val flightDataFilePath = "/user/zc2398_nyu_edu/flight-data-cleaned/"
val hotelDataFilePath = "/user/yl12081_nyu_edu/hotel_data_clean_refined.parquet"

// Load each dataset once through the notebook-provided `spark` session.
val matchDF = spark.read.parquet(matchDataFilePath)
val ticketDF = spark.read.parquet(ticketDataFilePath)
// The CSV is read with a header row and no schema-inference option, so its columns are
// presumably all strings (later cells cast `price` and parse the date columns themselves).
// `.na.drop()` discards every row that contains a null in any column.
val airbnbDF = spark.read.option("header","true").csv(airbnbDataFilePath).na.drop()
val flightDF = spark.read.parquet(flightDataFilePath)
val hotelDF = spark.read.parquet(hotelDataFilePath)

In [1]:
/**
 * Keeps only the matches whose `teams` array column contains both requested teams.
 *
 * @param teams     team names; only the first two entries are consulted
 * @param matchesDF match data exposing an array-typed `teams` column
 * @return the rows of `matchesDF` in which both teams appear
 */
def getMatchesByTeams(teams: Array[String], matchesDF: org.apache.spark.sql.DataFrame): org.apache.spark.sql.DataFrame = {
    val hasFirstTeam = array_contains(col("teams"), teams(0))
    val hasSecondTeam = array_contains(col("teams"), teams(1))
    matchesDF.where(hasFirstTeam && hasSecondTeam)
}


/**
 * Finds the matches played between two teams and attaches the cheapest ticket price of each.
 *
 * @param teams    team names; only the first two entries are consulted
 * @param matchDF  match data with `match_id`, `date`, `time`, `city` and an array-typed `teams` column
 * @param ticketDF ticket data with `match_id` and `ticket_price`
 * @return one row per (date, teams, match_id, time, city) with `lowest_ticket_price`;
 *         matches without any ticket row are absent because the join is an inner join
 */
def getMatchDataByTeams(teams: Array[String], matchDF: org.apache.spark.sql.DataFrame, ticketDF: org.apache.spark.sql.DataFrame): org.apache.spark.sql.DataFrame = {
    // Reuse the shared team filter instead of repeating its predicate here.
    val matchData = getMatchesByTeams(teams, matchDF)

    val ticketsForMatches = ticketDF
      .join(matchData, matchData("match_id") === ticketDF("match_id"), "inner")
      .select(ticketDF("match_id"), matchData("date"), matchData("time"), matchData("city"), matchData("teams"), ticketDF("ticket_price"))

    ticketsForMatches
      .groupBy($"date", $"teams", $"match_id", $"time", $"city")
      .agg(min("ticket_price").alias("lowest_ticket_price"))
}


/**
 * Replaces each flight's `price` with the mean price (rounded to two decimals) of its
 * (`arrival_datetime`, `airline`, `origin`) group, then collapses rows that agree on every
 * column except `id`, keeping the largest `id` of each group as `latest_id`.
 *
 * @param inDF     flight rows with at least `arrival_datetime`, `airline`, `origin`, `price` and `id`
 * @param flightDF not used by the body; retained so existing call sites keep working
 * @return the de-duplicated rows with the averaged `price` and a `latest_id` column
 */
def getAverageFlightPriceDF(inDF:org.apache.spark.sql.DataFrame, flightDF:org.apache.spark.sql.DataFrame) = {
    val priceKeys = Seq("arrival_datetime", "airline", "origin")

    val meanPriceByKey = inDF
      .groupBy(priceKeys.head, priceKeys.tail: _*)
      .agg(round(avg("price"), 2).as("avg_price"))

    // Swap the per-flight price for the group's mean price.
    val withMeanPrice = inDF
      .join(meanPriceByKey, priceKeys, "left")
      .drop("price")
      .withColumnRenamed("avg_price", "price")

    // Group on everything but `id` so rows differing only by id become a single row.
    val groupingColumns = withMeanPrice.columns.filterNot(_ == "id").map(col)
    withMeanPrice.groupBy(groupingColumns: _*).agg(max("id").as("latest_id"))
}


/**
 * Builds round-trip flight options around each match in `matchData`, using the
 * notebook-level `flightDF`.
 *
 * Inbound flights land in the match city between 3 days and 5 hours before the match time.
 * Return flights leave the match city more than 5 hours after the match time and land less
 * than 3 days after it. Both legs have their price averaged by `getAverageFlightPriceDF`
 * and are paired when the return flight goes back to the inbound flight's origin city.
 *
 * @param matchData matches with `match_id`, `time`, `city`, `teams` and `lowest_ticket_price`;
 *                  `city` is compared against the lower-cased flight city
 * @return one row per inbound/return pairing with `flight_price` (sum of both averaged legs),
 *         `arrival_time`/`departure_time` and their date-only counterparts
 */
def getFlightDataFromMatch(matchData: org.apache.spark.sql.DataFrame): org.apache.spark.sql.DataFrame = {
    val matchTime = matchData("time")
    val fiveHours = expr("interval 5 hours")
    val threeDays = expr("interval 3 days")

    val inboundFlightRaw = flightDF
      .join(
        matchData,
        lower(flightDF("destination_city")) === matchData("city") &&
          flightDF("arrival_datetime") < matchTime.minus(fiveHours) &&
          flightDF("arrival_datetime") > matchTime.minus(threeDays))
      .select("arrival_datetime", "airline", "id", "price", "city", "match_id", "teams", "origin_city", "lowest_ticket_price")
      .withColumnRenamed("origin_city", "origin")
    val inboundFlight = getAverageFlightPriceDF(inboundFlightRaw, flightDF)
      .withColumnRenamed("latest_id", "arrival_flight_id")
      .withColumnRenamed("price", "in_price")

    // The lower bound is checked on the DEPARTURE time: a return flight must leave after the
    // match. Testing the arrival time here would admit flights that take off during the match
    // as long as they land late enough.
    val outboundFlightRaw = flightDF
      .join(
        matchData,
        lower(flightDF("origin_city")) === matchData("city") &&
          flightDF("departure_datetime") > matchTime.plus(fiveHours) &&
          flightDF("arrival_datetime") < matchTime.plus(threeDays))
      .select("departure_datetime", "arrival_datetime", "airline", "id", "price", "match_id", "destination_city", "origin_city")
      .withColumnRenamed("destination_city", "return_city")
      .withColumnRenamed("origin_city", "origin")
    val outboundFlight = getAverageFlightPriceDF(outboundFlightRaw, flightDF)
      .withColumnRenamed("latest_id", "departure_flight_id")
      .drop("arrival_datetime", "origin")
      .withColumnRenamed("airline", "airline_out")
      .withColumnRenamed("price", "out_price")

    // NOTE(review): the join type stays "cross" as in the original even though a condition is
    // supplied; confirm whether "inner" was intended before changing it.
    inboundFlight
      .join(
        outboundFlight,
        inboundFlight("match_id") === outboundFlight("match_id") &&
          inboundFlight("origin") === outboundFlight("return_city"),
        "cross")
      .withColumn("flight_price", col("in_price") + col("out_price"))
      .drop("in_price", "out_price", "return_city")
      .withColumnRenamed("arrival_datetime", "arrival_time")
      .withColumnRenamed("departure_datetime", "departure_time")
      .withColumn("arrival_date", col("arrival_time").cast("date"))
      .withColumn("departure_date", col("departure_time").cast("date"))
      // Remove the return leg's duplicate match_id by column reference; no alias named
      // "outboundFlight" exists, so a drop by that qualified name would do nothing.
      .drop(outboundFlight("match_id"))
}


import org.apache.spark.sql.types.DoubleType
/**
 * Pairs each flight itinerary with Airbnb listings in the same city whose check-in date equals
 * the arrival date and whose check-out date is the departure date or the day before it.
 *
 * @param flightOutput itineraries with `city`, `arrival_date` and `departure_date`
 * @param airbnbDF     listings with `city`, `checkin_date`, `checkout_date`, `price` and `unique_id`
 * @return the joined rows with the listing's `city`, `id`, `listing`, `desc`, `country` and
 *         `info_date` columns removed, `price` exposed as a double `hotel_price`, `unique_id`
 *         renamed to `hotel_id`, and the stay dates parsed into `check_in_date` / `check_out_date`
 */
def flightJoinWithAirbnb(flightOutput: DataFrame, airbnbDF: DataFrame): DataFrame = {
  val sameCity = flightOutput("city") === lower(airbnbDF("city"))
  val checkInOnArrival = flightOutput("arrival_date") === airbnbDF("checkin_date")
  val checkOutAtOrDayBeforeDeparture =
    flightOutput("departure_date") === airbnbDF("checkout_date") ||
      flightOutput("departure_date") === date_add(airbnbDF("checkout_date"), 1)

  // The listing side is aliased so its columns can be dropped by qualified name. No drop of
  // "flightOutput.match_id" is attempted: nothing carries that alias, so it would be a no-op.
  val joined = flightOutput
    .join(airbnbDF.alias("a"), sameCity && checkInOnArrival && checkOutAtOrDayBeforeDeparture)
    .drop(col("a.city"))
    .drop(col("a.id"))
    .drop(col("a.listing"))
    .drop(col("a.desc"))
    .drop(col("a.country"))
    .drop(col("a.info_date"))

  joined
    .withColumn("hotel_price", col("price").cast("double"))
    .withColumnRenamed("unique_id", "hotel_id")
    .withColumnRenamed("checkin_date", "check_in_date")
    .withColumnRenamed("checkout_date", "check_out_date")
    .drop("price")
    // Trim and parse in place; overwriting an existing column keeps its position.
    .withColumn("check_in_date", to_date(trim(col("check_in_date")), "yyyy-MM-dd"))
    .withColumn("check_out_date", to_date(trim(col("check_out_date")), "yyyy-MM-dd"))
}


/**
 * Pairs each flight itinerary with hotel offers in the same city (case-insensitive) whose
 * check-in falls on the arrival date or the following day and whose check-out equals the
 * departure date.
 *
 * @param flight itineraries with `city`, `arrival_date` and `departure_date`
 * @param hotel  hotel offers with `city`, `check_in_date` and `check_out_date`
 * @return the inner-joined rows without the hotel's `city` column and without the
 *         descriptive hotel columns listed in the body
 */
def flightJoinWithHotel(flight: DataFrame, hotel: DataFrame): DataFrame = {
  val flights = flight.alias("df1")
  val hotels = hotel.alias("df2")

  val cityMatches = lower(col("df1.city")) === lower(col("df2.city"))
  val checkInWithinADayOfArrival =
    col("df2.check_in_date") >= col("df1.arrival_date") &&
      col("df2.check_in_date") <= date_add(col("df1.arrival_date"), 1)
  val checkOutOnDeparture = col("df2.check_out_date") === col("df1.departure_date")

  val unwantedColumns = Seq(
    "country", "hotel_address", "avg_review", "square_feet",
    "hotel_info_date", "hotel_stay_duration", "hotel_name")

  flights
    .join(hotels, cityMatches && checkInWithinADayOfArrival && checkOutOnDeparture, "inner")
    .drop(col("df2.city"))
    .drop(unwantedColumns: _*)
}

In [2]:
// Run the pipeline for one fixture: find the match and its cheapest ticket, build round-trip
// flight options around it, then attach accommodation from each source separately.
val teams = Array("Real Madrid", "Bayern Munich")
val matchCleaned = getMatchDataByTeams(teams, matchDF, ticketDF)
val flightOutput = getFlightDataFromMatch(matchCleaned)
// Airbnb-based and hotel-based results are kept apart; later cells combine them as needed.
val airbnbOutput = flightJoinWithAirbnb(flightOutput, airbnbDF)
val agodaOutput = flightJoinWithHotel(flightOutput, hotelDF)

In [3]:
/**
 * Averages accommodation prices per city over hotel and Airbnb rows combined, restricted to
 * cities that occur in both datasets.
 *
 * @param airbnbDF listings with `city` and `price` (cast to double here)
 * @param hotelDF  hotel offers with `city` and `hotel_price`
 * @return `city` and `avg_price`, sorted by ascending `avg_price`
 */
def avgHotelPricePerCity(airbnbDF: DataFrame, hotelDF: DataFrame): DataFrame = {
    val hotelPrices = hotelDF.select(col("city"), col("hotel_price"))
    val airbnbPrices = airbnbDF.select(col("city"), col("price").cast("double").alias("hotel_price"))

    val sharedCities = hotelPrices.select("city").distinct()
      .intersect(airbnbPrices.select("city").distinct())

    // Keep only the rows whose city is present in both sources.
    def inSharedCities(prices: DataFrame): DataFrame = prices.join(sharedCities, "city")

    inSharedCities(hotelPrices)
      .union(inSharedCities(airbnbPrices))
      .groupBy("city")
      .agg(avg("hotel_price").alias("avg_price"))
      .orderBy("avg_price")
}

In [4]:
/**
 * Computes the mean accommodation price per city across hotel and Airbnb rows, considering
 * only cities found in both inputs. This cell declares the same function name as the
 * preceding cell.
 *
 * @param airbnbDF listings with `city` and `price` (cast to double here)
 * @param hotelDF  hotel offers with `city` and `hotel_price`
 * @return `city` and `avg_price`, ordered by ascending `avg_price`
 */
def avgHotelPricePerCity(airbnbDF: DataFrame, hotelDF: DataFrame): DataFrame = {
    val fromHotels = hotelDF.select(col("city"), col("hotel_price"))
    val fromAirbnb = airbnbDF.select(col("city"), col("price").cast("double").alias("hotel_price"))

    val citiesInBoth = fromHotels.select("city").distinct()
      .intersect(fromAirbnb.select("city").distinct())

    // Restrict each source to the shared cities, then stack hotel rows above Airbnb rows.
    val combined = Seq(fromHotels, fromAirbnb)
      .map(_.join(citiesInBoth, "city"))
      .reduce(_ union _)

    combined
      .groupBy("city")
      .agg(avg("hotel_price").alias("avg_price"))
      .orderBy("avg_price")
}

In [5]:
// Average accommodation price per city (cities present in both sources), rendered through
// the notebook's `z.show` display helper.
val df = avgHotelPricePerCity(airbnbDF, hotelDF)
z.show(df)

In [6]:
/**
 * Averages the total trip cost (hotel + flight + cheapest match ticket) by the number of days
 * between arrival and check-out, over the Airbnb-based and hotel-based itineraries combined.
 *
 * @param airbnbOutput itineraries joined with Airbnb listings
 * @param agodaOutput  itineraries joined with hotel offers
 * @return `when_to_leave_affter_arrival` (day difference) and `avg_total_price`, sorted by
 *         ascending `avg_total_price`
 */
def bestTimetoLeave(airbnbOutput: DataFrame, agodaOutput: DataFrame): DataFrame = {
    // Output column name is kept exactly as spelled in the existing schema.
    val stayLengthColumn = "when_to_leave_affter_arrival"

    // Reduce an itinerary set to (total cost, days between arrival and check-out).
    def costAndStayLength(trips: DataFrame): DataFrame =
      trips
        .withColumn("total_price", col("hotel_price") + col("flight_price") + col("lowest_ticket_price"))
        .select(
          col("total_price"),
          datediff(col("check_out_date"), col("arrival_date")).alias(stayLengthColumn))

    costAndStayLength(agodaOutput)
      .union(costAndStayLength(airbnbOutput))
      .groupBy(stayLengthColumn)
      .agg(avg("total_price").alias("avg_total_price"))
      .orderBy("avg_total_price")
}

In [7]:
// Average total trip cost per stay length for the selected fixture, cheapest first.
val df =  bestTimetoLeave(airbnbOutput, agodaOutput)
z.show(df)

In [8]:
/**
 * Averages accommodation prices by check-in weekday over hotel and Airbnb rows combined,
 * considering only Monday through Thursday check-ins.
 *
 * @param airbnbDF listings with a textual `checkin_date` (trimmed, parsed as yyyy-MM-dd) and `price`
 * @param hotelDF  hotel offers with `check_in_date` and `hotel_price`
 * @return `day_of_week` and `avg_hotel_price`, sorted by ascending `avg_hotel_price`
 */
def cheapestWeekDayToCheckIn(airbnbDF: DataFrame, hotelDF: DataFrame): DataFrame = {
    val hotelCheckInDay = date_format(col("check_in_date"), "EEEE")
    val airbnbCheckInDay = date_format(to_date(trim(col("checkin_date")), "yyyy-MM-dd"), "EEEE")

    val hotelRows = hotelDF.select(hotelCheckInDay.alias("day_of_week"), col("hotel_price"))
    val airbnbRows = airbnbDF.select(
      airbnbCheckInDay.alias("day_of_week"),
      col("price").cast("double").alias("hotel_price"))

    hotelRows
      .union(airbnbRows)
      .where(col("day_of_week").isin("Monday", "Tuesday", "Wednesday", "Thursday"))
      .groupBy("day_of_week")
      .agg(avg("hotel_price").alias("avg_hotel_price"))
      .orderBy("avg_hotel_price")
}



In [9]:
// Average price by check-in weekday (Monday through Thursday only), cheapest first.
val df = cheapestWeekDayToCheckIn(airbnbDF,hotelDF)
z.show(df)

In [10]:
/**
 * Averages accommodation prices by check-out weekday over hotel and Airbnb rows combined,
 * considering only Friday, Saturday and Sunday check-outs.
 *
 * @param airbnbDF listings with a textual `checkout_date` (trimmed, parsed as yyyy-MM-dd) and `price`
 * @param hotelDF  hotel offers with `check_out_date` and `hotel_price`
 * @return `day_of_week` and `avg_hotel_price`, sorted by ascending `avg_hotel_price`
 */
def cheapestWeekDayToCheckOut(airbnbDF: DataFrame, hotelDF: DataFrame): DataFrame = {
    val weekendDays = Seq("Friday", "Saturday", "Sunday")

    val hotelRows = hotelDF.select(
      date_format(col("check_out_date"), "EEEE").alias("day_of_week"),
      col("hotel_price"))

    val airbnbCheckOut = to_date(trim(col("checkout_date")), "yyyy-MM-dd")
    val airbnbRows = airbnbDF.select(
      date_format(airbnbCheckOut, "EEEE").alias("day_of_week"),
      col("price").cast("double").alias("hotel_price"))

    val weekendCheckOuts = hotelRows.union(airbnbRows).where(col("day_of_week").isin(weekendDays: _*))

    weekendCheckOuts
      .groupBy("day_of_week")
      .agg(avg("hotel_price").alias("avg_hotel_price"))
      .orderBy("avg_hotel_price")
}


In [11]:
// Average price by check-out weekday (Friday through Sunday only), cheapest first.
val df = cheapestWeekDayToCheckOut(airbnbDF,hotelDF)
z.show(df)

In [12]:
/**
 * Averages accommodation prices per check-out date over hotel and Airbnb rows combined,
 * ignoring rows whose check-out date is null.
 *
 * @param airbnbDF listings with `city`, a textual `checkout_date` (trimmed, parsed as yyyy-MM-dd) and `price`
 * @param hotelDF  hotel offers with `city`, `check_out_date` and `hotel_price`
 * @return `check_out_date` and `avg_hotel_price`, sorted by ascending `avg_hotel_price`
 */
def hotelPriceTrend(airbnbDF: DataFrame, hotelDF: DataFrame): DataFrame = {
    // `city` is carried through the union but plays no part in the aggregation below.
    val hotelRows = hotelDF.select(col("city"), col("check_out_date"), col("hotel_price"))
    val airbnbRows = airbnbDF.select(
      col("city"),
      to_date(trim(col("checkout_date")), "yyyy-MM-dd").alias("check_out_date"),
      col("price").cast("double").alias("hotel_price"))

    val datedRows = hotelRows.union(airbnbRows).where(col("check_out_date").isNotNull)

    datedRows
      .groupBy("check_out_date")
      .agg(avg("hotel_price").alias("avg_hotel_price"))
      .orderBy("avg_hotel_price")
}


In [13]:
// Average price per check-out date; rows come back ordered by price, not by date.
val df = hotelPriceTrend(airbnbDF,hotelDF)
z.show(df)

In [14]:
// Inspect the loaded Airbnb listings as-is (the sorted variant below is left disabled).
//val test = airbnbDF.orderBy("city")
z.show(airbnbDF)

In [15]:
// Inspect the Airbnb listings sorted by city. Rows containing nulls are dropped again here,
// although `airbnbDF` already had `.na.drop()` applied when it was loaded.
z.show(airbnbDF.na.drop().orderBy("city"))

In [16]:
/**
 * Computes the mean `hotel_price` for every city.
 *
 * @param hotelDF hotel offers with `city` and `hotel_price`
 * @return `city` and `avg_price`, sorted by `city`
 */
def calculateAverageHotelPrice(hotelDF: DataFrame): DataFrame = {
  val meanHotelPrice = avg("hotel_price").alias("avg_price")
  val pricePerCity = hotelDF.groupBy("city").agg(meanHotelPrice)
  pricePerCity.orderBy("city")
}
/**
 * Computes the mean Airbnb `price` for every city after discarding rows that contain nulls.
 *
 * @param airbnbDF listings with `city` and `price`
 * @return `city` and `avg_price`, sorted by `city`
 */
def calculateAverageAirbnbPrice(airbnbDF: DataFrame): DataFrame = {
  // Cast explicitly, as the other price aggregations in this notebook do, rather than
  // relying on avg's implicit coercion of the CSV-loaded column.
  airbnbDF.na.drop().groupBy("city")
    .agg(avg(col("price").cast("double")).alias("avg_price"))
    .orderBy("city")
}
// Per-city average prices, computed separately for each source and shown one after the other.
val hotelAvgPrice = calculateAverageHotelPrice(hotelDF)
val airbnbAvgPrice = calculateAverageAirbnbPrice(airbnbDF)

println("Average Hotel Prices by City:")
z.show(hotelAvgPrice)
println("Average Airbnb Prices by City:")
z.show(airbnbAvgPrice)

In [17]:
// Side-by-side comparison of average hotel and Airbnb prices for cities present in both sources.
// The city columns are renamed per source so the join condition and projection are unambiguous.
val hotelAvgPrice = hotelDF.groupBy("city")
  .agg(avg("hotel_price").alias("avg_hotel_price"))
  .withColumnRenamed("city", "hotel_city")
// `price` is cast explicitly, matching the other price aggregations in this notebook, instead
// of relying on avg's implicit coercion of the CSV-loaded column.
val airbnbAvgPrice = airbnbDF.na.drop().groupBy("city")
  .agg(avg(col("price").cast("double")).alias("avg_airbnb_price"))
  .withColumnRenamed("city", "airbnb_city")
// Default (inner) join: cities found in only one source are left out of the comparison.
val comparisonDF = hotelAvgPrice.join(airbnbAvgPrice, hotelAvgPrice("hotel_city") === airbnbAvgPrice("airbnb_city"))
  .select(col("hotel_city").alias("city"), col("avg_hotel_price"), col("avg_airbnb_price"))

println("Average Prices Comparison by City:")
z.show(comparisonDF)