In [0]:
// Location of the cleaned match/event data (Parquet).
val matchDataFilePath = "/user/avk3358_nyu_edu/project/data/cleaned-event-df.parquet"
// Bound with `val`: the DataFrame is only read, never reassigned.
val matchDF = spark.read.parquet(matchDataFilePath)

In [1]:
// Display the loaded match data in the notebook output.
z.show(matchDF)

In [2]:
// Print the column names and types of the match data. The explicit `()` marks the
// call as side-effecting and avoids relying on auto-application.
matchDF.printSchema()

In [3]:
// Location of the ticket listing data (Parquet).
val ticketDataFilePath = "/user/avk3358_nyu_edu/project/data/ticket-df.parquet"
// Bound with `val`: the DataFrame is only read, never reassigned.
val ticketDF = spark.read.parquet(ticketDataFilePath)

In [4]:
// Display the loaded ticket data in the notebook output.
z.show(ticketDF)

In [5]:
// Print the column names and types of the ticket data. The explicit `()` marks the
// call as side-effecting and avoids relying on auto-application.
ticketDF.printSchema()

In [6]:
/**
 * Keeps only the matches whose `teams` array column contains every requested team.
 *
 * @param teams     team names that must all appear in a match's `teams` column; when no
 *                  name is given the input is returned unfiltered
 * @param matchesDF matches to filter; must expose an array column named `teams`
 * @return the rows of `matchesDF` whose `teams` array contains all of `teams`
 */
def getMatchesByTeams(teams: Array[String], matchesDF: org.apache.spark.sql.DataFrame): org.apache.spark.sql.DataFrame = {
    // One membership predicate per requested team, AND-ed together. Deriving the
    // condition from the whole array (instead of indexing teams(0) and teams(1))
    // avoids an out-of-bounds failure for short arrays and honours every name given.
    teams
      .map(team => array_contains($"teams", team))
      .reduceOption(_ && _)
      .fold(matchesDF)(allTeamsPresent => matchesDF.filter(allTeamsPresent))
}

In [7]:
// Example pair of teams used by the exploratory cells of this notebook.
val teams = Array("Real Madrid", "Bayern Munich")

// Matches from matchDF selected by getMatchesByTeams for that pair.
val filteredMatchesDF = getMatchesByTeams(teams, matchDF)

// Display the selected matches.
z.show(filteredMatchesDF)

In [8]:
// Attach every ticket listing to its team-filtered match and keep the match date,
// the teams, the match id and the ticket price.
val joinedDF = {
  val sameMatch = ticketDF("match_id") === filteredMatchesDF("match_id")

  ticketDF
    .join(filteredMatchesDF, sameMatch, "inner")
    .select(
      filteredMatchesDF("date"),
      filteredMatchesDF("teams"),
      ticketDF("match_id"),
      ticketDF("ticket_price")
    )
}

// Number of ticket rows that found a matching game.
joinedDF.count()

In [9]:
// Display the joined ticket/match rows.
z.show(joinedDF)

In [10]:
// Display the distinct match ids present in the joined ticket/match rows.
z.show(joinedDF.select($"match_id").distinct())

In [11]:
// Display the minimum ticket_price for each (date, teams, match_id) group.
z.show(joinedDF.groupBy($"date", $"teams", $"match_id").agg(min("ticket_price").alias("lowest_ticket_price")))

In [12]:
/**
 * Selects the matches involving the given teams and attaches the cheapest ticket
 * price found for each of them.
 *
 * @param teams    team names handed to `getMatchesByTeams` to select the matches
 * @param matchDF  matches with `match_id`, `date`, `time`, `city` and an array column `teams`
 * @param ticketDF ticket listings with `match_id` and `ticket_price`
 * @return one row per (`date`, `teams`, `match_id`, `time`, `city`) with the minimum
 *         `ticket_price` as `lowest_ticket_price`; matches without any ticket listing
 *         are absent because the join is an inner join
 */
def getMatchDataByTeams(teams: Array[String], matchDF: org.apache.spark.sql.DataFrame, ticketDF: org.apache.spark.sql.DataFrame): org.apache.spark.sql.DataFrame = {
    // Reuse the shared team filter so both entry points select matches the same way.
    val matchData = getMatchesByTeams(teams, matchDF)

    val allDF = ticketDF
      .join(matchData, matchData("match_id") === ticketDF("match_id"), "inner")
      .select(ticketDF("match_id"), matchData("date"), matchData("time"), matchData("city"), matchData("teams"), ticketDF("ticket_price"))

    // Collapse the ticket rows to a single cheapest price per match.
    allDF
      .groupBy($"date", $"teams", $"match_id", $"time", $"city")
      .agg(min("ticket_price").alias("lowest_ticket_price"))
}

In [13]:
// Per-match rows (with lowest_ticket_price) for the example team pair.
val matchCleaned = getMatchDataByTeams(teams, matchDF, ticketDF)
// Display the result.
z.show(matchCleaned)

In [14]:
// Part 2: travel data. Loads the flight dataset together with the Airbnb and hotel
// datasets.
// NOTE(review): of the three DataFrames created here, only flightDF is referenced by
// the other cells visible in this review; the hotel join reads a different file.

// Despite the *Dir suffix, this path names a single .csv file.
val airbnbDataDir = "/user/mjd9571_nyu_edu/project/airbnb_listings_cleaned.csv"

val flightDataDir = "/user/zc2398_nyu_edu/flight-data-cleaned/"

// Despite the *Dir suffix, this path ends in .parquet.
val hotelDataDir = "/user/yl12081_nyu_edu/project/hotel_data_clean.parquet"


// The CSV is read with a header row; no schema or inferSchema option is set.
val airbnbDF = spark.read.option("header","true").csv(airbnbDataDir)
val flightDF = spark.read.parquet(flightDataDir)
val hotelDF = spark.read.parquet(hotelDataDir)

In [15]:
// Display the loaded flight data in the notebook output.
z.show(flightDF)

In [16]:
import org.apache.spark.sql.functions._

/**
 * Replaces each flight's `price` with the average price of its
 * (`arrival_datetime`, `airline`, `origin`) group, rounded to two decimals, and then
 * collapses rows that are identical apart from `id`, keeping the greatest `id`.
 *
 * @param inDF     flights to process; must contain `arrival_datetime`, `airline`,
 *                 `origin`, `price` and `id`
 * @param flightDF not used by the computation; kept so existing call sites still compile
 * @return the columns of `inDF` without `id`, with `price` holding the group average,
 *         plus `latest_id` (the maximum `id` of each collapsed group)
 */
def getAverageFlightPriceDF(inDF: org.apache.spark.sql.DataFrame, flightDF: org.apache.spark.sql.DataFrame): org.apache.spark.sql.DataFrame = {
    // Single definition of the grouping key shared by the aggregation and the join.
    val priceGroupKeys = Seq("arrival_datetime", "airline", "origin")

    val averagePrice = inDF
      .groupBy(priceGroupKeys.map(col): _*)
      .agg(round(avg("price"), 2).as("avg_price"))

    // Swap the per-flight price for the group average.
    val all = inDF
      .join(averagePrice, priceGroupKeys, "left")
      .drop("price")
      .withColumnRenamed("avg_price", "price")

    // Group on every remaining column except `id` so duplicates merge into one row.
    val allColumnsExceptID = all.columns.filter(_ != "id")
    all.groupBy(allColumnsExceptID.map(col): _*).agg(max("id").as("latest_id"))
}


/**
 * Builds round-trip flight options around each match in `matchData`.
 *
 * Inbound flights must land in the match city more than 5 hours and less than 3 days
 * before the match `time`; return flights must leave the match city more than 5 hours
 * and less than 3 days after it. Prices on both legs are averaged through
 * `getAverageFlightPriceDF`. An inbound option is paired with a return option of the
 * same match when the return flies back to the inbound origin city.
 *
 * Reads the notebook-level `flightDF`, which must already be defined.
 *
 * @param matchData matches to plan for; the columns `match_id`, `city`, `time`, `teams`
 *                  and `lowest_ticket_price` are read from the joined data
 * @return one row per inbound/return pairing with `arrival_time`, `departure_time`,
 *         the match columns, both flight ids, and `flight_price` (the sum of the two
 *         averaged leg prices); `match_id` appears exactly once
 */
def getFlightDataFromMatch(matchData: org.apache.spark.sql.DataFrame): org.apache.spark.sql.DataFrame = {
    // Lower-case both sides so the city comparison does not depend on how the match
    // data happens to be capitalised.
    val matchCity = lower(matchData("city"))
    val matchTime = matchData("time")
    val minGap = expr("interval 5 hours")
    val maxGap = expr("interval 3 days")

    val landsBeforeMatch =
      lower(flightDF("destination_city")) === matchCity &&
        flightDF("arrival_datetime") < matchTime.minus(minGap) &&
        flightDF("arrival_datetime") > matchTime.minus(maxGap)

    val inboundFlightRaw = flightDF
      .join(matchData, landsBeforeMatch)
      .select("arrival_datetime", "airline", "id", "price", "city", "match_id", "teams", "origin_city", "lowest_ticket_price")
      .withColumnRenamed("origin_city", "origin")

    val inboundFlight = getAverageFlightPriceDF(inboundFlightRaw, flightDF)
      .withColumnRenamed("latest_id", "arrival_flight_id")
      .withColumnRenamed("price", "in_price")

    // The return leg is constrained by when it departs: a flight that leaves during
    // or right after the match is unusable regardless of when it lands.
    val leavesAfterMatch =
      lower(flightDF("origin_city")) === matchCity &&
        flightDF("departure_datetime") > matchTime.plus(minGap) &&
        flightDF("departure_datetime") < matchTime.plus(maxGap)

    val outboundFlightRaw = flightDF
      .join(matchData, leavesAfterMatch)
      .select("departure_datetime", "arrival_datetime", "airline", "id", "price", "match_id", "destination_city", "origin_city")
      .withColumnRenamed("destination_city", "return_city")
      .withColumnRenamed("origin_city", "origin")

    // `match_id` is renamed on the return side so the join condition is unambiguous
    // and the result does not end up with two columns of the same name.
    val outboundFlight = getAverageFlightPriceDF(outboundFlightRaw, flightDF)
      .withColumnRenamed("latest_id", "departure_flight_id")
      .drop("arrival_datetime", "origin")
      .withColumnRenamed("airline", "airline_out")
      .withColumnRenamed("price", "out_price")
      .withColumnRenamed("match_id", "out_match_id")

    val sameTrip =
      inboundFlight("match_id") === outboundFlight("out_match_id") &&
        inboundFlight("origin") === outboundFlight("return_city")

    // A join with a condition is an inner join; say so instead of "cross".
    inboundFlight
      .join(outboundFlight, sameTrip, "inner")
      .withColumn("flight_price", col("in_price") + col("out_price"))
      .drop("in_price", "out_price", "return_city", "out_match_id")
      .withColumnRenamed("arrival_datetime", "arrival_time")
      .withColumnRenamed("departure_datetime", "departure_time")
}


In [17]:
// Round-trip flight options for the matches in matchCleaned.
val flightOutput = getFlightDataFromMatch(matchCleaned)
// Display the result.
z.show(flightOutput)

In [18]:
// Print the column names and types of the flight options. The explicit `()` marks the
// call as side-effecting and avoids relying on auto-application.
flightOutput.printSchema()

In [19]:
// Add date-typed copies of the two flight time columns.
val convertedDf = flightOutput
  .withColumn("arrival_date", col("arrival_time").cast("date"))
  .withColumn("departure_date", col("departure_time").cast("date"))

// Keep only the date versions; the original time columns are removed.
val flightOutputDatetype = convertedDf
  .drop("arrival_time", "departure_time")



In [20]:
// Display the flight options with date-typed arrival/departure columns.
z.show(flightOutputDatetype)

In [21]:
// Print the schema after the date conversion. The explicit `()` marks the call as
// side-effecting and avoids relying on auto-application.
flightOutputDatetype.printSchema()

In [22]:
// Hotel dataset (Parquet) that is joined against the flight options.
val hotelPath = "/user/yl12081_nyu_edu/hotel_data_clean_refined.parquet"
val hotelDataframe = spark.read.parquet(hotelPath)

In [23]:
// Display the loaded hotel data in the notebook output.
z.show(hotelDataframe)

In [24]:
// Print the column names and types of the hotel data. The explicit `()` marks the
// call as side-effecting and avoids relying on auto-application.
hotelDataframe.printSchema()

In [25]:
import org.apache.spark.sql.DataFrame
/**
 * Pairs each flight option with the hotel stays in the same city that fit its dates.
 *
 * A stay qualifies when its city equals the flight's city (ignoring case), its
 * check-in date is the arrival date or the day after, and its check-out date equals
 * the departure date.
 *
 * @param flight flight options with `city`, `arrival_date` and `departure_date`
 * @param hotel  hotel stays with `city`, `check_in_date` and `check_out_date`
 * @return the inner join of both inputs, with the flight's `arrival_date` and
 *         `departure_date` renamed to `arrival_time` and `departure_time`, and with
 *         the hotel's copy of `city` removed so `city` appears once
 */
def flightJoinWithHotel(flight: DataFrame, hotel: DataFrame): DataFrame = {
  val arrivalDate = col("df1.arrival_date")

  val sameCity = lower(col("df1.city")) === lower(col("df2.city"))
  // Inclusive range: check-in on the arrival date or the following day.
  val checkInFits = col("df2.check_in_date").between(arrivalDate, date_add(arrivalDate, 1))
  val checkOutFits = col("df2.check_out_date") === col("df1.departure_date")

  flight.alias("df1")
    .join(hotel.alias("df2"), sameCity && checkInFits && checkOutFits, "inner")
    // Both inputs carry `city`; keeping both would leave an ambiguous duplicate column.
    .drop(col("df2.city"))
    .withColumnRenamed("departure_date", "departure_time")
    .withColumnRenamed("arrival_date", "arrival_time")
}

In [26]:
// Flight options combined with the hotel stays that fit their dates and city.
val hotelOutput = flightJoinWithHotel(flightOutputDatetype, hotelDataframe)
// Display the result.
z.show(hotelOutput)

In [27]:
// Remove the listed hotel detail columns from the combined result.
val refinedHotelOutput = hotelOutput.drop("country","hotel_address","avg_review","square_feet","hotel_info_date","hotel_stay_duration")

In [28]:
// Display the trimmed flight + hotel result.
z.show(refinedHotelOutput)

In [29]:
// Number of flight + hotel combinations. `count()` runs a Spark job, so it is called
// with explicit parentheses.
refinedHotelOutput.count()