In [0]:
import org.apache.spark.sql.expressions.Window
// Location of the scraped match JSON data.
val filePath = "/user/avk3358_nyu_edu/project/data"

// Read the JSON data at filePath and tag every row with the file it was read from;
// that file name is turned into an access date in a later cell.
// Declared as `val` (was `var`): the DataFrame is never reassigned.
val matchDF = spark.read
  .json(filePath)
  .withColumn("input_file", input_file_name())
z.show(matchDF)

In [1]:
// Number of rows in the raw load.
matchDF.count()

In [2]:
// Schema inferred from the JSON input (plus the added "input_file" column).
matchDF.printSchema()

In [3]:
// Peek at the raw "date" and "time" values before they are parsed.
matchDF.select("date", "time").show(5)

In [4]:
// Parse the raw "date" and "time" columns in place:
// "date" via to_date with pattern "dd MMM yyyy", "time" via to_timestamp with "dd MMM yyyy HH:mm".
val dateConvertedDF = matchDF
  .withColumn("date", to_date($"date", "dd MMM yyyy"))
  .withColumn("time", to_timestamp($"time", "dd MMM yyyy HH:mm"))

In [5]:
// Same two columns after parsing, to eyeball the conversion.
dateConvertedDF.select("date", "time").show(5)

In [6]:
// Schema after the date/time conversion.
dateConvertedDF.printSchema()

In [7]:
// Free-text columns that are normalised to trimmed lower case.
val normalizedTextColumns = Seq("city", "country", "sport", "stadium", "tournament")

// Normalise the text columns, then reduce "input_file" from the full path reported by
// input_file_name() to just the file name. The last path segment is taken rather than a
// fixed position (previously index 7), so the result no longer depends on how deep the
// data directory sits in the file system.
val trimmedDF = normalizedTextColumns
  .foldLeft(dateConvertedDF)((df, name) => df.withColumn(name, lower(trim(col(name)))))
  .withColumn("input_file", element_at(split(col("input_file"), "/"), -1))

// The part of the file name before its first "." is parsed as the access date;
// the helper column "input_file" is dropped afterwards.
val rightDF = trimmedDF
  .withColumn("access_date", to_date(split(col("input_file"), "\\.").getItem(0)))
  .drop("input_file")
z.show(rightDF)

In [8]:
// Schema after text normalisation and the access_date extraction.
rightDF.printSchema

In [9]:
// "teams" arrays holding more than two entries, shown for inspection.
val badTeamsDF = rightDF
  .select($"teams")
  .where(size($"teams") > 2)
z.show(badTeamsDF)

In [10]:
// How many rows have more than two entries in "teams".
badTeamsDF.count

In [11]:
import scala.collection.mutable.HashSet
import scala.collection.mutable.ListBuffer
 
/**
 * Repairs a `teams` array in which a team name was broken into two adjacent elements.
 *
 * An element whose lookup key is one of a fixed set of known prefixes (e.g. "Li", "Wol")
 * is glued to the element that follows it with a "v" in between ("Li" + "v" + "erpool").
 * The lookup key is the element itself or, if it contains a "-", the trimmed text of its
 * second "-"-separated segment. Presumably the fragments come from an upstream split on
 * the letter "v" — confirm against the scraper.
 *
 * Arrays of exactly two elements are returned as-is (same instance, original casing);
 * every element of any other array is lower-cased. `null` is passed through unchanged.
 *
 * Compared with the earlier version this no longer throws when a known prefix is the last
 * element (nothing to merge with) or when a name containing "-" has no second segment
 * (e.g. "abc-"); in both cases the element is kept on its own.
 *
 * @param inputArray team names as scraped; may be null
 * @return the repaired array, or `inputArray` itself when it is null or has two elements
 */
def modifyArray(inputArray: Array[String]): Array[String] = {
  // Leading fragments of names known to arrive split in two.
  val brokenPrefixes =
    Set("Slo", "Wol", "Co", "Li", "Ala", "E", "Bayer Le", "Le Ha", "Ju", "GD Cha")

  // Key used for the prefix lookup; falls back to the whole name when there is no
  // second "-"-separated segment instead of indexing past the end of the split result.
  def lookupKey(name: String): String =
    if (name.contains("-")) {
      val segments = name.split("-")
      if (segments.length > 1) segments(1).trim else name
    } else name

  if (inputArray == null || inputArray.length == 2) inputArray
  else {
    val repaired = Array.newBuilder[String]
    var i = 0
    while (i < inputArray.length) {
      val current = inputArray(i)
      // Only merge when there actually is a following element to merge with.
      val hasNext = i + 1 < inputArray.length
      if (hasNext && brokenPrefixes.contains(lookupKey(current))) {
        repaired += (current + "v" + inputArray(i + 1)).toLowerCase()
        i += 2
      } else {
        repaired += current.toLowerCase()
        i += 1
      }
    }
    repaired.result()
  }
}
// Column-level wrapper so modifyArray can be applied to the "teams" column.
val modifyTeamsArray = udf((teamNames: Array[String]) => modifyArray(teamNames))

In [12]:
// Repair the "teams" arrays, then keep only rows that end up with exactly two teams.
val updatedDF = rightDF
  .withColumn("teams", modifyTeamsArray($"teams"))
  .where(size($"teams") === 2)
z.show(updatedDF.select("teams"))

In [13]:
// Sanity check: updatedDF was filtered to two-team rows, so this is expected to be 0.
updatedDF.where(size($"teams") > 2).count()

In [14]:
// Schema after the teams repair.
updatedDF.printSchema

In [15]:
// Display the cleaned rows.
z.show(updatedDF)

In [16]:
// Number of distinct (date, teams) combinations, i.e. the key used for de-duplication below.
updatedDF.dropDuplicates("date", "teams").count()

In [17]:

// One row is kept per (date, teams) pair. Rows are ranked by access_date, newest first,
// so the most recently accessed record survives. The window previously ordered by "date",
// which is a partition key and therefore constant inside each partition, leaving the
// choice of surviving row arbitrary.
val w = Window.partitionBy($"date", $"teams").orderBy($"access_date".desc)

// Keep the top-ranked row, drop the helper and per-scrape columns, and attach a generated id.
// NOTE(review): uuid() is non-deterministic and this DataFrame is not persisted, yet it is
// evaluated again for the join with the ticket rows and for the parquet write — confirm the
// match_id values agree between both outputs (persisting, or deriving the id from
// date + teams, would remove the doubt).
val uniqueMatchesDF = updatedDF
  .withColumn("rn", row_number().over(w))
  .where($"rn" === 1)
  .drop("rn", "access_date", "tickets")
  .withColumn("match_id", expr("uuid()"))

z.show(uniqueMatchesDF)

In [18]:
// Number of de-duplicated matches.
uniqueMatchesDF.count()

In [19]:
// Schema of the match table (match_id added; access_date and tickets dropped).
uniqueMatchesDF.printSchema

In [20]:
/**
 * Returns the rows of `matchesDF` whose "teams" array contains both of the first two
 * entries of `teams`. Only teams are matched (there is no date filter), the comparison
 * is exact, and any entries of `teams` beyond the second are ignored.
 *
 * @param teams     team names to look for; the first two elements are used
 * @param matchesDF DataFrame with an array column named "teams"
 * @return the matching subset of `matchesDF`
 */
def getMatchesByTeams(teams: Array[String], matchesDF: org.apache.spark.sql.DataFrame): org.apache.spark.sql.DataFrame = {
  val hasFirstTeam = array_contains($"teams", teams(0))
  val hasSecondTeam = array_contains($"teams", teams(1))
  matchesDF.where(hasFirstTeam && hasSecondTeam)
}

In [21]:
// Example lookup: every match between these two sides.
val teams = Array("Borussia Dortmund", "Paris Saint-Germain")

val filteredMatchesDF = getMatchesByTeams(teams = teams, matchesDF = uniqueMatchesDF)

// Display the matches that were found
z.show(filteredMatchesDF)

In [22]:
// Column names of the match table (not referenced again in the cells shown here).
val columnsFromDF1 = uniqueMatchesDF.columns

// Attach the generated match_id to every cleaned row by joining on the (date, teams) key.
val combinedTicketsDF = updatedDF.join(
  uniqueMatchesDF.select("date", "teams", "match_id"),
  Seq("date", "teams"),
  "inner"
)

combinedTicketsDF.count()

In [23]:
// Display the rows with match_id attached.
z.show(combinedTicketsDF)

In [24]:
// Schema after the join.
combinedTicketsDF.printSchema()

In [25]:
import org.apache.spark.sql.functions._

// One output row per element of the "tickets" array.
val explodedDF = combinedTicketsDF.withColumn("ticket", explode($"tickets"))

// Keep the match reference and access date, and lift the fields of each ticket entry
// into top-level columns.
val ticketDF = explodedDF.select(
  $"match_id",
  $"access_date",
  $"ticket.category".as("ticket_category"),
  $"ticket.info".as("ticket_info"),
  $"ticket.price".as("ticket_price")
)

z.show(ticketDF)

In [26]:
// Number of individual ticket rows after the explode.
ticketDF.count

In [27]:
import org.apache.spark.sql.types.DoubleType

// Remove "£", commas and spaces from the price text, then cast what is left to a double.
val updatedTicketDF = ticketDF.withColumn(
  "ticket_price",
  regexp_replace($"ticket_price", "[£, ]", "").cast("double")
)

z.show(updatedTicketDF)

In [28]:
// Count rows whose ticket_price is null after the cast to double.
updatedTicketDF.filter($"ticket_price".isNull).count()

In [29]:
// Schema after the price conversion.
updatedTicketDF.printSchema

In [30]:
// Normalise the free-text ticket columns: remove the characters * # - . : = from
// ticket_info, then trim and lower-case both ticket_info and ticket_category.
// The hyphen is placed last in the character class so that it is a literal "-".
// Written as "#-." it formed a range from '#' to '.', which also stripped
// $ % & ' ( ) + and , from the text.
val filteredTicketDF = updatedTicketDF
  .withColumn("ticket_info", lower(trim(regexp_replace(col("ticket_info"), "[*#.:=-]", ""))))
  .withColumn("ticket_category", lower(trim(col("ticket_category"))))
z.show(filteredTicketDF)

In [31]:
// Final schema of the match table that is written out below.
uniqueMatchesDF.printSchema

In [32]:
// Final schema of the ticket table that is written out below.
filteredTicketDF.printSchema

In [33]:
// Destination of the de-duplicated match table.
// NOTE(review): this path lies inside the directory the JSON input is read from
// (filePath) — confirm a later run's spark.read.json(filePath) is not affected by it.
val eventOutputPath = "/user/avk3358_nyu_edu/project/data/cleaned-event-df.parquet"

// Write as parquet, replacing any previous output at that path.
uniqueMatchesDF.write.mode("overwrite").parquet(eventOutputPath)

In [34]:
// Destination of the cleaned ticket table.
// NOTE(review): this path lies inside the directory the JSON input is read from
// (filePath) — confirm a later run's spark.read.json(filePath) is not affected by it.
val ticketOutputPath = "/user/avk3358_nyu_edu/project/data/ticket-df.parquet"

// Write as parquet, replacing any previous output at that path.
filteredTicketDF.write.mode("overwrite").parquet(ticketOutputPath)