In [0]:
// Location of the raw hotel CSV that the rest of the notebook reads.
val filePath: String = "/user/yl12081_nyu_edu/hotel_data.csv"

In [1]:
// Load the hotel CSV into a DataFrame. Reader options: the first row is a
// header, records may span multiple lines, column types are inferred, and a
// double quote is used as the escape character.
val rawDF = spark.read
  .options(Map(
    "header" -> "true",
    "multiLine" -> "true",
    "inferSchema" -> "true",
    "escape" -> "\""
  ))
  .csv(filePath)


In [2]:
// Display the freshly loaded raw DataFrame in the notebook.
z.show(rawDF)

In [3]:
// Project the raw data down to the eleven columns this notebook works with.
val baseDF = {
  val keptColumns = Seq(
    "name", "city", "country", "address", "avg_review", "property_type",
    "size", "checkin", "checkout", "price", "info_date"
  )
  rawDF.select(keptColumns.head, keptColumns.tail: _*)
}


In [4]:
// Print the projected schema, then cache baseDF and run a count on it
// (the count is the action that triggers evaluation of the cached DataFrame).
baseDF.printSchema()
baseDF.cache().count()

In [5]:
// Parse the checkin / checkout values with the pattern "yyyy-MM-dd" and append
// the results as the columns check_in_date and check_out_date.
val dfWithDateTypes = baseDF.select(
  $"*",
  to_date($"checkin", "yyyy-MM-dd").as("check_in_date"),
  to_date($"checkout", "yyyy-MM-dd").as("check_out_date")
)

// Add the column 'stay_duration': the number of days from check_in_date to
// check_out_date.
val dfWithStayDuration = dfWithDateTypes.withColumn(
  "stay_duration",
  datediff($"check_out_date", $"check_in_date")
)

z.show(dfWithStayDuration)

In [6]:
// Print the schema, which now includes the derived date and duration columns.
dfWithStayDuration.printSchema()

In [7]:
// Show summary statistics for the numeric columns before imputation.
z.show(
  dfWithStayDuration
    .select("avg_review", "size", "price", "stay_duration")
    .summary()
)

In [8]:
import org.apache.spark.ml.feature.Imputer

// Median imputer over the four numeric columns. The output column names equal
// the input column names, so the imputed values are written back under the
// same names.
val imputer = {
  val numericCols = Array("size", "price", "avg_review", "stay_duration")
  new Imputer()
    .setStrategy("median")
    .setInputCols(numericCols)
    .setOutputCols(numericCols)
}

// Fit the imputer on the data and apply the fitted model to that same data.
val imputedDF = {
  val imputerModel = imputer.fit(dfWithStayDuration)
  imputerModel.transform(dfWithStayDuration)
}

In [9]:
// Display the four imputed numeric columns.
z.show(
  imputedDF.select("size", "price", "avg_review", "stay_duration")
)

In [10]:
// Show summary statistics for the numeric columns after imputation, then the
// total row count.
z.show(
  imputedDF
    .select("size", "price", "avg_review", "stay_duration")
    .summary()
)
imputedDF.count()

In [11]:
// Keep only rows whose price lies in the inclusive range [50, 1500], then
// report how many rows remain.
val posPricesDF = imputedDF.where($"price".between(50, 1500))
posPricesDF.count()

In [12]:
// Keep only rows whose size lies in the inclusive range [10, 1000], then
// report how many rows remain.
val cleanDF = posPricesDF.where($"size".between(10, 1000))
cleanDF.count()

In [13]:
// Display the rows that passed both the price and the size range filters.
z.show(cleanDF)

In [14]:
// Build the final hotel table: add a generated hotel_id column, rename the
// remaining columns to their output names, and drop the original
// checkin / checkout strings and property_type.
val hotelCleanDF = cleanDF
  .withColumn("hotel_id", monotonically_increasing_id())
  .withColumnRenamed("price", "hotel_price")
  .withColumnRenamed("name", "hotel_name")
  .withColumnRenamed("address", "hotel_address")
  .withColumnRenamed("stay_duration", "hotel_stay_duration")
  .withColumnRenamed("size", "square_feet")
  .withColumnRenamed("info_date", "hotel_info_date")
  .drop("checkin", "checkout", "property_type")

In [15]:
// Display the final, renamed hotel DataFrame.
z.show(hotelCleanDF)

In [16]:
// Print the schema of the final hotel DataFrame.
hotelCleanDF.printSchema()

In [17]:
// Row count of the final hotel DataFrame.
hotelCleanDF.count()

In [18]:
// Destination for the cleaned hotel data.
val outputPath: String = "/user/yl12081_nyu_edu/hotel_data_clean_refined.parquet"

// Write the result as Parquet, replacing any existing output at that path.
hotelCleanDF.write
  .format("parquet")
  .mode("overwrite")
  .save(outputPath)