In [0]:
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{Row, SparkSession}

// Directory that holds one CSV file per stock.
val directoryPath = "/user/sb9509_nyu_edu/stocks"
// File system handle built from the Hadoop configuration of the active Spark context.
val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)
// Full path strings of every entry in the directory whose file name ends with ".csv".
val stockFiles = {
  val directoryEntries = fs.listStatus(new Path(directoryPath))
  directoryEntries.map(_.getPath).collect {
    case entryPath if entryPath.getName.endsWith(".csv") => entryPath.toString
  }
}

// Profiles every stock file and yields one tuple per file:
//   (stockName, earliest Date, min Close, max Close, number of null Close values, stddev of Close).
// Numeric statistics that Spark reports as null (e.g. a file with no usable Close values)
// are returned as Double.NaN instead of failing the whole profiling run.
val results = stockFiles.map { filePath =>
  // Stock symbol = file name without the ".L.csv" suffix.
  val stockName = filePath.split("/").last.stripSuffix(".L.csv")

  // The file is read without a header, so every line arrives as a data row.
  val rawDf = spark.read.option("header", false).option("inferSchema", "true").csv(filePath)
  // Drop the first three rows (indices 0-2) — presumably header lines; confirm against the CSV layout.
  val filteredRDD = rawDf.rdd.zipWithIndex().filter { case (_, idx) => idx >= 3 }.map(_._1)
  val filteredDf = spark.createDataFrame(filteredRDD, rawDf.schema)
  // Positional renaming: the file must contain exactly these seven columns.
  val columnNames = Seq("Date", "AdjClose", "Close", "Open", "High", "Low", "Volume")
  val finalDf = filteredDf.toDF(columnNames: _*)
  val selectedDf = finalDf.select($"Date", $"Close").withColumn("Date", to_date($"Date", "yyyy-MM-dd")).withColumn("Close", $"Close".cast("double"))

  // Compute all statistics in a single aggregation so the file is scanned once
  // rather than once per statistic.
  val stats = selectedDf.agg(
    min("Date"),
    min("Close"),
    max("Close"),
    // count() skips nulls, so this counts exactly the rows whose Close is null.
    count(when($"Close".isNull, 1)),
    stddev("Close")
  ).head()

  // Row.getDouble throws on a null cell; fall back to NaN for every numeric statistic.
  def doubleOrNaN(index: Int): Double =
    if (stats.isNullAt(index)) Double.NaN else stats.getDouble(index)

  val startDate = stats.getDate(0) // null when the file has no parseable dates
  val minValue = doubleOrNaN(1)
  val maxValue = doubleOrNaN(2)
  val nullCount = stats.getLong(3)
  val stdDev = doubleOrNaN(4)

  (stockName, startDate, minValue, maxValue, nullCount, stdDev)
}


In [1]:
// Profiling: turn the per-stock statistics into a DataFrame and expose it to SQL cells.

import spark.implicits._
val resultsDf = {
  // One column name per element of the profiling tuples, in tuple order.
  val profileColumns = Seq("StockSymbol", "StartDate", "MinValue", "MaxValue", "NullCount", "StdDev")
  results.toSeq.toDF(profileColumns: _*)
}

// Register all columns under the temp view name queried by the %sql cells.
val valueDistribution = resultsDf.select("*")
valueDistribution.createOrReplaceTempView("value_distribution")

In [2]:
%sql
-- Show the per-stock profiling statistics registered in the value_distribution view.
SELECT
    StockSymbol,
    StartDate,
    MaxValue,
    MinValue,
    NullCount,
    StdDev
FROM value_distribution

In [3]:
%sql
-- Number of distinct stocks per start year, limited to start years 2000 through 2024.
SELECT
    YEAR(CAST(StartDate AS DATE)) AS Year,
    COUNT(DISTINCT StockSymbol) AS StockCount
FROM value_distribution
WHERE YEAR(CAST(StartDate AS DATE)) >= 2000
  AND YEAR(CAST(StartDate AS DATE)) <= 2024
GROUP BY YEAR(CAST(StartDate AS DATE))
ORDER BY Year


In [4]:
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions.Window


// For each stock file: load Date/Close, fill missing Close values (forward fill first,
// then backward fill for any leading gaps) and write the result as CSV with a header.
for (filePath <- stockFiles) {
  // Stock symbol = file name without the ".L.csv" suffix.
  val stockName = filePath.split("/").last.stripSuffix(".L.csv")

  // Read without a header so every line of the file is a data row.
  val sourceDf = spark.read
    .option("header", false)
    .option("inferSchema", "true")
    .csv(filePath)

  // Keep only rows from index 3 onward — presumably the first three are header lines; confirm.
  val dataRows = sourceDf.rdd
    .zipWithIndex()
    .filter { case (_, position) => position >= 3 }
    .map { case (row, _) => row }

  // Rebuild a DataFrame with the original schema, rename columns positionally,
  // then keep Date (parsed as yyyy-MM-dd) and Close (cast to double).
  val closeByDate = spark.createDataFrame(dataRows, sourceDf.schema)
    .toDF("Date", "AdjClose", "Close", "Open", "High", "Low", "Volume")
    .select($"Date", $"Close")
    .withColumn("Date", to_date($"Date", "yyyy-MM-dd"))
    .withColumn("Close", $"Close".cast("double"))

  // Date-ordered frames: everything up to the current row, and everything from it onward.
  val upToCurrentRow = Window.orderBy("Date").rowsBetween(Window.unboundedPreceding, Window.currentRow)
  val fromCurrentRow = Window.orderBy("Date").rowsBetween(Window.currentRow, Window.unboundedFollowing)

  // Forward fill: most recent non-null Close at or before each row.
  val forwardFilled = closeByDate.withColumn("Close", last($"Close", ignoreNulls = true).over(upToCurrentRow))
  // Backward fill whatever is still null: next non-null Close at or after the row.
  val cleanedDf = forwardFilled.withColumn(
    "Close",
    coalesce($"Close", first($"Close", ignoreNulls = true).over(fromCurrentRow))
  )

  // One output directory per stock; existing output is replaced.
  cleanedDf.write
    .mode("overwrite")
    .option("header", "true")
    .csv(s"/user/sb9509_nyu_edu/stocks_cleaned/$stockName")
}
