In [1]:
# https://mungingdata.com/python/writing-parquet-pandas-pyspark-koalas/

In [2]:
# Input: the CSV file that the read cell further down loads via spark.read.csv.
infile = "./data/in/buses_nj_thru_2022_03_28.csv"

# Output: target path of the final write cell.
# NOTE: that cell writes Parquet, even though this name says "csv".
outpath = "./data/out/buses_nj_thru_2022_03_28_partitioned_csv_1mo"


In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StringType, DoubleType, TimestampType, LongType

# Build (or reuse) the local Spark session for this notebook.
# "local" alone runs Spark with a single worker thread, and spark.executor.cores
# does not size local mode, so the 3 cores requested below were never used.
# "local[3]" asks for 3 worker threads, matching that setting.
spark = (
    SparkSession.builder
    .master("local[3]")
    .appName("partioning_project")
    .config("spark.executor.cores", 3)  # kept for parity with cluster runs
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/04/21 20:18:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# # option 1, no schema
# df = spark.read.csv(infile, header = True)
# df.printSchema()

In [5]:
# # option 2, infer schema
# df = spark.read.csv(infile, inferSchema = True, header = True)
# df.printSchema()

In [6]:
# option 3, user-defined schema
# Every column is declared nullable. Only the coordinates are typed as doubles;
# all other columns, including "timestamp", are read as plain strings.
column_types = [
    ("pkey", StringType()),
    ("lat", DoubleType()),
    ("lon", DoubleType()),
]
column_types += [
    (column_name, StringType())
    for column_name in (
        "cars", "consist", "d", "dn", "fs", "id", "m", "op", "pd",
        "pdrtpifeedname", "pid", "rt", "rtrtpifeedname", "rtdd",
        "rtpifeedname", "run", "wid1", "wid2",
        "timestamp",
    )
]
# "timestamp" is kept as a string instead of .add("timestamp", TimestampType(), True):
# Spark doesn't recognize date64 types, and there is no way to create a user-defined type.
# Possible follow-up: map the timestamp column after the import, see
# https://arrow.apache.org/docs/python/timestamps.html

schema = StructType()
for field_name, field_type in column_types:
    # StructType.add appends the field to this schema object in place.
    schema.add(field_name, field_type, True)

# The first line of the file is treated as a header; column types come from `schema`.
df = spark.read.schema(schema).csv(infile, header=True)

In [7]:
# https://sparkbyexamples.com/spark/pyspark-to_timestamp-convert-string-to-timestamp-type/
# Import only the function used here: a star import from pyspark.sql.functions
# shadows Python builtins such as sum, min, max, abs and round for every later cell.
from pyspark.sql.functions import to_timestamp

# Timestamp string -> TimestampType. No format is passed, so Spark applies its
# default string-to-timestamp casting rules to the "timestamp" column.
new_df = df.withColumn("timestamp", to_timestamp("timestamp"))

In [10]:
from pyspark.sql.functions import col

# Inclusive calendar-date range to keep (first and last day).
dates = ("2022-02-01",  "2022-02-28")

# Compare on the date part of the timestamp. Comparing the raw timestamp against
# "2022-02-28" treats the upper bound as 2022-02-28 00:00:00 and silently drops
# every row later on the last day; casting to date keeps the whole day.
new_df2 = new_df.where(col('timestamp').cast('date').between(*dates))


In [12]:
# Reduce the filtered frame to a single partition, then write it as Parquet to
# `outpath`, replacing whatever already exists there.
(
    new_df2
    .coalesce(1)
    .write
    .parquet(outpath, mode='overwrite')
)

                                                                                