In [None]:
# https://mungingdata.com/python/writing-parquet-pandas-pyspark-koalas/

In [None]:
# All paths are derived from one dataset name so that pointing the notebook at
# a different extract means editing a single line.
dataset_name = "buses_nj_thru_2022_03_28"

# Source CSV.
infile = f"./data/in/{dataset_name}.csv"

# Output directories; each later variant extends the previous path with a suffix.
outpath = f"./data/out/{dataset_name}_partitioned_csv_full"
outpath2 = f"{outpath}_1_file_per_partition"
outpath3 = f"{outpath2}_by_hour"

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StringType, DoubleType, TimestampType, LongType

# Create (or reuse) the notebook's SparkSession.
# A bare "local" master runs Spark with a single worker thread; in local mode
# the thread count in the master URL is what sets parallelism, so ask for three
# threads there. The executor-cores setting is kept as it was.
spark = (
    SparkSession.builder
    .master("local[3]")
    .appName("partioning_project")
    .config("spark.executor.cores", 3)
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/04/07 07:27:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/04/07 07:27:14 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/04/07 07:27:14 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [4]:
# # option 1, no schema
# df = spark.read.csv(infile, header = True)
# df.printSchema()

In [5]:
# # option 2, infer schema
# df = spark.read.csv(infile, inferSchema = True, header = True)
# df.printSchema()

In [6]:
# option 3, user-defined schema
# Every column is declared nullable. Only lat/lon are doubles; all other
# columns, including "timestamp", are loaded as plain strings.
#
# "timestamp" is deliberately NOT declared as TimestampType() here:
#   Spark doesn't recognize date64 types, and there is no way to create a
#   user-defined type. A possible solution is to map the timestamp column
#   after the import:
#   https://arrow.apache.org/docs/python/timestamps.html
coordinate_columns = ["lat", "lon"]
string_columns = [
    "cars", "consist", "d", "dn", "fs", "id", "m", "op", "pd",
    "pdrtpifeedname", "pid", "rt", "rtrtpifeedname", "rtdd",
    "rtpifeedname", "run", "wid1", "wid2", "timestamp",
]

# StructType.add appends the field in place, so the loops below preserve the
# column order of the CSV: pkey, lat, lon, then the string columns.
schema = StructType()
schema.add("pkey", StringType(), True)
for column_name in coordinate_columns:
    schema.add(column_name, DoubleType(), True)
for column_name in string_columns:
    schema.add(column_name, StringType(), True)

df = spark.read.csv(infile, schema=schema, header=True)

In [7]:
# Confirm the user-defined schema was applied to the loaded DataFrame.
df.printSchema()

root
 |-- pkey: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- lon: double (nullable = true)
 |-- cars: string (nullable = true)
 |-- consist: string (nullable = true)
 |-- d: string (nullable = true)
 |-- dn: string (nullable = true)
 |-- fs: string (nullable = true)
 |-- id: string (nullable = true)
 |-- m: string (nullable = true)
 |-- op: string (nullable = true)
 |-- pd: string (nullable = true)
 |-- pdrtpifeedname: string (nullable = true)
 |-- pid: string (nullable = true)
 |-- rt: string (nullable = true)
 |-- rtrtpifeedname: string (nullable = true)
 |-- rtdd: string (nullable = true)
 |-- rtpifeedname: string (nullable = true)
 |-- run: string (nullable = true)
 |-- wid1: string (nullable = true)
 |-- wid2: string (nullable = true)
 |-- timestamp: string (nullable = true)



In [8]:
# Preview the first five rows of the raw load.
df.show(5)

+----+-------+--------+----+-------+----------------+---+--------------------+-----+---+------+--------------------+--------------+----+---+--------------+----+------------+---+----+----+-------------------+
|pkey|    lat|     lon|cars|consist|               d| dn|                  fs|   id|  m|    op|                  pd|pdrtpifeedname| pid| rt|rtrtpifeedname|rtdd|rtpifeedname|run|wid1|wid2|          timestamp|
+----+-------+--------+----+-------+----------------+---+--------------------+-----+---+------+--------------------+--------------+----+---+--------------+----+------------+---+----+----+-------------------+
|   1| 40.722|-74.2819|null|   null|      West Bound|WNW|           70 SUMMIT| 5819|  1|482580|Livingston/Florha...|            \N|1666| 70|            \N|  70|          \N| 14|0014|0070|2021-04-05 23:16:58|
|   2|40.7344|-74.1894|null|   null|      East Bound|ENE|25 SPRINGFIELD AV...| 5820|  1|547664|Doremus Avenue (N...|            \N| 737| 25|            \N|  25|        

In [9]:
# https://sparkbyexamples.com/spark/pyspark-to_timestamp-convert-string-to-timestamp-type/
# Import only the functions this notebook uses. A wildcard import from
# pyspark.sql.functions shadows Python builtins such as sum, min, max and round.
from pyspark.sql.functions import col, dayofmonth, hour, month, to_timestamp, year

# Convert the string "timestamp" column to TimestampType, replacing it in place.
# NOTE(review): no explicit format is passed, so Spark's default parsing is used;
# with default (non-ANSI) settings unparseable values presumably become null - confirm.
new_df = df.withColumn("timestamp", to_timestamp("timestamp"))

In [10]:
# Check the schema of the converted DataFrame ("timestamp" should no longer be a string).
new_df.printSchema()

root
 |-- pkey: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- lon: double (nullable = true)
 |-- cars: string (nullable = true)
 |-- consist: string (nullable = true)
 |-- d: string (nullable = true)
 |-- dn: string (nullable = true)
 |-- fs: string (nullable = true)
 |-- id: string (nullable = true)
 |-- m: string (nullable = true)
 |-- op: string (nullable = true)
 |-- pd: string (nullable = true)
 |-- pdrtpifeedname: string (nullable = true)
 |-- pid: string (nullable = true)
 |-- rt: string (nullable = true)
 |-- rtrtpifeedname: string (nullable = true)
 |-- rtdd: string (nullable = true)
 |-- rtpifeedname: string (nullable = true)
 |-- run: string (nullable = true)
 |-- wid1: string (nullable = true)
 |-- wid2: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)



In [11]:
# Preview five rows after the timestamp conversion.
new_df.show(5)

+----+-------+--------+----+-------+----------------+---+--------------------+-----+---+------+--------------------+--------------+----+---+--------------+----+------------+---+----+----+-------------------+
|pkey|    lat|     lon|cars|consist|               d| dn|                  fs|   id|  m|    op|                  pd|pdrtpifeedname| pid| rt|rtrtpifeedname|rtdd|rtpifeedname|run|wid1|wid2|          timestamp|
+----+-------+--------+----+-------+----------------+---+--------------------+-----+---+------+--------------------+--------------+----+---+--------------+----+------------+---+----+----+-------------------+
|   1| 40.722|-74.2819|null|   null|      West Bound|WNW|           70 SUMMIT| 5819|  1|482580|Livingston/Florha...|            \N|1666| 70|            \N|  70|          \N| 14|0014|0070|2021-04-05 23:16:58|
|   2|40.7344|-74.1894|null|   null|      East Bound|ENE|25 SPRINGFIELD AV...| 5820|  1|547664|Doremus Avenue (N...|            \N| 737| 25|            \N|  25|        

In [12]:
# new_df.write \
#     .mode('overwrite')\
#     .partitionBy("rt")\
#     .parquet(outpath)

In [13]:
# # write it with 1 file per partition (eliminate step 2)

# new_df.repartition("rt") \
#     .write \
#     .mode('overwrite')\
#     .partitionBy("rt")\
#     .parquet(outpath2)

# add year, month, day and hour columns and partition on those

In [14]:
# Derive calendar columns from the parsed timestamp, shuffle so that all rows
# sharing a (year, month, day, hour) key sit in the same Spark partition, then
# write Parquet partitioned on those same columns, overwriting earlier output.
partition_cols = ["year", "month", "day", "hour"]
event_time = col("timestamp")

hourly_df = (
    new_df
    .withColumn("year", year(event_time))
    .withColumn("month", month(event_time))
    .withColumn("day", dayofmonth(event_time))
    .withColumn("hour", hour(event_time))
    .repartition(*partition_cols)
)

hourly_df.write.mode("overwrite").partitionBy(*partition_cols).parquet(outpath3)

22/04/07 07:27:17 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                