In [0]:
# Load the sample trips table and print its column names and types.
trips_table = "samples.nyctaxi.trips"
df = spark.read.table(trips_table)
df.printSchema()


root
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- pickup_zip: integer (nullable = true)
 |-- dropoff_zip: integer (nullable = true)



In [0]:
from pyspark.sql.functions import struct, col

# Pack the pickup zip and pickup timestamp into a single struct column.
pickup_struct = struct(col("pickup_zip"), col("tpep_pickup_datetime"))
df1 = df.withColumn("pickup_info", pickup_struct)

# Preview the packed values untruncated, then check the nested schema.
pickup_preview = df1.select("pickup_info")
pickup_preview.show(5, truncate=False)
df1.printSchema()


+----------------------------+
|pickup_info                 |
+----------------------------+
|{10103, 2016-02-13 21:47:53}|
|{10023, 2016-02-13 18:29:09}|
|{10001, 2016-02-06 19:40:58}|
|{10044, 2016-02-12 19:06:43}|
|{10199, 2016-02-23 10:27:56}|
+----------------------------+
only showing top 5 rows
root
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- pickup_zip: integer (nullable = true)
 |-- dropoff_zip: integer (nullable = true)
 |-- pickup_info: struct (nullable = false)
 |    |-- pickup_zip: integer (nullable = true)
 |    |-- tpep_pickup_datetime: timestamp (nullable = true)



In [0]:
# Read struct members with dotted "struct.field" paths.
nested_paths = ["pickup_info.pickup_zip", "pickup_info.tpep_pickup_datetime"]
df1.select(*[col(path) for path in nested_paths]).show(5)


+----------+--------------------+
|pickup_zip|tpep_pickup_datetime|
+----------+--------------------+
|     10103| 2016-02-13 21:47:53|
|     10023| 2016-02-13 18:29:09|
|     10001| 2016-02-06 19:40:58|
|     10044| 2016-02-12 19:06:43|
|     10199| 2016-02-23 10:27:56|
+----------+--------------------+
only showing top 5 rows


In [0]:
# Pull members out of the struct with getField() and rename them with alias().
pickup_info_col = col("pickup_info")
zip_field = pickup_info_col.getField("pickup_zip").alias("zip")
pickup_time_field = pickup_info_col.getField("tpep_pickup_datetime").alias("pickup_time")
df1.select(zip_field, pickup_time_field).show(5)


+-----+-------------------+
|  zip|        pickup_time|
+-----+-------------------+
|10103|2016-02-13 21:47:53|
|10023|2016-02-13 18:29:09|
|10001|2016-02-06 19:40:58|
|10044|2016-02-12 19:06:43|
|10199|2016-02-23 10:27:56|
+-----+-------------------+
only showing top 5 rows


In [0]:
from pyspark.sql.functions import lit

# withField() with a name the struct does not have yet appends it as a new
# member; here every row gets the constant string "NYC" under "city".
pickup_with_city = col("pickup_info").withField("city", lit("NYC"))
df2 = df1.withColumn("pickup_info", pickup_with_city)

# Preview the extended struct and confirm the extra member in the schema.
df2.select("pickup_info").show(5, truncate=False)
df2.printSchema()


+---------------------------------+
|pickup_info                      |
+---------------------------------+
|{10103, 2016-02-13 21:47:53, NYC}|
|{10023, 2016-02-13 18:29:09, NYC}|
|{10001, 2016-02-06 19:40:58, NYC}|
|{10044, 2016-02-12 19:06:43, NYC}|
|{10199, 2016-02-23 10:27:56, NYC}|
+---------------------------------+
only showing top 5 rows
root
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- pickup_zip: integer (nullable = true)
 |-- dropoff_zip: integer (nullable = true)
 |-- pickup_info: struct (nullable = false)
 |    |-- pickup_zip: integer (nullable = true)
 |    |-- tpep_pickup_datetime: timestamp (nullable = true)
 |    |-- city: string (nullable = false)



In [0]:
# withField() with an existing member name overwrites that member.
# pickup_zip is already an integer here, so the cast leaves the values as they were.
zip_as_int = col("pickup_info.pickup_zip").cast("int")
df3 = df2.withColumn(
    "pickup_info",
    col("pickup_info").withField("pickup_zip", zip_as_int),
)

df3.select("pickup_info").show(5, truncate=False)


+---------------------------------+
|pickup_info                      |
+---------------------------------+
|{10103, 2016-02-13 21:47:53, NYC}|
|{10023, 2016-02-13 18:29:09, NYC}|
|{10001, 2016-02-06 19:40:58, NYC}|
|{10044, 2016-02-12 19:06:43, NYC}|
|{10199, 2016-02-23 10:27:56, NYC}|
+---------------------------------+
only showing top 5 rows


In [0]:
# Rename struct members by rebuilding the struct: alias the two pickup fields
# and carry "city" over under its current name.
renamed_members = [
    col("pickup_info.pickup_zip").alias("zip"),
    col("pickup_info.tpep_pickup_datetime").alias("pickup_time"),
    col("pickup_info.city"),
]
df4 = df3.withColumn("pickup_info", struct(*renamed_members))

# Preview the rebuilt struct and check the new member names in the schema.
df4.select("pickup_info").show(5, truncate=False)
df4.printSchema()


+---------------------------------+
|pickup_info                      |
+---------------------------------+
|{10103, 2016-02-13 21:47:53, NYC}|
|{10023, 2016-02-13 18:29:09, NYC}|
|{10001, 2016-02-06 19:40:58, NYC}|
|{10044, 2016-02-12 19:06:43, NYC}|
|{10199, 2016-02-23 10:27:56, NYC}|
+---------------------------------+
only showing top 5 rows
root
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- pickup_zip: integer (nullable = true)
 |-- dropoff_zip: integer (nullable = true)
 |-- pickup_info: struct (nullable = false)
 |    |-- zip: integer (nullable = true)
 |    |-- pickup_time: timestamp (nullable = true)
 |    |-- city: string (nullable = false)



In [0]:
# Reload the trips table, then print its schema and a five-row preview.
source_table = "samples.nyctaxi.trips"
df = spark.read.table(source_table)
df.printSchema()
df.show(5)


root
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- pickup_zip: integer (nullable = true)
 |-- dropoff_zip: integer (nullable = true)

+--------------------+---------------------+-------------+-----------+----------+-----------+
|tpep_pickup_datetime|tpep_dropoff_datetime|trip_distance|fare_amount|pickup_zip|dropoff_zip|
+--------------------+---------------------+-------------+-----------+----------+-----------+
| 2016-02-13 21:47:53|  2016-02-13 21:57:15|          1.4|        8.0|     10103|      10110|
| 2016-02-13 18:29:09|  2016-02-13 18:37:23|         1.31|        7.5|     10023|      10023|
| 2016-02-06 19:40:58|  2016-02-06 19:52:32|          1.8|        9.5|     10001|      10018|
| 2016-02-12 19:06:43|  2016-02-12 19:20:54|          2.3|       11.5|     10044|      10111|
| 2016-02-23 10:27:56|  2016-02-23 10:58:33|      

In [0]:
from pyspark.sql.functions import create_map, col, lit

# create_map() takes alternating key, value arguments: two literal string keys,
# each paired with the numeric column it labels.
# NOTE: this rebinds df1 to a frame derived from the freshly loaded df.
metric_pairs = [
    lit("fare"), col("fare_amount"),
    lit("distance"), col("trip_distance"),
]
df1 = df.withColumn("trip_metrics", create_map(*metric_pairs))

# Preview the map values untruncated, then check the key/value types.
df1.select("trip_metrics").show(5, truncate=False)
df1.printSchema()


+-------------------------------+
|trip_metrics                   |
+-------------------------------+
|{fare -> 8.0, distance -> 1.4} |
|{fare -> 7.5, distance -> 1.31}|
|{fare -> 9.5, distance -> 1.8} |
|{fare -> 11.5, distance -> 2.3}|
|{fare -> 18.5, distance -> 2.6}|
+-------------------------------+
only showing top 5 rows
root
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- pickup_zip: integer (nullable = true)
 |-- dropoff_zip: integer (nullable = true)
 |-- trip_metrics: map (nullable = false)
 |    |-- key: string
 |    |-- value: double (valueContainsNull = true)



In [0]:
from pyspark.sql.functions import map_keys

# List the keys of each row's trip_metrics map as an array column named "keys".
metric_keys = map_keys(col("trip_metrics")).alias("keys")
df1.select(metric_keys).show(5)


+----------------+
|            keys|
+----------------+
|[fare, distance]|
|[fare, distance]|
|[fare, distance]|
|[fare, distance]|
|[fare, distance]|
+----------------+
only showing top 5 rows


In [0]:
from pyspark.sql.functions import map_values

# List the values of each row's trip_metrics map as an array column named "values".
metric_values = map_values(col("trip_metrics")).alias("values")
df1.select(metric_values).show(5)


+-----------+
|     values|
+-----------+
| [8.0, 1.4]|
|[7.5, 1.31]|
| [9.5, 1.8]|
|[11.5, 2.3]|
|[18.5, 2.6]|
+-----------+
only showing top 5 rows
