In [1]:
import pandas as pd

data = pd.read_excel("superstore.xls")
superstore = spark.createDataFrame(data)



In [2]:
#df.printSchema() shows the schema

superstore.printSchema()

root
 |-- Row ID: long (nullable = true)
 |-- Order ID: string (nullable = true)
 |-- Order Date: timestamp (nullable = true)
 |-- Ship Date: timestamp (nullable = true)
 |-- Ship Mode: string (nullable = true)
 |-- Customer ID: string (nullable = true)
 |-- Customer Name: string (nullable = true)
 |-- Segment: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Postal Code: double (nullable = true)
 |-- Region: string (nullable = true)
 |-- Product ID: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Sub-Category: string (nullable = true)
 |-- Product Name: string (nullable = true)
 |-- Sales: double (nullable = true)
 |-- Quantity: long (nullable = true)
 |-- Discount: double (nullable = true)
 |-- Profit: double (nullable = true)



In [11]:
#df.selectExpr("`col name` as id") gives a df with col name renamed in display

superstore.selectExpr("`Order ID` as id").show(5)

+--------------+
|            id|
+--------------+
|CA-2017-152156|
|CA-2017-152156|
|CA-2017-138688|
|US-2016-108966|
|US-2016-108966|
+--------------+
only showing top 5 rows



In [14]:
#df.where("filter condition") is used to filter the data based on a given logical condition

superstore.where("Quantity > 10")\
          .selectExpr("`Order ID`", "`Postal Code`", "`Quantity`")\
          .show(5)

+--------------+-----------+--------+
|      Order ID|Postal Code|Quantity|
+--------------+-----------+--------+
|CA-2015-115259|    43229.0|      14|
|CA-2017-145583|    95661.0|      14|
|CA-2017-114489|    53132.0|      11|
|CA-2017-145625|    92037.0|      13|
|CA-2015-122336|    19140.0|      13|
+--------------+-----------+--------+
only showing top 5 rows



In [7]:
from pyspark.sql.functions import column

#df.sort(col("col name")).desc() orders the dataframe in descending order

superstore.where(column("Category") == "Furniture")\
          .select("Order ID", "Postal Code", "Quantity")\
          .sort(column("Quantity").desc())\
          .show(5)

+--------------+-----------+--------+
|      Order ID|Postal Code|Quantity|
+--------------+-----------+--------+
|CA-2016-104241|    22304.0|      14|
|CA-2015-120768|    35630.0|      14|
|CA-2015-163447|    10011.0|      14|
|CA-2018-152702|    61107.0|      14|
|CA-2017-145583|    95661.0|      14|
+--------------+-----------+--------+
only showing top 5 rows



In [15]:
#multiple df.where fucntions can be used to filter more conditions

superstore.where(column("Category") != "Furniture")\
          .where(column("Region") == "West")\
          .selectExpr("Category", "Region", "Quantity")\
          .show(5)

+---------------+------+--------+
|       Category|Region|Quantity|
+---------------+------+--------+
|Office Supplies|  West|       2|
|Office Supplies|  West|       4|
|     Technology|  West|       6|
|Office Supplies|  West|       3|
|Office Supplies|  West|       5|
+---------------+------+--------+
only showing top 5 rows



In [19]:
#df.createOrReplaceTempView("view name") can be used to create temp view which can be used to query tables using spark.sql

superstore.createOrReplaceTempView("superstore_view")

spark.sql("""DESCRIBE FORMATTED superstore_view""").show(21)

+-------------+---------+-------+
|     col_name|data_type|comment|
+-------------+---------+-------+
|       Row ID|   bigint|   null|
|     Order ID|   string|   null|
|   Order Date|timestamp|   null|
|    Ship Date|timestamp|   null|
|    Ship Mode|   string|   null|
|  Customer ID|   string|   null|
|Customer Name|   string|   null|
|      Segment|   string|   null|
|      Country|   string|   null|
|         City|   string|   null|
|        State|   string|   null|
|  Postal Code|   double|   null|
|       Region|   string|   null|
|   Product ID|   string|   null|
|     Category|   string|   null|
| Sub-Category|   string|   null|
| Product Name|   string|   null|
|        Sales|   double|   null|
|     Quantity|   bigint|   null|
|     Discount|   double|   null|
|       Profit|   double|   null|
+-------------+---------+-------+



In [20]:
#spark.sql("SQL QUERY") can be used to query temp views

spark.sql("SELECT Category, Region, Quantity FROM superstore_view").show(5)

+---------------+------+--------+
|       Category|Region|Quantity|
+---------------+------+--------+
|      Furniture| South|       2|
|      Furniture| South|       3|
|Office Supplies|  West|       2|
|      Furniture| South|       5|
|Office Supplies| South|       2|
+---------------+------+--------+
only showing top 5 rows



In [21]:
spark.sql("""SELECT  Category, Region, SUM(Quantity)
FROM superstore_view 
GROUP BY Category, Region
ORDER BY SUM(Quantity) DESC LIMIT 10""")

DataFrame[Category: string, Region: string, sum(Quantity): bigint]

In [22]:
spark.sql(""" SELECT Category, (Region = "Furniture" AND Quantity > 10 OR SALES > 1000) as test
FROM superstore_view WHERE Region = "Furniture" AND Quantity > 10 OR SALES > 1000""").show(5)

+----------+----+
|  Category|test|
+----------+----+
| Furniture|true|
| Furniture|true|
| Furniture|true|
|Technology|true|
|Technology|true|
+----------+----+
only showing top 5 rows



In [11]:
#df.describe() can be used to display the summary of the dataframe
#df.toPandas() can be used to convert a spark dataframe into pandas dataframe

superstore.describe().toPandas().T

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
Row ID,9994,4997.5,2885.1636290974325,1,9994
Order ID,9994,,,CA-2015-100006,US-2018-169551
Ship Mode,9994,,,First Class,Standard Class
Customer ID,9994,,,AA-10315,ZD-21925
Customer Name,9994,,,Aaron Bergman,Zuschuss Donatelli
Segment,9994,,,Consumer,Home Office
Country,9994,,,United States,United States
City,9994,,,Aberdeen,Yuma
State,9994,,,Alabama,Wyoming


In [23]:
#df.distinct() gives the distinct values in a selected column

superstore.select("State").distinct().sort("State").show(5)

+----------+
|     State|
+----------+
|   Alabama|
|   Arizona|
|  Arkansas|
|California|
|  Colorado|
+----------+
only showing top 5 rows



In [24]:
#df.sample() can be used to randomly select a sample of data from a dataframe

seed = 5
withReplacement = False
fraction = 0.3

superstore.sample(withReplacement, fraction, seed).count()

3002

In [25]:
#df.randomSplit() can be used to split the data into two fractions

df = superstore.randomSplit([0.25, 0.75], seed = 5)

In [26]:
print("Test: ", df[0].count(),"\n","Train: ", df[1].count())

Test:  2527 
 Train:  7467


In [27]:
#df.union(df2) can be used to join two tables of same columns

test = df[0]
train = df[1]

new = test.union(train)
new.count()

9994

In [28]:
from pyspark.sql.functions import desc, asc, col

newdf = superstore.select("State").distinct()

In [18]:
newdf.count()

49

In [19]:
newdf.show(5)

+---------+
|    State|
+---------+
|     Utah|
|Minnesota|
|     Ohio|
|   Oregon|
| Arkansas|
+---------+
only showing top 5 rows



In [20]:
newdf.sort("State").show(5)

+----------+
|     State|
+----------+
|   Alabama|
|   Arizona|
|  Arkansas|
|California|
|  Colorado|
+----------+
only showing top 5 rows



In [21]:
#df.orderBy(col("col name")) can also be used to sort a dataframe column

from pyspark.sql.functions import desc

newdf.orderBy(col("State").desc()).show(5)

+-------------+
|        State|
+-------------+
|      Wyoming|
|    Wisconsin|
|West Virginia|
|   Washington|
|     Virginia|
+-------------+
only showing top 5 rows



In [22]:
superstore.rdd.getNumPartitions()

4

In [23]:
newdf = superstore.repartition(8)

print(newdf.rdd.getNumPartitions())

8


In [24]:
newdf = newdf.coalesce(2)

print(newdf.rdd.getNumPartitions())

2


In [25]:
superstore.rdd.getNumPartitions()

4

In [26]:
superstore.where(col("State") != "Washington")\
.select("State", "Quantity")\
.groupBy("State")\
.sum("Quantity")\
.withColumnRenamed("sum(Quantity)", "Quantity")\
.orderBy(col("Quantity").desc())\
.show(5)

+------------+--------+
|       State|Quantity|
+------------+--------+
|  California|    7667|
|    New York|    4224|
|       Texas|    3724|
|Pennsylvania|    2153|
|    Illinois|    1845|
+------------+--------+
only showing top 5 rows



In [27]:
quantFilter = col("Quantity") > 5
catFilter = col("Category") != "Furniture"
cols = ["State", "Ship Mode", "Quantity"]
superstore.where(quantFilter | catFilter)\
.select("State", "Ship Mode", "Quantity")\
.groupBy("State", "Ship Mode")\
.sum("Quantity")\
.withColumnRenamed("sum(Quantity)", "Quantity")\
.orderBy(cols, ascending = True)\
.show(5)

+-------+--------------+--------+
|  State|     Ship Mode|Quantity|
+-------+--------------+--------+
|Alabama|   First Class|      33|
|Alabama|      Same Day|       2|
|Alabama|  Second Class|      77|
|Alabama|Standard Class|     133|
|Arizona|   First Class|     149|
+-------+--------------+--------+
only showing top 5 rows



In [28]:
from pyspark.sql.functions import corr

superstore.stat.corr("Quantity", "Profit")
superstore.select(corr("Quantity", "Profit")).show()

+----------------------+
|corr(Quantity, Profit)|
+----------------------+
|   0.06625318912428482|
+----------------------+



In [29]:
superstore.select("Sales", "Profit", "Quantity", "Discount").describe().show()

+-------+-------------------+------------------+------------------+-------------------+
|summary|              Sales|            Profit|          Quantity|           Discount|
+-------+-------------------+------------------+------------------+-------------------+
|  count|               9994|              9994|              9994|               9994|
|   mean| 229.85800083049847|  28.6568963077847| 3.789573744246548|0.15620272163297735|
| stddev|   623.245100508681| 234.2601076909574|2.2251096911414012|0.20645196782571612|
|    min|0.44399999999999995|-6599.978000000001|                 1|                0.0|
|    max|           22638.48| 8399.975999999999|                14|                0.8|
+-------+-------------------+------------------+------------------+-------------------+



In [30]:
colName = "Sales"
quantileProbs = [0.25, 0.5, 0.75]
relError = 0.05

for i in superstore.stat.approxQuantile(colName, quantileProbs, relError):
    print(round(i, 3))

18.24
59.76
206.112


In [31]:
from pyspark.sql.functions import monotonically_increasing_id
superstore.select(monotonically_increasing_id()).withColumnRenamed("monotonically_increasing_id()", "ROW_ID").show(5)

+------+
|ROW_ID|
+------+
|     0|
|     1|
|     2|
|     3|
|     4|
+------+
only showing top 5 rows



In [32]:
from pyspark.sql.functions import initcap, lower, upper, lit

superstore.select(initcap(lit("test row")), upper(lit("test row")), lower(lit("TEST ROW"))).show(5)

+-----------------+---------------+---------------+
|initcap(test row)|upper(test row)|lower(TEST ROW)|
+-----------------+---------------+---------------+
|         Test Row|       TEST ROW|       test row|
|         Test Row|       TEST ROW|       test row|
|         Test Row|       TEST ROW|       test row|
|         Test Row|       TEST ROW|       test row|
|         Test Row|       TEST ROW|       test row|
+-----------------+---------------+---------------+
only showing top 5 rows



In [33]:
from pyspark.sql.functions import ltrim, rtrim, lpad, rpad, trim

superstore.select(ltrim(lit("          hello         ")).alias("ltrim"),
                 rtrim(lit("         hello         ")).alias("rtrim"),
                 trim(lit("           hello        ")).alias("trim"),
                 rpad(lit("hello"), 10, " ").alias("rpad"),
                 lpad(lit("hello"), 3, " ").alias("lpad")).show(5)


+--------------+--------------+-----+----------+----+
|         ltrim|         rtrim| trim|      rpad|lpad|
+--------------+--------------+-----+----------+----+
|hello         |         hello|hello|hello     | hel|
|hello         |         hello|hello|hello     | hel|
|hello         |         hello|hello|hello     | hel|
|hello         |         hello|hello|hello     | hel|
|hello         |         hello|hello|hello     | hel|
+--------------+--------------+-----+----------+----+
only showing top 5 rows



In [34]:
from pyspark.sql.functions import regexp_replace

regex_string = "2017|2018"

superstore.select(regexp_replace(col("Order ID"), regex_string, "latest").alias("latest orders")).show(5)

+----------------+
|   latest orders|
+----------------+
|CA-latest-152156|
|CA-latest-152156|
|CA-latest-138688|
|  US-2016-108966|
|  US-2016-108966|
+----------------+
only showing top 5 rows



In [35]:
from pyspark.sql.functions import translate

superstore.select(translate(col("State"), "AEIOUaeiou", "0123456789").alias("translated state")).show(5)

+----------------+
|translated state|
+----------------+
|        K6nt9cky|
|        K6nt9cky|
|      C5l7f8rn75|
|         Fl8r7d5|
|         Fl8r7d5|
+----------------+
only showing top 5 rows



In [36]:
from pyspark.sql.functions import current_date, current_timestamp

dateDF = spark.range(10)\
.withColumn("today", current_date())\
.withColumn("now", current_timestamp())

dateDF.show(5, False)

+---+----------+----------------------+
|id |today     |now                   |
+---+----------+----------------------+
|0  |2019-01-17|2019-01-17 11:15:55.82|
|1  |2019-01-17|2019-01-17 11:15:55.82|
|2  |2019-01-17|2019-01-17 11:15:55.82|
|3  |2019-01-17|2019-01-17 11:15:55.82|
|4  |2019-01-17|2019-01-17 11:15:55.82|
+---+----------+----------------------+
only showing top 5 rows



In [37]:
dateDF.printSchema()

root
 |-- id: long (nullable = false)
 |-- today: date (nullable = false)
 |-- now: timestamp (nullable = false)



In [38]:
type(dateDF)

pyspark.sql.dataframe.DataFrame

In [39]:
dateDF.columns

['id', 'today', 'now']

In [40]:
dateDF.rdd.getNumPartitions()

4

In [41]:
from pyspark.sql.functions import date_add, date_sub, date_trunc

dateDF.select(date_add(col("today"), 5), 
              date_sub(col("today"), 5), 
              date_trunc("yyyy", col("today"))).show(5)

+------------------+------------------+-----------------------+
|date_add(today, 5)|date_sub(today, 5)|date_trunc(yyyy, today)|
+------------------+------------------+-----------------------+
|        2019-01-22|        2019-01-12|    2019-01-01 00:00:00|
|        2019-01-22|        2019-01-12|    2019-01-01 00:00:00|
|        2019-01-22|        2019-01-12|    2019-01-01 00:00:00|
|        2019-01-22|        2019-01-12|    2019-01-01 00:00:00|
|        2019-01-22|        2019-01-12|    2019-01-01 00:00:00|
+------------------+------------------+-----------------------+
only showing top 5 rows



In [42]:
from pyspark.sql.functions import to_date

spark.range(5).withColumn("date", lit("2019-01-01")).select(to_date(col("date"))).show()

+---------------+
|to_date(`date`)|
+---------------+
|     2019-01-01|
|     2019-01-01|
|     2019-01-01|
|     2019-01-01|
|     2019-01-01|
+---------------+



In [43]:
from pyspark.sql.functions import datediff, months_between, to_date

dateDF.withColumn("week_ago", date_sub(col("today"), 7)).select(datediff(col("week_ago"), col("today"))).show(5)

+-------------------------+
|datediff(week_ago, today)|
+-------------------------+
|                       -7|
|                       -7|
|                       -7|
|                       -7|
|                       -7|
+-------------------------+
only showing top 5 rows



In [44]:
dateDF.select(to_date(lit("2016-04-01")).alias("start"),
              to_date(lit("2016-05-21")).alias("end"))\
              .select(months_between(col("start"), col("end"))).show(5)

+--------------------------+
|months_between(start, end)|
+--------------------------+
|               -1.64516129|
|               -1.64516129|
|               -1.64516129|
|               -1.64516129|
|               -1.64516129|
+--------------------------+
only showing top 5 rows



In [45]:
dateDF.select(to_date(lit("01-04-1991"), "dd-MM-yyyy").alias("start"),
             to_date(lit("02-01-2019"), "dd-MM-yyyy").alias("end"))\
            .select(months_between(col("end"), col("start"))).show(5)

+--------------------------+
|months_between(end, start)|
+--------------------------+
|              333.03225806|
|              333.03225806|
|              333.03225806|
|              333.03225806|
|              333.03225806|
+--------------------------+
only showing top 5 rows



In [46]:
from pyspark.sql.functions import coalesce

superstore.select(coalesce(col("City"), col("Product Name"))).show(5)

+----------------------------+
|coalesce(City, Product Name)|
+----------------------------+
|                   Henderson|
|                   Henderson|
|                 Los Angeles|
|             Fort Lauderdale|
|             Fort Lauderdale|
+----------------------------+
only showing top 5 rows



In [47]:
superstore.na.drop().show(5)

+------+--------------+-------------------+-------------------+--------------+-----------+---------------+---------+-------------+---------------+----------+-----------+------+---------------+---------------+------------+--------------------+------------------+--------+--------+-------------------+
|Row ID|      Order ID|         Order Date|          Ship Date|     Ship Mode|Customer ID|  Customer Name|  Segment|      Country|           City|     State|Postal Code|Region|     Product ID|       Category|Sub-Category|        Product Name|             Sales|Quantity|Discount|             Profit|
+------+--------------+-------------------+-------------------+--------------+-----------+---------------+---------+-------------+---------------+----------+-----------+------+---------------+---------------+------------+--------------------+------------------+--------+--------+-------------------+
|     1|CA-2017-152156|2017-11-08 00:00:00|2017-11-11 00:00:00|  Second Class|   CG-12520|    Claire

In [48]:
superstore.na.drop("all").show(5)

+------+--------------+-------------------+-------------------+--------------+-----------+---------------+---------+-------------+---------------+----------+-----------+------+---------------+---------------+------------+--------------------+------------------+--------+--------+-------------------+
|Row ID|      Order ID|         Order Date|          Ship Date|     Ship Mode|Customer ID|  Customer Name|  Segment|      Country|           City|     State|Postal Code|Region|     Product ID|       Category|Sub-Category|        Product Name|             Sales|Quantity|Discount|             Profit|
+------+--------------+-------------------+-------------------+--------------+-----------+---------------+---------+-------------+---------------+----------+-----------+------+---------------+---------------+------------+--------------------+------------------+--------+--------+-------------------+
|     1|CA-2017-152156|2017-11-08 00:00:00|2017-11-11 00:00:00|  Second Class|   CG-12520|    Claire

In [49]:
superstore.na.drop("all").show(5)

+------+--------------+-------------------+-------------------+--------------+-----------+---------------+---------+-------------+---------------+----------+-----------+------+---------------+---------------+------------+--------------------+------------------+--------+--------+-------------------+
|Row ID|      Order ID|         Order Date|          Ship Date|     Ship Mode|Customer ID|  Customer Name|  Segment|      Country|           City|     State|Postal Code|Region|     Product ID|       Category|Sub-Category|        Product Name|             Sales|Quantity|Discount|             Profit|
+------+--------------+-------------------+-------------------+--------------+-----------+---------------+---------+-------------+---------------+----------+-----------+------+---------------+---------------+------------+--------------------+------------------+--------+--------+-------------------+
|     1|CA-2017-152156|2017-11-08 00:00:00|2017-11-11 00:00:00|  Second Class|   CG-12520|    Claire

In [50]:
superstore.na.drop("all", subset = ["Order Id", "Order Date"]).show(5)

+------+--------------+-------------------+-------------------+--------------+-----------+---------------+---------+-------------+---------------+----------+-----------+------+---------------+---------------+------------+--------------------+------------------+--------+--------+-------------------+
|Row ID|      Order ID|         Order Date|          Ship Date|     Ship Mode|Customer ID|  Customer Name|  Segment|      Country|           City|     State|Postal Code|Region|     Product ID|       Category|Sub-Category|        Product Name|             Sales|Quantity|Discount|             Profit|
+------+--------------+-------------------+-------------------+--------------+-----------+---------------+---------+-------------+---------------+----------+-----------+------+---------------+---------------+------------+--------------------+------------------+--------+--------+-------------------+
|     1|CA-2017-152156|2017-11-08 00:00:00|2017-11-11 00:00:00|  Second Class|   CG-12520|    Claire

In [51]:
superstore.na.fill("All null values become this string").show(5)

+------+--------------+-------------------+-------------------+--------------+-----------+---------------+---------+-------------+---------------+----------+-----------+------+---------------+---------------+------------+--------------------+------------------+--------+--------+-------------------+
|Row ID|      Order ID|         Order Date|          Ship Date|     Ship Mode|Customer ID|  Customer Name|  Segment|      Country|           City|     State|Postal Code|Region|     Product ID|       Category|Sub-Category|        Product Name|             Sales|Quantity|Discount|             Profit|
+------+--------------+-------------------+-------------------+--------------+-----------+---------------+---------+-------------+---------------+----------+-----------+------+---------------+---------------+------------+--------------------+------------------+--------+--------+-------------------+
|     1|CA-2017-152156|2017-11-08 00:00:00|2017-11-11 00:00:00|  Second Class|   CG-12520|    Claire

In [52]:
superstore.na.replace([" "], ["UNKNOWN"], "Description").show(5)

+------+--------------+-------------------+-------------------+--------------+-----------+---------------+---------+-------------+---------------+----------+-----------+------+---------------+---------------+------------+--------------------+------------------+--------+--------+-------------------+
|Row ID|      Order ID|         Order Date|          Ship Date|     Ship Mode|Customer ID|  Customer Name|  Segment|      Country|           City|     State|Postal Code|Region|     Product ID|       Category|Sub-Category|        Product Name|             Sales|Quantity|Discount|             Profit|
+------+--------------+-------------------+-------------------+--------------+-----------+---------------+---------+-------------+---------------+----------+-----------+------+---------------+---------------+------------+--------------------+------------------+--------+--------+-------------------+
|     1|CA-2017-152156|2017-11-08 00:00:00|2017-11-11 00:00:00|  Second Class|   CG-12520|    Claire

In [53]:
#struct funtion is used to create a complex column by combining multiple columns so that they can be later queried

from pyspark.sql.functions import struct

df_test = superstore.select(struct("Row ID", "Order ID").alias("complex"))

In [54]:
df_test.select("complex.Order ID").show(5)

+--------------+
|      Order ID|
+--------------+
|CA-2017-152156|
|CA-2017-152156|
|CA-2017-138688|
|US-2016-108966|
|US-2016-108966|
+--------------+
only showing top 5 rows



In [55]:
#split function is used to split rows of a column into arrays

from pyspark.sql.functions import split

superstore.select(split(col("Customer Name"), " ").alias("First_and_Last_Names"))\
.selectExpr("First_and_Last_Names[1]")\
.show(5)

+-----------------------+
|First_and_Last_Names[1]|
+-----------------------+
|                   Gute|
|                   Gute|
|                    Van|
|              O'Donnell|
|              O'Donnell|
+-----------------------+
only showing top 5 rows



In [56]:
#size function can be used to find the size of the array

from pyspark.sql.functions import size, array_contains

superstore.select(size(split(col("Customer Name"), " ")).alias("name_split")).show(5)

+----------+
|name_split|
+----------+
|         2|
|         2|
|         3|
|         2|
|         2|
+----------+
only showing top 5 rows



In [57]:
#array_contains can be used to check whether the array contains a given value

superstore.select(array_contains(split(col("Customer Name"), " "), "Hoffman").alias("is_hoffman")).show(5)

+----------+
|is_hoffman|
+----------+
|     false|
|     false|
|     false|
|     false|
|     false|
+----------+
only showing top 5 rows



In [58]:
#explode function can be used to create new rows from the indicidual values of an array

from pyspark.sql.functions import explode

superstore.withColumn("splitted", split(col("Customer Name"), " "))\
.withColumn("exploded", explode(col("splitted")))\
.select("Customer Name","splitted", "exploded").show(5)

+---------------+-------------------+--------+
|  Customer Name|           splitted|exploded|
+---------------+-------------------+--------+
|    Claire Gute|     [Claire, Gute]|  Claire|
|    Claire Gute|     [Claire, Gute]|    Gute|
|    Claire Gute|     [Claire, Gute]|  Claire|
|    Claire Gute|     [Claire, Gute]|    Gute|
|Darrin Van Huff|[Darrin, Van, Huff]|  Darrin|
+---------------+-------------------+--------+
only showing top 5 rows



In [59]:
#map function can be used to create key value pairs of columns

from pyspark.sql.functions import create_map

superstore.select(create_map(col("Customer Name"), col("Order ID")).alias("mapped")).show(5, False)

+-----------------------------------+
|mapped                             |
+-----------------------------------+
|[Claire Gute -> CA-2017-152156]    |
|[Claire Gute -> CA-2017-152156]    |
|[Darrin Van Huff -> CA-2017-138688]|
|[Sean O'Donnell -> US-2016-108966] |
|[Sean O'Donnell -> US-2016-108966] |
+-----------------------------------+
only showing top 5 rows



In [60]:
#maps can be queried

superstore.select(create_map(col("Customer Name"), col("Order ID")).alias("mapped"))\
.selectExpr("mapped['Claire Gute']").show(5)

+-------------------+
|mapped[Claire Gute]|
+-------------------+
|     CA-2017-152156|
|     CA-2017-152156|
|               null|
|               null|
|               null|
+-------------------+
only showing top 5 rows



In [61]:
#handling jason data

jsondf = spark.range(1).selectExpr(""" '{"myJsonKey": 
                                                {"myJsonValues": [1, 2, 3]}}' 
                                                    as jsonString """)

In [62]:
from pyspark.sql.functions import get_json_object, json_tuple

jsondf.select(
    get_json_object(col("jsonString"), "$.myJsonKey.myJsonValues[0]").alias("column"), 
    json_tuple(col("jsonString"), "myJsonKey").alias("jsonKey")).show(10, False)

+------+------------------------+
|column|jsonKey                 |
+------+------------------------+
|1     |{"myJsonValues":[1,2,3]}|
+------+------------------------+



In [63]:
from pyspark.sql.functions import to_json

superstore.selectExpr("(`Order ID`, `Customer Name`) as myStruct")\
.select(to_json(col("myStruct"))).show(5, False)

+---------------------------------------------------------------+
|structstojson(myStruct)                                        |
+---------------------------------------------------------------+
|{"Order ID":"CA-2017-152156","Customer Name":"Claire Gute"}    |
|{"Order ID":"CA-2017-152156","Customer Name":"Claire Gute"}    |
|{"Order ID":"CA-2017-138688","Customer Name":"Darrin Van Huff"}|
|{"Order ID":"US-2016-108966","Customer Name":"Sean O'Donnell"} |
|{"Order ID":"US-2016-108966","Customer Name":"Sean O'Donnell"} |
+---------------------------------------------------------------+
only showing top 5 rows

