In [3]:
filesPath = "/home/anil/Anil_Documents/01.02.BID_DATA/PySPark/Spark-The-Definitive-Guide-master/data"

In [4]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [5]:
df_sales = spark.read.format("csv")\
.option("header","true")\
.option("schema","true")\
.load(filesPath + "/retail-data/all/online-retail-dataset.csv")



In [6]:
spark.conf.set("spark.sql.shuffle.partitions",30)

In [7]:
print(df_sales.InvoiceNo)

Column<b'InvoiceNo'>


In [8]:
df_sales.where(expr("InvoiceNo = '554084' AND StockCode = '23298' ")).show(truncate=False)

+---------+---------+-----------------+--------+---------------+---------+----------+--------------+
|InvoiceNo|StockCode|Description      |Quantity|InvoiceDate    |UnitPrice|CustomerID|Country       |
+---------+---------+-----------------+--------+---------------+---------+----------+--------------+
|554084   |23298    |BUNTING , SPOTTY |3       |5/22/2011 11:52|4.95     |12909     |United Kingdom|
|554084   |23298    |SPOTTY BUNTING   |3       |5/22/2011 11:52|4.95     |12909     |United Kingdom|
+---------+---------+-----------------+--------+---------------+---------+----------+--------------+



In [7]:
df_sales_agg = df_sales.groupBy("InvoiceNo","StockCode","Description","CustomerID","Country")\
.agg(sum("Quantity").alias("Sum_Quantity"), avg("UnitPrice").alias("Avg_UnitPrice"),max("InvoiceDate").alias("Max_InvDate"))

In [8]:
df_sales_agg.groupBy("InvoiceNo","StockCode").count()\
.where("count > 1")\
.show()

+---------+---------+-----+
|InvoiceNo|StockCode|count|
+---------+---------+-----+
|   575335|    23203|    2|
|   554084|    23298|    2|
+---------+---------+-----+



In [9]:
df_sales_agg.count()

531227

In [10]:
df_sales_agg.explain()

== Physical Plan ==
SortAggregate(key=[InvoiceNo#10, StockCode#11, Description#12, CustomerID#16, Country#17], functions=[sum(cast(Quantity#13 as double)), avg(cast(UnitPrice#15 as double)), max(InvoiceDate#14)])
+- *(2) Sort [InvoiceNo#10 ASC NULLS FIRST, StockCode#11 ASC NULLS FIRST, Description#12 ASC NULLS FIRST, CustomerID#16 ASC NULLS FIRST, Country#17 ASC NULLS FIRST], false, 0
   +- Exchange hashpartitioning(InvoiceNo#10, StockCode#11, Description#12, CustomerID#16, Country#17, 30)
      +- SortAggregate(key=[InvoiceNo#10, StockCode#11, Description#12, CustomerID#16, Country#17], functions=[partial_sum(cast(Quantity#13 as double)), partial_avg(cast(UnitPrice#15 as double)), partial_max(InvoiceDate#14)])
         +- *(1) Sort [InvoiceNo#10 ASC NULLS FIRST, StockCode#11 ASC NULLS FIRST, Description#12 ASC NULLS FIRST, CustomerID#16 ASC NULLS FIRST, Country#17 ASC NULLS FIRST], false, 0
            +- *(1) FileScan csv [InvoiceNo#10,StockCode#11,Description#12,Quantity#13,InvoiceD

In [11]:
df_sales_agg.rdd.getNumPartitions()

30

In [12]:
df_sales.rdd.getNumPartitions()

2

In [13]:
df_sales.where(expr("InvoiceNo = '554084' AND StockCode = '23298' ")).show(truncate=False)

+---------+---------+-----------------+--------+---------------+---------+----------+--------------+
|InvoiceNo|StockCode|Description      |Quantity|InvoiceDate    |UnitPrice|CustomerID|Country       |
+---------+---------+-----------------+--------+---------------+---------+----------+--------------+
|554084   |23298    |BUNTING , SPOTTY |3       |5/22/2011 11:52|4.95     |12909     |United Kingdom|
|554084   |23298    |SPOTTY BUNTING   |3       |5/22/2011 11:52|4.95     |12909     |United Kingdom|
+---------+---------+-----------------+--------+---------------+---------+----------+--------------+



In [14]:
df_sales.where(df_sales.InvoiceNo == '554084').show()

+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|    InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|   554084|    23102|SILVER HEARTS TAB...|      12|5/22/2011 11:52|     0.83|     12909|United Kingdom|
|   554084|    23298|   BUNTING , SPOTTY |       3|5/22/2011 11:52|     4.95|     12909|United Kingdom|
|   554084|    23254|KIDS CUTLERY DOLL...|       4|5/22/2011 11:52|     4.15|     12909|United Kingdom|
|   554084|    22470|HEART OF WICKER L...|       6|5/22/2011 11:52|     2.95|     12909|United Kingdom|
|   554084|    23191|BUNDLE OF 3 RETRO...|      12|5/22/2011 11:52|     1.65|     12909|United Kingdom|
|   554084|    23204|CHARLOTTE BAG APP...|      10|5/22/2011 11:52|     0.85|     12909|United Kingdom|
|   554084|    22469|HEART OF WICKER S...|      12|5/22/2011 11:

In [15]:
df_sales.groupBy(df_sales.StockCode,"Country").agg({"Quantity":"sum","UnitPrice":"max"}).show()

+---------+--------------+--------------+-------------+
|StockCode|       Country|max(UnitPrice)|sum(Quantity)|
+---------+--------------+--------------+-------------+
|    10133|        Canada|          0.85|         40.0|
|    10135|          EIRE|          0.42|         60.0|
|   15044A|United Kingdom|          5.79|        427.0|
|  15056BL|       Finland|          5.95|          3.0|
|   15056N|        France|          5.95|         51.0|
|   15058A|        Norway|          7.95|          6.0|
|   15058B|       Germany|          7.95|          2.0|
|   16156S|       Belgium|          0.42|         25.0|
|    16225|United Kingdom|          3.36|        785.0|
|    16236|       Germany|          0.21|         56.0|
|   16258A|       Germany|          0.42|          4.0|
|    17003|       Germany|          0.39|        180.0|
|   17084R|United Kingdom|          0.21|       2592.0|
|    20654|        Cyprus|          1.25|         12.0|
|    20658|        Cyprus|          1.25|       

In [16]:
df_sales.agg(sum("Quantity")).show()

+-------------+
|sum(Quantity)|
+-------------+
|    5176450.0|
+-------------+



In [17]:
df = spark.createDataFrame([("a", 1), ("b", 2), ("c",  3)], ["Col1", "Col2"])

In [18]:
df.select(df.colRegex("`(Col2)?+.+`")).show()

+----+
|Col1|
+----+
|   a|
|   b|
|   c|
+----+



In [19]:
df.columns

['Col1', 'Col2']

In [11]:
df_sales.createOrReplaceTempView("df_sales")

In [12]:
spark.sql("show databases").show()

+------------+
|databaseName|
+------------+
|   anil_test|
|     default|
|     some_db|
+------------+



In [13]:
spark.sql("show tables").show()

+--------+-------------------+-----------+
|database|          tableName|isTemporary|
+--------+-------------------+-----------+
| default|      bucketedfiles|      false|
| default| flight_from_select|      false|
| default|            flights|      false|
| default|       hive_flights|      false|
| default|     hive_flights_2|      false|
| default|      just_usa_view|      false|
| default|          list_view|      false|
| default|        nested_data|      false|
| default|partitioned_flights|      false|
|        |           df_sales|       true|
+--------+-------------------+-----------+



In [14]:
#spark.catalog.dropGlobalTempView("dfSales")

spark.sql("Select * from df_sales").show(4)

+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|12/1/2010 8:26|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|12/1/2010 8:26|     2.75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
only showing top 4 rows



# Person data

In [9]:
person = spark.createDataFrame([
(0, "Bill Chambers", 0, [100]),
(1, "Matei Zaharia", 1, [500, 250, 100]),
(2, "Michael Armbrust", 1, [250, 100])])\
.toDF("id", "name", "graduate_program", "spark_status")


In [10]:
graduateProgram = spark.createDataFrame([
(0, "Masters", "School of Information", "UC Berkeley"),
(2, "Masters", "EECS", "UC Berkeley"),
(1, "Ph.D.", "EECS", "UC Berkeley")])\
.toDF("graduate_program", "degree", "department", "school")


In [11]:
sparkStatus = spark.createDataFrame([
(500, "Vice President"),
(250, "PMC Member"),
(100, "Contributor")])\
.toDF("id", "status")

In [12]:
person.join(graduateProgram, person.graduate_program == graduateProgram.graduate_program).show()

+---+----------------+----------------+---------------+----------------+-------+--------------------+-----------+
| id|            name|graduate_program|   spark_status|graduate_program| degree|          department|     school|
+---+----------------+----------------+---------------+----------------+-------+--------------------+-----------+
|  0|   Bill Chambers|               0|          [100]|               0|Masters|School of Informa...|UC Berkeley|
|  1|   Matei Zaharia|               1|[500, 250, 100]|               1|  Ph.D.|                EECS|UC Berkeley|
|  2|Michael Armbrust|               1|     [250, 100]|               1|  Ph.D.|                EECS|UC Berkeley|
+---+----------------+----------------+---------------+----------------+-------+--------------------+-----------+



In [13]:
#or
person.join(graduateProgram, person["graduate_program"] == graduateProgram["graduate_program"]).show()

+---+----------------+----------------+---------------+----------------+-------+--------------------+-----------+
| id|            name|graduate_program|   spark_status|graduate_program| degree|          department|     school|
+---+----------------+----------------+---------------+----------------+-------+--------------------+-----------+
|  0|   Bill Chambers|               0|          [100]|               0|Masters|School of Informa...|UC Berkeley|
|  1|   Matei Zaharia|               1|[500, 250, 100]|               1|  Ph.D.|                EECS|UC Berkeley|
|  2|Michael Armbrust|               1|     [250, 100]|               1|  Ph.D.|                EECS|UC Berkeley|
+---+----------------+----------------+---------------+----------------+-------+--------------------+-----------+



In [14]:
person.crossJoin(graduateProgram.select("department")).show()

+---+----------------+----------------+---------------+--------------------+
| id|            name|graduate_program|   spark_status|          department|
+---+----------------+----------------+---------------+--------------------+
|  0|   Bill Chambers|               0|          [100]|School of Informa...|
|  0|   Bill Chambers|               0|          [100]|                EECS|
|  0|   Bill Chambers|               0|          [100]|                EECS|
|  1|   Matei Zaharia|               1|[500, 250, 100]|School of Informa...|
|  2|Michael Armbrust|               1|     [250, 100]|School of Informa...|
|  1|   Matei Zaharia|               1|[500, 250, 100]|                EECS|
|  1|   Matei Zaharia|               1|[500, 250, 100]|                EECS|
|  2|Michael Armbrust|               1|     [250, 100]|                EECS|
|  2|Michael Armbrust|               1|     [250, 100]|                EECS|
+---+----------------+----------------+---------------+--------------------+

In [45]:
 person.join(graduateProgram.select("department","graduate_program"), person["graduate_program"] == graduateProgram["graduate_program"]).show()

+---+----------------+----------------+---------------+--------------------+----------------+
| id|            name|graduate_program|   spark_status|          department|graduate_program|
+---+----------------+----------------+---------------+--------------------+----------------+
|  0|   Bill Chambers|               0|          [100]|School of Informa...|               0|
|  1|   Matei Zaharia|               1|[500, 250, 100]|                EECS|               1|
|  2|Michael Armbrust|               1|     [250, 100]|                EECS|               1|
+---+----------------+----------------+---------------+--------------------+----------------+



In [53]:
df_sales.select("CustomerID","Country")\
.crosstab("CustomerID","Country").show()

+------------------+---------+-------+-------+-------+------+------+---------------+------+--------------+-------+----+------------------+-------+------+-------+------+---------+-------+------+-----+-----+-------+---------+-----+-----------+------+------+--------+---+------------+---------+-----+------+-----------+---+--------------------+--------------+-----------+
|CustomerID_Country|Australia|Austria|Bahrain|Belgium|Brazil|Canada|Channel Islands|Cyprus|Czech Republic|Denmark|EIRE|European Community|Finland|France|Germany|Greece|Hong Kong|Iceland|Israel|Italy|Japan|Lebanon|Lithuania|Malta|Netherlands|Norway|Poland|Portugal|RSA|Saudi Arabia|Singapore|Spain|Sweden|Switzerland|USA|United Arab Emirates|United Kingdom|Unspecified|
+------------------+---------+-------+-------+-------+------+------+---------------+------+--------------+-------+----+------------------+-------+------+-------+------+---------+-------+------+-----+-----+-------+---------+-----+-----------+------+------+-------

In [59]:
df_sales.cube("CustomerID","Country").agg(sum("Quantity")).orderBy(desc_nulls_last("Country")).show(20)

+----------+--------------+-------------+
|CustomerID|       Country|sum(Quantity)|
+----------+--------------+-------------+
|     12363|   Unspecified|        408.0|
|     14265|   Unspecified|        330.0|
|     12743|   Unspecified|        319.0|
|     16320|   Unspecified|        732.0|
|      null|   Unspecified|       1511.0|
|      null|   Unspecified|       3300.0|
|     14798|United Kingdom|        150.0|
|     13158|United Kingdom|        784.0|
|     14779|United Kingdom|         84.0|
|     13004|United Kingdom|       4796.0|
|     16618|United Kingdom|       1362.0|
|     16904|United Kingdom|       2034.0|
|     17094|United Kingdom|        148.0|
|     14048|United Kingdom|        274.0|
|     17262|United Kingdom|        367.0|
|     18277|United Kingdom|         67.0|
|     17978|United Kingdom|        499.0|
|     14659|United Kingdom|       1183.0|
|     18016|United Kingdom|       1029.0|
|     13939|United Kingdom|        843.0|
+----------+--------------+-------

In [62]:
df_sales.describe("Quantity").show()

+-------+------------------+
|summary|          Quantity|
+-------+------------------+
|  count|            541909|
|   mean|  9.55224954743324|
| stddev|218.08115785023486|
|    min|                -1|
|    max|               992|
+-------+------------------+



In [16]:
person.distinct().show()

+---+----------------+----------------+---------------+
| id|            name|graduate_program|   spark_status|
+---+----------------+----------------+---------------+
|  2|Michael Armbrust|               1|     [250, 100]|
|  1|   Matei Zaharia|               1|[500, 250, 100]|
|  0|   Bill Chambers|               0|          [100]|
+---+----------------+----------------+---------------+



In [19]:
person.join(graduateProgram, person.graduate_program == graduateProgram.graduate_program)\
.drop(person.id)\
.show()

+----------------+----------------+---------------+----------------+-------+--------------------+-----------+
|            name|graduate_program|   spark_status|graduate_program| degree|          department|     school|
+----------------+----------------+---------------+----------------+-------+--------------------+-----------+
|   Bill Chambers|               0|          [100]|               0|Masters|School of Informa...|UC Berkeley|
|   Matei Zaharia|               1|[500, 250, 100]|               1|  Ph.D.|                EECS|UC Berkeley|
|Michael Armbrust|               1|     [250, 100]|               1|  Ph.D.|                EECS|UC Berkeley|
+----------------+----------------+---------------+----------------+-------+--------------------+-----------+



In [24]:
graduateProgram\
.drop_duplicates(["degree","school"])\
.show()

+----------------+-------+--------------------+-----------+
|graduate_program| degree|          department|     school|
+----------------+-------+--------------------+-----------+
|               1|  Ph.D.|                EECS|UC Berkeley|
|               0|Masters|School of Informa...|UC Berkeley|
+----------------+-------+--------------------+-----------+



In [26]:
df_sales.dropna().count()

406829

In [27]:
df_sales.count()

541909

In [29]:
df_sales.dropna("any").count()

406829

In [30]:
df_sales.dropna("all").count()

541909

In [37]:
df_sales.dropna(subset=['CustomerID']).count()

406829

In [38]:
df_sales.dtypes

[('InvoiceNo', 'string'),
 ('StockCode', 'string'),
 ('Description', 'string'),
 ('Quantity', 'string'),
 ('InvoiceDate', 'string'),
 ('UnitPrice', 'string'),
 ('CustomerID', 'string'),
 ('Country', 'string')]

In [39]:
df1 = spark.createDataFrame([("a", 1), ("a", 1), ("a", 1), ("a", 2), ("b",  3), ("c", 4)], ["C1", "C2"])
df2 = spark.createDataFrame([("a", 1), ("b", 3)], ["C1", "C2"])

In [41]:
df1.exceptAll(df2).show()

+---+---+
| C1| C2|
+---+---+
|  a|  1|
|  a|  1|
|  c|  4|
|  a|  2|
+---+---+



In [47]:
df_sales.where(isnull("CustomerID"))\
.fillna('NA')\
.show(10)

+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|    InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|   536414|    22139|                  NA|      56|12/1/2010 11:52|        0|        NA|United Kingdom|
|   536544|    21773|DECORATIVE ROSE B...|       1|12/1/2010 14:32|     2.51|        NA|United Kingdom|
|   536544|    21774|DECORATIVE CATS B...|       2|12/1/2010 14:32|     2.51|        NA|United Kingdom|
|   536544|    21786|  POLKADOT RAIN HAT |       4|12/1/2010 14:32|     0.85|        NA|United Kingdom|
|   536544|    21787|RAIN PONCHO RETRO...|       2|12/1/2010 14:32|     1.66|        NA|United Kingdom|
|   536544|    21790|  VINTAGE SNAP CARDS|       9|12/1/2010 14:32|     1.66|        NA|United Kingdom|
|   536544|    21791|VINTAGE HEADS AND...|       2|12/1/2010 14:

In [49]:
df_sales.where(isnull("CustomerID"))\
.fillna({'CustomerID' : 'NotAVailable'})\
.show(10)

+---------+---------+--------------------+--------+---------------+---------+------------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|    InvoiceDate|UnitPrice|  CustomerID|       Country|
+---------+---------+--------------------+--------+---------------+---------+------------+--------------+
|   536414|    22139|                null|      56|12/1/2010 11:52|        0|NotAVailable|United Kingdom|
|   536544|    21773|DECORATIVE ROSE B...|       1|12/1/2010 14:32|     2.51|NotAVailable|United Kingdom|
|   536544|    21774|DECORATIVE CATS B...|       2|12/1/2010 14:32|     2.51|NotAVailable|United Kingdom|
|   536544|    21786|  POLKADOT RAIN HAT |       4|12/1/2010 14:32|     0.85|NotAVailable|United Kingdom|
|   536544|    21787|RAIN PONCHO RETRO...|       2|12/1/2010 14:32|     1.66|NotAVailable|United Kingdom|
|   536544|    21790|  VINTAGE SNAP CARDS|       9|12/1/2010 14:32|     1.66|NotAVailable|United Kingdom|
|   536544|    21791|VINTAGE HEADS AND...|    

In [52]:
df_sales.filter(df_sales.Quantity > 500).show()

+---------+---------+--------------------+--------+----------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|     InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+----------------+---------+----------+--------------+
|   536437|    17021|NAMASTE SWAGAT IN...|     600| 12/1/2010 12:12|     0.24|     13694|United Kingdom|
|   536736|    22616|PACK OF 12 LONDON...|     600| 12/2/2010 12:59|     0.29|     17381|United Kingdom|
|   536809|    84950|ASSORTED COLOUR T...|    1824| 12/2/2010 16:48|     0.55|     15299|United Kingdom|
|   536830|    84077|WORLD WAR 2 GLIDE...|    2880| 12/2/2010 17:38|     0.18|     16754|United Kingdom|
|   536830|    21915|RED  HARMONICA IN...|    1400| 12/2/2010 17:38|     1.06|     16754|United Kingdom|
|   536890|   17084R|ASSORTED INCENSE ...|    1440| 12/3/2010 11:48|     0.16|     14156|          EIRE|
|   537214|    17003| BROCADE RING PURSE |     720| 12/

In [54]:
df_sales.first()

Row(InvoiceNo='536365', StockCode='85123A', Description='WHITE HANGING HEART T-LIGHT HOLDER', Quantity='6', InvoiceDate='12/1/2010 8:26', UnitPrice='2.55', CustomerID='17850', Country='United Kingdom')

In [57]:
df_sales.orderBy(desc_nulls_last("Quantity")).first()

Row(InvoiceNo='574293', StockCode='85123A', Description='WHITE HANGING HEART T-LIGHT HOLDER', Quantity='992', InvoiceDate='11/3/2011 15:32', UnitPrice='3.2', CustomerID='17450', Country='United Kingdom')

In [59]:
def f(person):
    print(person.name)

In [60]:
person.foreach(f)

In [61]:
def f(people):
    for person in people:
        print(person.name)

In [62]:
person.foreachPartition(f)

In [77]:
df_sales.freqItems(['Country','StockCode'],0.01).show(truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Country_freqItems                                                                                                                                                                                                                                                                                                                                               

In [82]:
df_sales.groupBy("StockCode").agg({'Quantity':'sum','UnitPrice':'max'}).show()

+---------+--------------+-------------+
|StockCode|max(UnitPrice)|sum(Quantity)|
+---------+--------------+-------------+
|    16016|          0.85|       1273.0|
|   16162L|          0.65|         -4.0|
|   18094C|          2.51|        125.0|
|   18097C|          5.06|        407.0|
|    20617|          4.21|         93.0|
|    20661|          5.79|         44.0|
|    20664|          2.95|         41.0|
|    20671|          1.25|        -12.0|
|    20821|          4.95|         14.0|
|    20826|          2.12|        107.0|
|    20839|          9.32|        495.0|
|    20901|          6.35|         18.0|
|    20903|          6.35|         49.0|
|    20966|          2.51|        526.0|
|    20975|          1.28|       6316.0|
|    21027|          2.51|        153.0|
|    21042|          5.95|        329.0|
|    21112|          5.91|       1359.0|
|    21215|          2.13|       1089.0|
|    21219|          2.51|        448.0|
+---------+--------------+-------------+
only showing top

In [83]:
df_sales.head(10)

[Row(InvoiceNo='536365', StockCode='85123A', Description='WHITE HANGING HEART T-LIGHT HOLDER', Quantity='6', InvoiceDate='12/1/2010 8:26', UnitPrice='2.55', CustomerID='17850', Country='United Kingdom'),
 Row(InvoiceNo='536365', StockCode='71053', Description='WHITE METAL LANTERN', Quantity='6', InvoiceDate='12/1/2010 8:26', UnitPrice='3.39', CustomerID='17850', Country='United Kingdom'),
 Row(InvoiceNo='536365', StockCode='84406B', Description='CREAM CUPID HEARTS COAT HANGER', Quantity='8', InvoiceDate='12/1/2010 8:26', UnitPrice='2.75', CustomerID='17850', Country='United Kingdom'),
 Row(InvoiceNo='536365', StockCode='84029G', Description='KNITTED UNION FLAG HOT WATER BOTTLE', Quantity='6', InvoiceDate='12/1/2010 8:26', UnitPrice='3.39', CustomerID='17850', Country='United Kingdom'),
 Row(InvoiceNo='536365', StockCode='84029E', Description='RED WOOLLY HOTTIE WHITE HEART.', Quantity='6', InvoiceDate='12/1/2010 8:26', UnitPrice='3.39', CustomerID='17850', Country='United Kingdom'),
 Ro

In [88]:
person.crossJoin(graduateProgram.hint("Broadcast")).explain()

== Physical Plan ==
BroadcastNestedLoopJoin BuildRight, Cross
:- *(1) Project [_1#59L AS id#67L, _2#60 AS name#68, _3#61L AS graduate_program#69L, _4#62 AS spark_status#70]
:  +- Scan ExistingRDD[_1#59L,_2#60,_3#61L,_4#62]
+- BroadcastExchange IdentityBroadcastMode
   +- *(2) Project [_1#75L AS graduate_program#83L, _2#76 AS degree#84, _3#77 AS department#85, _4#78 AS school#86]
      +- Scan ExistingRDD[_1#75L,_2#76,_3#77,_4#78]


In [89]:
df1 = spark.createDataFrame([("a", 1), ("a", 1), ("a", 1), ("a", 2), ("b",  3), ("c", 4)], ["C1", "C2"])
df2 = spark.createDataFrame([("a", 1), ("b", 3)], ["C1", "C2"])

In [91]:
df1.intersect(df2).show()

+---+---+
| C1| C2|
+---+---+
|  b|  3|
|  a|  1|
+---+---+



In [92]:
df1.intersectAll(df2).show()

+---+---+
| C1| C2|
+---+---+
|  b|  3|
|  a|  1|
+---+---+



In [273]:
df1.subtract(df2).show()

+---+---+
| C1| C2|
+---+---+
|  c|  4|
|  a|  2|
+---+---+



In [287]:
df1.unionAll(df2).show()

+---+---+
| C1| C2|
+---+---+
|  a|  1|
|  a|  1|
|  a|  1|
|  a|  2|
|  b|  3|
|  c|  4|
|  a|  1|
|  b|  3|
+---+---+



In [288]:
df1.union(df2).show()

+---+---+
| C1| C2|
+---+---+
|  a|  1|
|  a|  1|
|  a|  1|
|  a|  2|
|  b|  3|
|  c|  4|
|  a|  1|
|  b|  3|
+---+---+



In [289]:
df1 = spark.createDataFrame([[1, 2, 3]], ["col0", "col1", "col2"])
df2 = spark.createDataFrame([[4, 5, 6]], ["col1", "col2", "col0"])
df1.unionByName(df2).show()

#Column position is different in both dataframes

+----+----+----+
|col0|col1|col2|
+----+----+----+
|   1|   2|   3|
|   6|   4|   5|
+----+----+----+



In [95]:
person.isLocal()

False

In [122]:
df_sales.select("InvoiceDate").show()

+--------------+
|   InvoiceDate|
+--------------+
|12/1/2010 8:26|
|12/1/2010 8:26|
|12/1/2010 8:26|
|12/1/2010 8:26|
|12/1/2010 8:26|
|12/1/2010 8:26|
|12/1/2010 8:26|
|12/1/2010 8:28|
|12/1/2010 8:28|
|12/1/2010 8:34|
|12/1/2010 8:34|
|12/1/2010 8:34|
|12/1/2010 8:34|
|12/1/2010 8:34|
|12/1/2010 8:34|
|12/1/2010 8:34|
|12/1/2010 8:34|
|12/1/2010 8:34|
|12/1/2010 8:34|
|12/1/2010 8:34|
+--------------+
only showing top 20 rows



In [134]:
df_sales.select(year(to_date("InvoiceDate",'MM/d/yyyy H:mm'))).show()

+----------------------------------------------+
|year(to_date(`InvoiceDate`, 'MM/d/yyyy H:mm'))|
+----------------------------------------------+
|                                          2010|
|                                          2010|
|                                          2010|
|                                          2010|
|                                          2010|
|                                          2010|
|                                          2010|
|                                          2010|
|                                          2010|
|                                          2010|
|                                          2010|
|                                          2010|
|                                          2010|
|                                          2010|
|                                          2010|
|                                          2010|
|                                          2010|
|                   

In [135]:
df_s1 = df_sales.groupBy("Country","StockCode",year(to_date("InvoiceDate",'MM/d/yyyy H:mm')).alias("INV_YEAR")).agg({'Quantity':'sum','UnitPrice':'max'})\
.withColumnRenamed("max(UnitPrice)","Max_UP")\
.withColumnRenamed("sum(Quantity)","Tot_Qty")

In [136]:
df_s2 = df_sales.groupBy("Country","StockCode").agg({'Quantity':'sum','UnitPrice':'max'})\
.withColumnRenamed("max(UnitPrice)","Max_UP")\
.withColumnRenamed("sum(Quantity)","Tot_Qty")

In [140]:
df_s2.count()

19839

In [144]:
join_s1_s2 = [df_s1['Country']==df_s2['Country'],df_s1['StockCode']==df_s2['StockCode'],df_s1['Tot_Qty']< df_s2['Tot_Qty'] ]

In [145]:
df_s1.join(df_s2,join_s1_s2,'INNER').show()

+---------------+---------+--------+------+-------+---------------+---------+------+-------+
|        Country|StockCode|INV_YEAR|Max_UP|Tot_Qty|        Country|StockCode|Max_UP|Tot_Qty|
+---------------+---------+--------+------+-------+---------------+---------+------+-------+
|      Australia|    22193|    2010|   8.5|    2.0|      Australia|    22193|   8.5|   22.0|
|      Australia|    22193|    2011|   8.5|   20.0|      Australia|    22193|   8.5|   22.0|
|      Australia|    22727|    2010|  3.75|    4.0|      Australia|    22727|  3.75|   78.0|
|      Australia|    22727|    2011|  3.75|   74.0|      Australia|    22727|  3.75|   78.0|
|        Belgium|    21122|    2010|  1.25|   24.0|        Belgium|    21122|  1.25|   48.0|
|        Belgium|    21122|    2011|  1.25|   24.0|        Belgium|    21122|  1.25|   48.0|
|        Belgium|    21976|    2011|  0.55|  264.0|        Belgium|    21976|  0.55|  312.0|
|        Belgium|    21976|    2010|  0.55|   48.0|        Belgium|   

In [149]:
df_s1.join(df_s2,['Country','StockCode'],'inner').where(df_s1.Tot_Qty < df_s2.Tot_Qty).show()

+---------------+---------+--------+------+-------+------+-------+
|        Country|StockCode|INV_YEAR|Max_UP|Tot_Qty|Max_UP|Tot_Qty|
+---------------+---------+--------+------+-------+------+-------+
|      Australia|    22193|    2010|   8.5|    2.0|   8.5|   22.0|
|      Australia|    22193|    2011|   8.5|   20.0|   8.5|   22.0|
|      Australia|    22727|    2010|  3.75|    4.0|  3.75|   78.0|
|      Australia|    22727|    2011|  3.75|   74.0|  3.75|   78.0|
|        Belgium|    21122|    2010|  1.25|   24.0|  1.25|   48.0|
|        Belgium|    21122|    2011|  1.25|   24.0|  1.25|   48.0|
|        Belgium|    21976|    2011|  0.55|  264.0|  0.55|  312.0|
|        Belgium|    21976|    2010|  0.55|   48.0|  0.55|  312.0|
|Channel Islands|    22688|    2010|  7.95|    2.0|  8.25|    4.0|
|Channel Islands|    22688|    2011|  8.25|    2.0|  8.25|    4.0|
|        Denmark|    22326|    2010|  2.95|   12.0|  2.95|   54.0|
|        Denmark|    22326|    2011|  2.95|   42.0|  2.95|   5

In [150]:
df_s1.join(df_s2,['Country','StockCode']).where(df_s1.Tot_Qty < df_s2.Tot_Qty).show()

+---------------+---------+--------+------+-------+------+-------+
|        Country|StockCode|INV_YEAR|Max_UP|Tot_Qty|Max_UP|Tot_Qty|
+---------------+---------+--------+------+-------+------+-------+
|      Australia|    22193|    2010|   8.5|    2.0|   8.5|   22.0|
|      Australia|    22193|    2011|   8.5|   20.0|   8.5|   22.0|
|      Australia|    22727|    2010|  3.75|    4.0|  3.75|   78.0|
|      Australia|    22727|    2011|  3.75|   74.0|  3.75|   78.0|
|        Belgium|    21122|    2010|  1.25|   24.0|  1.25|   48.0|
|        Belgium|    21122|    2011|  1.25|   24.0|  1.25|   48.0|
|        Belgium|    21976|    2011|  0.55|  264.0|  0.55|  312.0|
|        Belgium|    21976|    2010|  0.55|   48.0|  0.55|  312.0|
|Channel Islands|    22688|    2010|  7.95|    2.0|  8.25|    4.0|
|Channel Islands|    22688|    2011|  8.25|    2.0|  8.25|    4.0|
|        Denmark|    22326|    2010|  2.95|   12.0|  2.95|   54.0|
|        Denmark|    22326|    2011|  2.95|   42.0|  2.95|   5

In [152]:
df_s1.sort("Country").show()

+---------+---------+--------+------+-------+
|  Country|StockCode|INV_YEAR|Max_UP|Tot_Qty|
+---------+---------+--------+------+-------+
|Australia|    20717|    2011|  1.25|   10.0|
|Australia|    21194|    2011|  0.65|   12.0|
|Australia|    21198|    2011|  1.65|   48.0|
|Australia|    21876|    2011|  1.25|   24.0|
|Australia|    21922|    2011|  7.95|    2.0|
|Australia|    22045|    2011|  0.42|  425.0|
|Australia|    22079|    2011|  1.45|  120.0|
|Australia|    22333|    2011|  1.65|  320.0|
|Australia|    22429|    2011|  4.25|   16.0|
|Australia|    22539|    2011|  0.36|  480.0|
|Australia|    22576|    2011|  0.85|   32.0|
|Australia|    22584|    2011|  2.55|   36.0|
|Australia|    22629|    2011|  1.65|  960.0|
|Australia|    22725|    2011|  3.39|   50.0|
|Australia|    22946|    2011| 12.75|   60.0|
|Australia|    22980|    2011|  1.45|  240.0|
|Australia|    23100|    2011|  1.25|   12.0|
|Australia|    23297|    2011|  1.65|  264.0|
|Australia|    23325|    2011|  1.

In [155]:
df_s1.sort(col("Max_UP").desc()).show()

+--------------+------------+--------+------+-------+
|       Country|   StockCode|INV_YEAR|Max_UP|Tot_Qty|
+--------------+------------+--------+------+-------+
|United Kingdom|       23064|    2011| 99.96|  138.0|
|United Kingdom|       21473|    2011| 99.96|   26.0|
|United Kingdom|         DOT|    2011|   988| 1653.0|
|United Kingdom|        CRUK|    2011|987.14|  -16.0|
|United Kingdom|           S|    2011| 98.18|  -57.0|
|United Kingdom|           M|    2011| 97.35| 3471.0|
|United Kingdom|           D|    2011| 97.14|-1183.0|
|United Kingdom|BANK CHARGES|    2010|966.92|   -1.0|
|United Kingdom|         DOT|    2010|950.99|   54.0|
|United Kingdom|           M|    2010|924.59|  191.0|
|United Kingdom|      90124C|    2011|  9.98|   11.0|
|United Kingdom|      90180B|    2011|  9.98|    5.0|
|United Kingdom|      90012A|    2011|  9.98|   -8.0|
|United Kingdom|       22839|    2011|  9.98|  270.0|
|United Kingdom|       22838|    2011|  9.98|  226.0|
|United Kingdom|      90124A

In [159]:
df_s1.sort(col("Max_UP"),ascending = False).show()

+--------------+------------+--------+------+-------+
|       Country|   StockCode|INV_YEAR|Max_UP|Tot_Qty|
+--------------+------------+--------+------+-------+
|United Kingdom|       23064|    2011| 99.96|  138.0|
|United Kingdom|       21473|    2011| 99.96|   26.0|
|United Kingdom|         DOT|    2011|   988| 1653.0|
|United Kingdom|        CRUK|    2011|987.14|  -16.0|
|United Kingdom|           S|    2011| 98.18|  -57.0|
|United Kingdom|           M|    2011| 97.35| 3471.0|
|United Kingdom|           D|    2011| 97.14|-1183.0|
|United Kingdom|BANK CHARGES|    2010|966.92|   -1.0|
|United Kingdom|         DOT|    2010|950.99|   54.0|
|United Kingdom|           M|    2010|924.59|  191.0|
|United Kingdom|      90124C|    2011|  9.98|   11.0|
|United Kingdom|       22839|    2011|  9.98|  270.0|
|United Kingdom|      90180A|    2011|  9.98|  -16.0|
|United Kingdom|      90180B|    2011|  9.98|    5.0|
|United Kingdom|       22838|    2011|  9.98|  226.0|
|United Kingdom|      90124A

In [160]:
df_s1.sort(desc("Max_UP")).show()

+--------------+------------+--------+------+-------+
|       Country|   StockCode|INV_YEAR|Max_UP|Tot_Qty|
+--------------+------------+--------+------+-------+
|United Kingdom|       23064|    2011| 99.96|  138.0|
|United Kingdom|       21473|    2011| 99.96|   26.0|
|United Kingdom|         DOT|    2011|   988| 1653.0|
|United Kingdom|        CRUK|    2011|987.14|  -16.0|
|United Kingdom|           S|    2011| 98.18|  -57.0|
|United Kingdom|           M|    2011| 97.35| 3471.0|
|United Kingdom|           D|    2011| 97.14|-1183.0|
|United Kingdom|BANK CHARGES|    2010|966.92|   -1.0|
|United Kingdom|         DOT|    2010|950.99|   54.0|
|United Kingdom|           M|    2010|924.59|  191.0|
|United Kingdom|      90124C|    2011|  9.98|   11.0|
|United Kingdom|       22839|    2011|  9.98|  270.0|
|United Kingdom|      90012A|    2011|  9.98|   -8.0|
|United Kingdom|       22838|    2011|  9.98|  226.0|
|United Kingdom|      90180A|    2011|  9.98|  -16.0|
|United Kingdom|      90180B

In [164]:
df_s1.orderBy(["Max_UP", "Country"], ascending=[0, 1]).show()

+--------------+------------+--------+------+-------+
|       Country|   StockCode|INV_YEAR|Max_UP|Tot_Qty|
+--------------+------------+--------+------+-------+
|United Kingdom|       23064|    2011| 99.96|  138.0|
|United Kingdom|       21473|    2011| 99.96|   26.0|
|United Kingdom|         DOT|    2011|   988| 1653.0|
|United Kingdom|        CRUK|    2011|987.14|  -16.0|
|United Kingdom|           S|    2011| 98.18|  -57.0|
|United Kingdom|           M|    2011| 97.35| 3471.0|
|United Kingdom|           D|    2011| 97.14|-1183.0|
|United Kingdom|BANK CHARGES|    2010|966.92|   -1.0|
|United Kingdom|         DOT|    2010|950.99|   54.0|
|United Kingdom|           M|    2010|924.59|  191.0|
|United Kingdom|      90124C|    2011|  9.98|   11.0|
|United Kingdom|      90180B|    2011|  9.98|    5.0|
|United Kingdom|      90012A|    2011|  9.98|   -8.0|
|United Kingdom|       22838|    2011|  9.98|  226.0|
|United Kingdom|       22839|    2011|  9.98|  270.0|
|United Kingdom|      90180A

In [179]:
df_s1.cache().storageLevel

StorageLevel(True, True, False, True, 1)

In [180]:
df_s2.persist().storageLevel

StorageLevel(True, True, False, False, 1)

In [184]:
df_s2.persist(StorageLevel.DISK_ONLY).storageLevel



StorageLevel(True, True, False, False, 1)

In [194]:
from pyspark import *

rdd1 = spark.sparkContext.parallelize([1,2])
rdd1.persist( StorageLevel.OFF_HEAP )
rdd1.getStorageLevel()
print(rdd1.getStorageLevel())

Disk Memory OffHeap Serialized 1x Replicated


In [224]:
df_s1_split = df_s1.randomSplit([0.5,2.0,2.5],seed=20)

In [225]:
df_s1_split[0].count()


2307

In [226]:
df_s1_split[1].count()


9481

In [227]:
df_s1_split[2].count()

12070

In [228]:
df_s2.registerTempTable("df_s2")

In [231]:
df_s2_fromTab = spark.sql("Select * from df_s2")

In [232]:
sorted(df_s2.collect()) == sorted(df_s2_fromTab.collect())

True

In [236]:
df_s1.storageLevel

StorageLevel(True, True, False, True, 1)

In [241]:
df_sales.where(isnull("CustomerID"))\
.na.replace("United Kingdom",None)\
.show(10)

+---------+---------+--------------------+--------+---------------+---------+----------+-------+
|InvoiceNo|StockCode|         Description|Quantity|    InvoiceDate|UnitPrice|CustomerID|Country|
+---------+---------+--------------------+--------+---------------+---------+----------+-------+
|   536414|    22139|                null|      56|12/1/2010 11:52|        0|      null|   null|
|   536544|    21773|DECORATIVE ROSE B...|       1|12/1/2010 14:32|     2.51|      null|   null|
|   536544|    21774|DECORATIVE CATS B...|       2|12/1/2010 14:32|     2.51|      null|   null|
|   536544|    21786|  POLKADOT RAIN HAT |       4|12/1/2010 14:32|     0.85|      null|   null|
|   536544|    21787|RAIN PONCHO RETRO...|       2|12/1/2010 14:32|     1.66|      null|   null|
|   536544|    21790|  VINTAGE SNAP CARDS|       9|12/1/2010 14:32|     1.66|      null|   null|
|   536544|    21791|VINTAGE HEADS AND...|       2|12/1/2010 14:32|     2.51|      null|   null|
|   536544|    21801|CHRISTMAS

In [247]:
df_sales.where(isnull("CustomerID"))\
.replace(["United Kingdom","France"],["UK","FR"],"Country")\
.select("Country").distinct().show()

+-----------+
|    Country|
+-----------+
|       EIRE|
|         FR|
|Unspecified|
|Switzerland|
|   Portugal|
|  Hong Kong|
|         UK|
|     Israel|
|    Bahrain|
+-----------+



In [250]:
df_sales.schema

StructType(List(StructField(InvoiceNo,StringType,true),StructField(StockCode,StringType,true),StructField(Description,StringType,true),StructField(Quantity,StringType,true),StructField(InvoiceDate,StringType,true),StructField(UnitPrice,StringType,true),StructField(CustomerID,StringType,true),StructField(Country,StringType,true)))

In [261]:
df_sales.persist(StorageLevel.DISK_ONLY_2).storageLevel

StorageLevel(True, True, False, False, 2)

In [260]:
df_sales.persist(StorageLevel.MEMORY_AND_DISK_2).storageLevel

StorageLevel(True, True, False, False, 2)

In [259]:
df_sales.unpersist()

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: string, InvoiceDate: string, UnitPrice: string, CustomerID: string, Country: string]

In [267]:
df_sales.cache().storageLevel

StorageLevel(True, True, False, True, 1)

In [268]:
df_sales.storageLevel

StorageLevel(True, True, False, True, 1)

In [269]:
df_sales.unpersist()

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: string, InvoiceDate: string, UnitPrice: string, CustomerID: string, Country: string]

In [275]:
df_s1.summary().show()

+-------+-----------+------------------+-------------------+------------------+------------------+
|summary|    Country|         StockCode|           INV_YEAR|            Max_UP|           Tot_Qty|
+-------+-----------+------------------+-------------------+------------------+------------------+
|  count|      23858|             23858|              23858|             23858|             23858|
|   mean|       null|28092.145126457744| 2010.8084499958086| 5.454532274289545|216.96915080895297|
| stddev|       null|17613.811805484405|0.39352902206015156|104.92960870796857|1089.6305980207896|
|    min|  Australia|             10002|               2010|                 0|          -14468.0|
|    25%|       null|           21908.0|               2011|              1.25|               7.0|
|    50%|       null|           22582.0|               2011|               2.1|              22.0|
|    75%|       null|           23172.0|               2011|              4.25|              73.0|
|    max|U

In [276]:
df_s1.describe().show()

+-------+-----------+------------------+-------------------+------------------+------------------+
|summary|    Country|         StockCode|           INV_YEAR|            Max_UP|           Tot_Qty|
+-------+-----------+------------------+-------------------+------------------+------------------+
|  count|      23858|             23858|              23858|             23858|             23858|
|   mean|       null|28092.145126457744| 2010.8084499958086| 5.454532274289545|216.96915080895297|
| stddev|       null|17613.811805484405|0.39352902206015156|104.92960870796857|1089.6305980207896|
|    min|  Australia|             10002|               2010|                 0|          -14468.0|
|    max|Unspecified|                 m|               2011|             99.96|           50459.0|
+-------+-----------+------------------+-------------------+------------------+------------------+



In [280]:
df_s1.select("Country").summary("count","max")\
.show()

+-------+-----------+
|summary|    Country|
+-------+-----------+
|  count|      23858|
|    max|Unspecified|
+-------+-----------+



In [282]:
df_s1.toJSON().first()

'{"Country":"Australia","StockCode":"20837","INV_YEAR":2011,"Max_UP":"0.83","Tot_Qty":12.0}'

In [284]:
def f_iter(x):
    for i in x:
        print(i)

In [285]:
f_iter( person.toLocalIterator())

Row(id=0, name='Bill Chambers', graduate_program=0, spark_status=[100])
Row(id=1, name='Matei Zaharia', graduate_program=1, spark_status=[500, 250, 100])
Row(id=2, name='Michael Armbrust', graduate_program=1, spark_status=[250, 100])


In [291]:
df_s1.groupBy("Country").agg({"*":'count'}).show()

+--------------------+--------+
|             Country|count(1)|
+--------------------+--------+
|                EIRE|    2235|
|                 USA|     163|
|             Belgium|     841|
|                 RSA|      58|
|               Spain|    1147|
|         Unspecified|     344|
|           Australia|     621|
|               Italy|     483|
|         Netherlands|     843|
|         Switzerland|    1009|
|             Austria|     310|
|              Canada|     147|
|              Cyprus|     526|
|           Lithuania|      29|
|        Saudi Arabia|       9|
|United Arab Emirates|      68|
|     Channel Islands|     436|
|               Japan|     252|
|            Portugal|     773|
|           Singapore|     178|
+--------------------+--------+
only showing top 20 rows



In [295]:
df_s1.groupBy("Country").agg(min(df_s1.Tot_Qty)).show()

+--------------------+------------+
|             Country|min(Tot_Qty)|
+--------------------+------------+
|                EIRE|       -12.0|
|                 USA|         0.0|
|             Belgium|        -3.0|
|                 RSA|         1.0|
|               Spain|        -3.0|
|         Unspecified|         1.0|
|           Australia|        -7.0|
|               Italy|        -9.0|
|         Netherlands|      -480.0|
|         Switzerland|         0.0|
|             Austria|       -48.0|
|              Canada|         1.0|
|              Cyprus|         0.0|
|           Lithuania|         6.0|
|        Saudi Arabia|         1.0|
|United Arab Emirates|         1.0|
|     Channel Islands|         0.0|
|               Japan|      -618.0|
|            Portugal|        -6.0|
|           Singapore|         0.0|
+--------------------+------------+
only showing top 20 rows



In [306]:
df_s1.groupBy().avg("TOT_QTY").show()
df_s1.groupBy().max("TOT_QTY").show()
df_s1.groupBy().min("TOT_QTY").show()

+------------------+
|      avg(TOT_QTY)|
+------------------+
|216.96915080895297|
+------------------+

+------------+
|max(TOT_QTY)|
+------------+
|     50459.0|
+------------+

+------------+
|min(TOT_QTY)|
+------------+
|    -14468.0|
+------------+



In [303]:
df_s1.groupBy("Country").count().show()

+--------------------+-----+
|             Country|count|
+--------------------+-----+
|                EIRE| 2235|
|                 USA|  163|
|             Belgium|  841|
|                 RSA|   58|
|               Spain| 1147|
|         Unspecified|  344|
|           Australia|  621|
|               Italy|  483|
|         Netherlands|  843|
|         Switzerland| 1009|
|             Austria|  310|
|              Canada|  147|
|              Cyprus|  526|
|           Lithuania|   29|
|        Saudi Arabia|    9|
|United Arab Emirates|   68|
|     Channel Islands|  436|
|               Japan|  252|
|            Portugal|  773|
|           Singapore|  178|
+--------------------+-----+
only showing top 20 rows



In [310]:
df_s1.groupBy("StockCode").pivot("Country").sum("TOT_QTY").show(10)

+---------+---------+-------+-------+-------+------+------+---------------+------+--------------+-------+-----+------------------+-------+------+-------+------+---------+-------+------+-----+-----+-------+---------+-----+-----------+------+------+--------+----+------------+---------+-----+------+-----------+----+--------------------+--------------+-----------+
|StockCode|Australia|Austria|Bahrain|Belgium|Brazil|Canada|Channel Islands|Cyprus|Czech Republic|Denmark| EIRE|European Community|Finland|France|Germany|Greece|Hong Kong|Iceland|Israel|Italy|Japan|Lebanon|Lithuania|Malta|Netherlands|Norway|Poland|Portugal| RSA|Saudi Arabia|Singapore|Spain|Sweden|Switzerland| USA|United Arab Emirates|United Kingdom|Unspecified|
+---------+---------+-------+-------+-------+------+------+---------------+------+--------------+-------+-----+------------------+-------+------+-------+------+---------+-------+------+-----+-----+-------+---------+-----+-----------+------+------+--------+----+------------+

In [309]:
df_s1.groupBy("StockCode").pivot("Country",['Australia','France']).sum("TOT_QTY").show(10)

+---------+---------+------+
|StockCode|Australia|France|
+---------+---------+------+
|    21249|     96.0|  54.0|
|    21932|     15.0|  10.0|
|   79191D|     null|  null|
|    21523|     null|   2.0|
|    21292|     null|  null|
|    21989|     96.0| 264.0|
|    22783|     null|  null|
|    22813|     36.0|  12.0|
|    23484|     null|  null|
|    23264|     null|  36.0|
+---------+---------+------+
only showing top 10 rows



In [320]:
df_s1.orderBy(desc("Country"),col("Tot_Qty").desc()).show()

+-----------+---------+--------+------+-------+
|    Country|StockCode|INV_YEAR|Max_UP|Tot_Qty|
+-----------+---------+--------+------+-------+
|Unspecified|    84077|    2011|  0.29|   96.0|
|Unspecified|   47021G|    2011|  0.39|   48.0|
|Unspecified|    84992|    2011|  0.55|   48.0|
|Unspecified|    23076|    2011|  1.25|   48.0|
|Unspecified|    22439|    2011|  0.65|   40.0|
|Unspecified|    21212|    2011|  0.55|   37.0|
|Unspecified|    23310|    2011|  0.42|   36.0|
|Unspecified|    23119|    2011|  0.62|   36.0|
|Unspecified|    22952|    2011|  0.55|   36.0|
|Unspecified|    21891|    2011|  1.45|   33.0|
|Unspecified|    11001|    2011|  1.69|   32.0|
|Unspecified|    21124|    2011|  1.25|   32.0|
|Unspecified|    22555|    2011|  1.65|   30.0|
|Unspecified|    22620|    2011|  1.45|   29.0|
|Unspecified|    22540|    2011|  0.42|   28.0|
|Unspecified|    23309|    2011|  0.55|   27.0|
|Unspecified|    21977|    2011|  0.55|   26.0|
|Unspecified|    22529|    2011|  0.42| 

In [323]:
df_s1.select("Country",col("Tot_Qty"),col("Tot_Qty").between(20,40)).show()

+---------+-------+-------------------------------------+
|  Country|Tot_Qty|((Tot_Qty >= 20) AND (Tot_Qty <= 40))|
+---------+-------+-------------------------------------+
|Australia|   12.0|                                false|
|Australia|  252.0|                                false|
|Australia|   48.0|                                false|
|Australia|  168.0|                                false|
|Australia|  110.0|                                false|
|Australia|  144.0|                                false|
|Australia|  111.0|                                false|
|Australia|   92.0|                                false|
|Australia|   12.0|                                false|
|Australia|  160.0|                                false|
|Australia|  228.0|                                false|
|Australia|  288.0|                                false|
|Australia|   20.0|                                 true|
|Australia|   36.0|                                 true|
|Australia|  1

In [328]:
df_s1.select(col("Max_UP").cast(DoubleType())).show()

+------+
|Max_UP|
+------+
|  0.83|
|  0.85|
|  0.39|
|  8.95|
|  4.95|
|  1.45|
|  4.95|
|  1.65|
|  1.95|
|  1.65|
|  1.45|
|  1.45|
|  6.75|
|  2.55|
|  0.55|
|  3.75|
|  1.04|
|  2.89|
|  1.65|
|  1.45|
+------+
only showing top 20 rows



In [330]:
df_s1.select(col("Max_UP").cast('double'))

DataFrame[Max_UP: double]

In [331]:
df_s1.select(col("Max_UP").astype('double'))

DataFrame[Max_UP: double]

In [348]:
df_s1.select(col("Country"),col("Country").contains('United').alias("ContainsUnit"))\
.where(col("ContainsUnit")==True)\
.show()

+--------------+------------+
|       Country|ContainsUnit|
+--------------+------------+
|United Kingdom|        true|
|United Kingdom|        true|
|United Kingdom|        true|
|United Kingdom|        true|
|United Kingdom|        true|
|United Kingdom|        true|
|United Kingdom|        true|
|United Kingdom|        true|
|United Kingdom|        true|
|United Kingdom|        true|
|United Kingdom|        true|
|United Kingdom|        true|
|United Kingdom|        true|
|United Kingdom|        true|
|United Kingdom|        true|
|United Kingdom|        true|
|United Kingdom|        true|
|United Kingdom|        true|
|United Kingdom|        true|
|United Kingdom|        true|
+--------------+------------+
only showing top 20 rows



In [349]:
from pyspark.sql import Row

dfA = spark.createDataFrame([Row(id=1, value='foo')
                            ,Row(id=2, value=None)
                            ])

In [351]:
dfA.show()

+---+-----+
| id|value|
+---+-----+
|  1|  foo|
|  2| null|
+---+-----+



In [357]:
dfA.select(col("value") == 'foo'
           , col("value").eqNullSafe('foo')
           , col("value").eqNullSafe(None)
          )\
.show()

+-------------+---------------+----------------+
|(value = foo)|(value <=> foo)|(value <=> NULL)|
+-------------+---------------+----------------+
|         true|           true|           false|
|         null|          false|            true|
+-------------+---------------+----------------+



In [359]:
dfB = spark.createDataFrame([Row(value = 'bar'),
                             Row(value = None)])

In [360]:
dfA.join(dfB, dfA.value == dfB.value).show()

+---+-----+-----+
| id|value|value|
+---+-----+-----+
+---+-----+-----+



In [361]:
dfA.join(dfB, dfA.value.eqNullSafe(dfB.value)).show()

+---+-----+-----+
| id|value|value|
+---+-----+-----+
|  2| null| null|
+---+-----+-----+



In [362]:
dfC = spark.createDataFrame([Row(id=1, value=float('NaN')),
                             Row(id=2, value=42.0),
                             Row(id=3, value=None)
                            ])

In [370]:
dfC.select(col("value").eqNullSafe(None),
           col("value").eqNullSafe(float('NaN')),
           col("value").eqNullSafe(42.0)
    ).show()

+----------------+---------------+----------------+
|(value <=> NULL)|(value <=> NaN)|(value <=> 42.0)|
+----------------+---------------+----------------+
|           false|           true|           false|
|           false|          false|            true|
|            true|          false|           false|
+----------------+---------------+----------------+



In [377]:
dfD = spark.createDataFrame([Row(r=Row(a=1, bb="b"))])

In [379]:
dfD.select( dfD.r.getField("bb")).show()

+----+
|r.bb|
+----+
|   b|
+----+



In [381]:
dfD.select(dfD.r.a).show()

+---+
|r.a|
+---+
|  1|
+---+



In [388]:
df_sales.select(col("Country"),col("Country").isin('France','Australia').alias("SelectedCountry"))\
.where("SelectedCountry")\
.distinct()\
.show()

+---------+---------------+
|  Country|SelectedCountry|
+---------+---------------+
|Australia|           true|
|   France|           true|
+---------+---------------+



In [389]:
df_sales.select(col("Country"),col("Country").like('Fra%').alias("SelectedCountry"))\
.where("SelectedCountry")\
.distinct()\
.show()

+-------+---------------+
|Country|SelectedCountry|
+-------+---------------+
| France|           true|
+-------+---------------+



In [390]:
df_sales.select(col("Country"),col("Country").like('Fra%').name("SelectedCountry"))\
.where("SelectedCountry")\
.distinct()\
.show()

+-------+---------------+
|Country|SelectedCountry|
+-------+---------------+
| France|           true|
+-------+---------------+



In [393]:
df_sales.select("Country", when(col("Country") =='France', 'In_France').otherwise('Out_France'))\
.distinct()\
.show(40)

+--------------------+---------------------------------------------------------------+
|             Country|CASE WHEN (Country = France) THEN In_France ELSE Out_France END|
+--------------------+---------------------------------------------------------------+
|              Poland|                                                     Out_France|
|      Czech Republic|                                                     Out_France|
|           Lithuania|                                                     Out_France|
|     Channel Islands|                                                     Out_France|
|             Austria|                                                     Out_France|
|             Lebanon|                                                     Out_France|
|             Germany|                                                     Out_France|
|               Italy|                                                     Out_France|
|             Denmark|                     

In [398]:
from pyspark.sql import Catalog

In [406]:
spark.catalog.listDatabases()

[Database(name='anil_test', description='', locationUri='file:/home/anil/Anil_Created_Docs/01.02.BID_DATA/PySPark/Practice-from-Spark-Definitive-Guide/spark-warehouse/anil_test.db'),
 Database(name='default', description='Default Hive database', locationUri='file:/home/anil/Anil_Created_Docs/01.02.BID_DATA/PySPark/Practice-from-Spark-Definitive-Guide/spark-warehouse'),
 Database(name='some_db', description='', locationUri='file:/home/anil/Anil_Created_Docs/01.02.BID_DATA/PySPark/Practice-from-Spark-Definitive-Guide/spark-warehouse/some_db.db')]

In [407]:
spark.catalog.listTables()

[Table(name='bucketedfiles', database='default', description=None, tableType='MANAGED', isTemporary=False),
 Table(name='flight_from_select', database='default', description=None, tableType='MANAGED', isTemporary=False),
 Table(name='flights', database='default', description=None, tableType='EXTERNAL', isTemporary=False),
 Table(name='hive_flights', database='default', description=None, tableType='EXTERNAL', isTemporary=False),
 Table(name='hive_flights_2', database='default', description=None, tableType='EXTERNAL', isTemporary=False),
 Table(name='just_usa_view', database='default', description=None, tableType='VIEW', isTemporary=False),
 Table(name='list_view', database='default', description=None, tableType='VIEW', isTemporary=False),
 Table(name='nested_data', database='default', description=None, tableType='VIEW', isTemporary=False),
 Table(name='partitioned_flights', database='default', description=None, tableType='MANAGED', isTemporary=False),
 Table(name='df_s2', database=None,

In [408]:
spark.catalog.listFunctions

<bound method Catalog.listFunctions of <pyspark.sql.catalog.Catalog object at 0x7ff1fdfb2b00>>