In [1]:
# Load the one-day retail CSV: the header row supplies the column names and
# Spark infers each column's type from the data.
df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("2010-12-01.csv")
)

In [2]:

# Print the inferred schema, then register the DataFrame as the temp view
# "dftable" so it can be queried by name.
df.printSchema()
df.createOrReplaceTempView("dftable")

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)



In [3]:
# The first thing to learn: the `lit` function.
# `lit` converts a native Python value into the corresponding Spark literal column.

In [4]:
from pyspark.sql.functions import lit
# Each lit(...) wraps a Python value as a literal column. No action is called,
# so the notebook only displays the resulting DataFrame's repr.
df.select (lit(5), lit("five") , lit(5.0))

DataFrame[5: int, five: string, 5.0: double]

In [5]:
from pyspark.sql.functions import col
# InvoiceNo was inferred as a string column, so compare it with a quoted
# string literal. With an unquoted number Spark implicitly casts the column to
# a numeric type; non-numeric invoice numbers then become null and silently
# fall out of a "!=" filter (or raise under ANSI mode).
df.where("InvoiceNo != '536365'").select("InvoiceNo", "Description").show(5, False)

+---------+-----------------------------+
|InvoiceNo|Description                  |
+---------+-----------------------------+
|536366   |HAND WARMER UNION JACK       |
|536366   |HAND WARMER RED POLKA DOT    |
|536367   |ASSORTED COLOUR BIRD ORNAMENT|
|536367   |POPPY'S PLAYHOUSE BEDROOM    |
|536367   |POPPY'S PLAYHOUSE KITCHEN    |
+---------+-----------------------------+
only showing top 5 rows



In [6]:
# InvoiceNo is a string column: compare against a quoted literal so no implicit
# string-to-number cast is applied to the column.
df.where("InvoiceNo = '536365'").select("InvoiceNo", "Description").show(5, False)

+---------+-----------------------------------+
|InvoiceNo|Description                        |
+---------+-----------------------------------+
|536365   |WHITE HANGING HEART T-LIGHT HOLDER |
|536365   |WHITE METAL LANTERN                |
|536365   |CREAM CUPID HEARTS COAT HANGER     |
|536365   |KNITTED UNION FLAG HOT WATER BOTTLE|
|536365   |RED WOOLLY HOTTIE WHITE HEART.     |
+---------+-----------------------------------+
only showing top 5 rows



In [7]:
# Combining several boolean filters.
from pyspark.sql.functions import col, instr
pricefilter = col("UnitPrice") > 600
# instr is case-sensitive and the descriptions in this dataset are upper-case,
# so the previous lower-case search text "positive" could never match.
# NOTE(review): the saved output below predates this fix; re-run the cell to refresh it.
descfilter = instr(df.Description, "POSTAGE") >= 1
df.where(df.StockCode.isin("DOT")).where(pricefilter | descfilter).show()

+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|   Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|   536592|      DOT|DOTCOM POSTAGE|       1|2010-12-01 17:06:00|   607.49|      null|United Kingdom|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+



In [8]:
# Express the same filters as one boolean column:
# stock code is DOT AND (price above 600 OR description mentions postage).
from pyspark.sql.functions import col, instr
dotcodefilter = df.StockCode == "DOT"
pricefilter = col("UnitPrice") > 600
# instr is case-sensitive and the descriptions in this dataset are upper-case,
# so the previous lower-case search text "positive" could never match.
# NOTE(review): the saved output below predates this fix; re-run the cell to refresh it.
descfilter = instr(df.Description, "POSTAGE") >= 1
(
    df.withColumn("IsExpensive", dotcodefilter & (pricefilter | descfilter))
    .where("IsExpensive")
    .select("UnitPrice", "IsExpensive")
    .show(5)
)

+---------+-----------+
|UnitPrice|IsExpensive|
+---------+-----------+
|   607.49|       true|
+---------+-----------+



In [9]:
from pyspark.sql.functions import expr
# Build the boolean flag from a SQL expression string, then keep only the
# flagged rows and display the price next to the flag.
is_expensive = expr("not UnitPrice <= 250")
(
    df.withColumn("IsExpensive", is_expensive)
    .where("IsExpensive")
    .select("UnitPrice", "IsExpensive")
    .show(5)
)

+---------+-----------+
|UnitPrice|IsExpensive|
+---------+-----------+
|   569.77|       true|
|   607.49|       true|
+---------+-----------+



In [10]:
# Column-expression version of the flag. Note that ">=" includes a price of
# exactly 250, whereas the SQL form "not UnitPrice <= 250" excludes it.
flagged = df.withColumn("IsExpensive", col("UnitPrice") >= 250)
flagged.where("IsExpensive").select("UnitPrice", "IsExpensive").show(5)

+---------+-----------+
|UnitPrice|IsExpensive|
+---------+-----------+
|   569.77|       true|
|   607.49|       true|
+---------+-----------+



In [11]:
# When a column may contain nulls, use eqNullSafe for a null-safe equality test
# (per the PySpark docs, comparing a null this way yields false rather than null).
df.where(df.Description.eqNullSafe("hello")).show(5)

+---------+---------+-----------+--------+-----------+---------+----------+-------+
|InvoiceNo|StockCode|Description|Quantity|InvoiceDate|UnitPrice|CustomerID|Country|
+---------+---------+-----------+--------+-----------+---------+----------+-------+
+---------+---------+-----------+--------+-----------+---------+----------+-------+



# Working With Numbers

In [15]:
# Pretend the "real" quantity is (Quantity * UnitPrice)^2 + 5.
# Note: this import makes `pow` refer to the Spark column function, not the Python builtin.
from pyspark.sql.functions import expr , pow
line_total = df.Quantity * df.UnitPrice
fabricatedquantity = pow(line_total, 2) + 5
df.select(
    df.CustomerID,
    df.Quantity,
    df.UnitPrice,
    fabricatedquantity.alias("RealQuantity"),
).show(5)

+----------+--------+---------+------------------+
|CustomerID|Quantity|UnitPrice|      RealQuantity|
+----------+--------+---------+------------------+
|   17850.0|       6|     2.55|239.08999999999997|
|   17850.0|       6|     3.39|          418.7156|
|   17850.0|       8|     2.75|             489.0|
|   17850.0|       6|     3.39|          418.7156|
|   17850.0|       6|     3.39|          418.7156|
+----------+--------+---------+------------------+
only showing top 5 rows



In [33]:
# selectExpr takes SQL expression strings: columns are referenced by name inside
# the quoted string rather than through col(...) or df.<name>.
df.selectExpr("CustomerID" ,"(power((Quantity*UnitPrice), 2)+5) as RealQuantity").show(2)

+----------+------------------+
|CustomerID|      RealQuantity|
+----------+------------------+
|   17850.0|239.08999999999997|
|   17850.0|          418.7156|
+----------+------------------+
only showing top 2 rows



In [37]:
# Rounding: round rounds half up (HALF_UP); bround rounds half to even (HALF_EVEN).
# They are not simply "round up" / "round down".
# Note: this import makes `round` refer to the Spark column function, not the Python builtin.
# This cell only displays the literal "2.5" twice; no rounding is applied yet.
from pyspark .sql.functions import round, bround
df.select (lit("2.5").alias("round") , lit("2.5")).show(2)

+-----+---+
|round|2.5|
+-----+---+
|  2.5|2.5|
|  2.5|2.5|
+-----+---+
only showing top 2 rows



In [39]:
# Same input 2.5: round gives 3.0 (half up), bround gives 2.0 (half to even).
df.select (round (lit("2.5")) , bround (lit("2.5") )).show(2)

+-------------+--------------+
|round(2.5, 0)|bround(2.5, 0)|
+-------------+--------------+
|          3.0|           2.0|
|          3.0|           2.0|
+-------------+--------------+
only showing top 2 rows



In [46]:
# Compute the correlation between two columns in two equivalent ways:
# df.stat.corr returns a Python float, while the corr() function is used
# inside select and yields a one-row DataFrame.
from pyspark.sql.functions import corr 
print ( df.stat.corr("Quantity" , "UnitPrice") )
df.select (corr ("Quantity" , "UnitPrice").alias("corrr")).show(2) 

-0.04112314436835551
+--------------------+
|               corrr|
+--------------------+
|-0.04112314436835551|
+--------------------+



In [49]:
# To see summary statistics (count, mean, stddev, min, max) for one or more
# columns, use the describe method.
df.describe("Description","UnitPrice","CustomerID").show()
#df.describe().show() # for all columns in DF if you want
# If you need one exact measure, import the aggregate function directly:
# from pyspark.sql.functions import count, mean, min, max

+-------+--------------------+------------------+------------------+
|summary|         Description|         UnitPrice|        CustomerID|
+-------+--------------------+------------------+------------------+
|  count|                3098|              3108|              1968|
|   mean|                null| 4.151946589446603|15661.388719512195|
| stddev|                null|15.638659854603892|1854.4496996893627|
|    min| 4 PURPLE FLOCK D...|               0.0|           12431.0|
|    max|ZINC WILLIE WINKI...|            607.49|           18229.0|
+-------+--------------------+------------------+------------------+



In [53]:
# Cross-tabulation and frequent items.
#df.stat.crosstab("StockCode", "Quantity").show()
# freqItems produces one array column per input column (named <col>_freqItems).
df.stat.freqItems(["StockCode", "Quantity"]).show()

+--------------------+--------------------+
| StockCode_freqItems|  Quantity_freqItems|
+--------------------+--------------------+
|[90214E, 20728, 2...|[200, 128, 23, 32...|
+--------------------+--------------------+



In [56]:
# Add a generated id to each row. The ids start at 0 here, but per the PySpark
# docs they are only guaranteed to be unique and monotonically increasing,
# not consecutive, so do not treat them as a gap-free row number.
from pyspark.sql.functions import monotonically_increasing_id
df.select(monotonically_increasing_id()).show(3)

+-----------------------------+
|monotonically_increasing_id()|
+-----------------------------+
|                            0|
|                            1|
|                            2|
+-----------------------------+
only showing top 3 rows



# Working With Strings

In [60]:
# The first function is initcap, which capitalizes the first letter of every
# word in a sentence.
from pyspark.sql.functions import initcap
df.select(initcap(df.Description)).show(2)

+--------------------+
|initcap(Description)|
+--------------------+
|White Hanging Hea...|
| White Metal Lantern|
+--------------------+
only showing top 2 rows



In [61]:
# Convert the whole string to lower case or upper case with lower / upper.
from pyspark.sql.functions import lower , upper
df.select (df.Description , lower(df.Description) , upper(df.Description)).show(2)

+--------------------+--------------------+--------------------+
|         Description|  lower(Description)|  upper(Description)|
+--------------------+--------------------+--------------------+
|WHITE HANGING HEA...|white hanging hea...|WHITE HANGING HEA...|
| WHITE METAL LANTERN| white metal lantern| WHITE METAL LANTERN|
+--------------------+--------------------+--------------------+
only showing top 2 rows



In [69]:
# Adding or removing spaces around a string with ltrim, rtrim, lpad, rpad, trim.
# Note: when the target length passed to lpad/rpad is shorter than the string,
# the value is truncated (see "hel" and "he" in the output).
from pyspark.sql.functions import ltrim, rtrim , lpad,rpad, trim
df.select(ltrim(lit("     hello     ")).alias("ltrim"),
          rtrim(lit("     hello     ")).alias("rtrim"),
          lpad(lit("hello") , 3," ").alias("lpad"),
          rpad(lit("hello"),2," ").alias("rpad"),
          trim(lit("   hello      ")).alias("trim"),
         ).show(3)

+----------+----------+----+----+-----+
|     ltrim|     rtrim|lpad|rpad| trim|
+----------+----------+----+----+-----+
|hello     |     hello| hel|  he|hello|
|hello     |     hello| hel|  he|hello|
|hello     |     hello| hel|  he|hello|
+----------+----------+----+----+-----+
only showing top 3 rows



In [74]:
# Regular expressions
from pyspark.sql.functions import regexp_replace
regex_string = "BLACK|WHITE|RED|GREEN|BLUE" 
# regexp_replace takes 3 arguments: the source column, the pattern to search
# for, and the replacement string.
df.select(df.Description,\
          regexp_replace(df.Description,regex_string,"COLOR").alias("clean_desc")).show(2)

+--------------------+--------------------+
|         Description|          clean_desc|
+--------------------+--------------------+
|WHITE HANGING HEA...|COLOR HANGING HEA...|
| WHITE METAL LANTERN| COLOR METAL LANTERN|
+--------------------+--------------------+
only showing top 2 rows



In [77]:
# Spark also provides an efficient function for character-level replacement:
# translate replaces each character found in the second argument with the
# character at the same index in the third (here L->1, E->3, T->7).
from pyspark.sql.functions import translate
df.select(df.Description ,translate(df.Description, "LEET" , "1337").alias("Clean Text")).show(2)    

+--------------------+--------------------+
|         Description|          Clean Text|
+--------------------+--------------------+
|WHITE HANGING HEA...|WHI73 HANGING H3A...|
| WHITE METAL LANTERN| WHI73 M37A1 1AN73RN|
+--------------------+--------------------+
only showing top 2 rows



In [89]:
from pyspark.sql.functions import regexp_extract
# regexp_extract pulls out the first mentioned color.
regex_string = "(BLACK|WHITE|RED|GREEN|BLUE)"
# Note: the pattern must contain a parenthesised capture group, like r'(\d+)';
# the third argument (1) selects which group to return.
df.select (df.Description,\
           regexp_extract(df.Description,regex_string,1).alias("CleanText")).show(2)

+--------------------+---------+
|         Description|CleanText|
+--------------------+---------+
|WHITE HANGING HEA...|    WHITE|
| WHITE METAL LANTERN|    WHITE|
+--------------------+---------+
only showing top 2 rows



In [101]:
# To check whether a string contains a substring, a "contains"-style test is
# needed; in PySpark the instr function can be used for this.
from pyspark.sql.functions import instr
# instr: 1-based position of the first occurrence of the substring in the given
# string column (0 when it is not found, per the PySpark docs).
# where("hassimplercolor") keeps only the rows where the flag is true.
containblack = instr(df.Description,"BLACK") >=1 # boolean column: true or false
containwhite = instr(df.Description,"WHITE") >=1
df.withColumn("hassimplercolor" , containblack|containwhite)\
.where("hassimplercolor").select (df.Description , col("hassimplercolor")).show(3)

+--------------------+---------------+
|         Description|hassimplercolor|
+--------------------+---------------+
|WHITE HANGING HEA...|           true|
| WHITE METAL LANTERN|           true|
|RED WOOLLY HOTTIE...|           true|
+--------------------+---------------+
only showing top 3 rows



# Working with Date and Time

In [None]:
# Dates and timestamps are often stored as strings and converted to date/timestamp types at runtime;
# this is much more common when working with text files and CSV.
# Spark's TimestampType is described here as supporting only second-level precision; to work with milliseconds or microseconds, treat them as longs (TODO: verify against the Spark version in use).

In [4]:
from pyspark.sql.functions import current_date , current_timestamp
# Ten-row frame (ids 0-9) carrying today's date and the current timestamp.
datedf = (
    spark.range(10)
    .withColumn("today", current_date())
    .withColumn("now", current_timestamp())
)

In [8]:
# Peek at the first row of the date frame.
datedf.show(1)

+---+----------+--------------------+
| id|     today|                 now|
+---+----------+--------------------+
|  0|2020-10-20|2020-10-20 23:11:...|
+---+----------+--------------------+
only showing top 1 row



In [9]:
# Confirm that "today" is a date column and "now" is a timestamp column.
datedf.printSchema()

root
 |-- id: long (nullable = false)
 |-- today: date (nullable = false)
 |-- now: timestamp (nullable = false)



In [11]:
# Add days to / subtract days from a date column.
from pyspark.sql.functions import date_add, date_sub
datedf.select(date_add(datedf.today ,5).alias("add") ,date_sub(datedf.today ,5) ).show(2)

+----------+------------------+
|       add|date_sub(today, 5)|
+----------+------------------+
|2020-10-25|        2020-10-15|
|2020-10-25|        2020-10-15|
+----------+------------------+
only showing top 2 rows



In [36]:
# Difference in days between two dates: datediff(end, start) is negative
# here because "weekago" precedes "today".
from pyspark.sql.functions import datediff, months_between ,to_date , col ,lit
week_ago_df = datedf.withColumn("weekago", date_sub(datedf.today, 7))
week_ago_df.select(datediff(col("weekago"), col("today"))).show(1)

+------------------------+
|datediff(weekago, today)|
+------------------------+
|                      -7|
+------------------------+
only showing top 1 row



In [38]:
# Months between two dates (negative because "start" precedes "end").
# The end literal was the malformed "2016-05-022"; it is now the well-formed
# "2016-05-22", which is the date the recorded result (-4.677...) corresponds to.
datedf.select(
    to_date(lit("2016-01-01")).alias("start"),
    to_date(lit("2016-05-22")).alias("end"),
).select(months_between(col("start"), col("end"))).show(2)


+--------------------------------+
|months_between(start, end, true)|
+--------------------------------+
|                     -4.67741935|
|                     -4.67741935|
+--------------------------------+
only showing top 2 rows



In [39]:
# The to_date function converts a string to the date type.
# Here the "date" column is still just a string literal; to_date is imported
# but not applied yet.
from pyspark.sql.functions import lit, to_date
spark.range(5).withColumn("date" , lit("2017-01-01")).show(1)

+---+----------+
| id|      date|
+---+----------+
|  0|2017-01-01|
|  1|2017-01-01|
|  2|2017-01-01|
|  3|2017-01-01|
|  4|2017-01-01|
+---+----------+



In [40]:
# Apply to_date to the string column to obtain a real date column.
spark.range(5).withColumn("date" , lit("2017-01-01")).select(to_date(col("date"))).show(1)

+---------------+
|to_date(`date`)|
+---------------+
|     2017-01-01|
|     2017-01-01|
|     2017-01-01|
|     2017-01-01|
|     2017-01-01|
+---------------+



In [43]:
# Very important note:
# Spark does not throw an error when it cannot parse the date; it returns null
# (see the first column of the output).
datedf.select(to_date(lit("2016-20-12")), to_date(lit("2016-02-12"))).show(1)
# Without an explicit format, to_date expects a specific layout and turns
# anything else into null.
# Spark follows the Java date/time format conventions.

+---------------------+---------------------+
|to_date('2016-20-12')|to_date('2016-02-12')|
+---------------------+---------------------+
|                 null|           2016-02-12|
+---------------------+---------------------+
only showing top 1 row



In [58]:
# Passing an explicit format to to_date / to_timestamp solves the parsing
# issue: "yyyy-dd-MM" reads the middle field as the day and the last as the month.
from pyspark.sql.functions import lit, to_date
dateformat =  "yyyy-dd-MM"
# Keep the DataFrame itself in cleandatedf. show() returns None, so chaining it
# onto the assignment (as before) left cleandatedf bound to None.
cleandatedf = spark.range(1).select(
    to_date(lit("2017-12-11"), dateformat).alias("date1"),
    to_date(lit("2017-20-12"), dateformat).alias("date2"))
cleandatedf.show()

+----------+----------+
|     date1|     date2|
+----------+----------+
|2017-11-12|2017-12-20|
+----------+----------+



# working with nulls

In [70]:
# The first function is coalesce, which returns the first non-null value from
# a set of columns (here CustomerID, falling back to Description).
from pyspark.sql.functions import coalesce
df.select(coalesce(df.CustomerID ,df.Description)).show(2)

+---------------------------------+
|coalesce(CustomerID, Description)|
+---------------------------------+
|                          17850.0|
|                          17850.0|
+---------------------------------+
only showing top 2 rows



In [66]:
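# Some SQL functions for handling nulls:
# ifnull: returns the second value if the first is null, otherwise the first.
# nullif: returns null if the two values are equal, otherwise returns the first value.
# nvl: returns the second value if the first is null, otherwise the first.
# nvl2: returns the second value if the first is not null, otherwise the third.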

['InvoiceNo',
 'StockCode',
 'Description',
 'Quantity',
 'InvoiceDate',
 'UnitPrice',
 'CustomerID',
 'Country']

In [71]:
# None of these results is assigned, so the calls only demonstrate the API;
# the notebook displays the DataFrame returned by the last call.
df.na.drop()
df.na.drop("any") # drop a row if any of its columns is null
df.na.drop("all") # drop a row only if all of its columns are null
# Same "all" rule, but only the listed columns are considered.
df.na.drop("all" , subset = ["StockCode" ,"InvoiceNo"]) 

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: string, UnitPrice: double, CustomerID: double, Country: string]

In [73]:
# The fill function replaces null values.
df.na.fill("all null values become this string")# a string fill value
#df.na.fill(5:Integer)
# A dict maps column name -> replacement value, giving per-column fills.
fill_colls_values = {"StockCode":5 ,"Description":"no value"}
df.na.fill(fill_colls_values)

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: string, UnitPrice: double, CustomerID: double, Country: string]

In [74]:
# Replace empty strings in the Description column with "unknown".
df.na.replace([""],["unknown"],"Description")

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: string, UnitPrice: double, CustomerID: double, Country: string]

# working with complex types (structs , array, maps)

In [8]:
# A struct can be thought of as a DataFrame within a DataFrame.
from pyspark.sql.functions import struct , col
# Pack Description and InvoiceNo into one struct column named "complex",
# then register the result as the temp view "complexdf".
complexdf = df.select (struct("Description" , "InvoiceNo").alias("complex"))
complexdf.createOrReplaceTempView("complexdf")

In [13]:
#complexdf.select("complex.*")
#complexdf.select(col("complex").getField("Description"))

## Arrays

In [24]:
# Convert the Description column to a complex type (an array) by splitting on spaces.
from pyspark.sql.functions import split
df.select (split(col("Description"), " ").alias("array_col")).show(2)

+--------------------+
|           array_col|
+--------------------+
|[WHITE, HANGING, ...|
|[WHITE, METAL, LA...|
+--------------------+
only showing top 2 rows



In [29]:
# Index into the array with SQL syntax: array_col[0] is the first word.
df.select(split(df.Description , " ").alias("array_col")).selectExpr("array_col[0]").show(2)

+------------+
|array_col[0]|
+------------+
|       WHITE|
|       WHITE|
+------------+
only showing top 2 rows



In [31]:
# Query the array size (number of space-separated words in Description).
from pyspark.sql.functions import size
df.select(size (split(df.Description , " ")).alias("array_size")).show(2)

+----------+
|array_size|
+----------+
|         5|
|         3|
+----------+
only showing top 2 rows



In [33]:
# Check whether the array contains a given value with array_contains.
from pyspark.sql.functions import array_contains
df.select(array_contains(split(df.Description , " "),"WHITE").alias("contains")).show(2)

+--------+
|contains|
+--------+
|    true|
|    true|
+--------+
only showing top 2 rows



In [None]:
# To convert a complex type into a set of rows (one per value in the array), use the "explode" function.
# explode takes a column that consists of arrays and creates one new row per array element.

In [44]:
from pyspark.sql.functions import split, explode
# Split each description into words, then emit one row per word alongside the
# original description and stock code.
words_df = df.withColumn("splitted", split(df.Description, " "))
ahmeddf = (
    words_df
    .withColumn("explodded", explode(col("splitted")))
    .select("Description", "StockCode", "explodded")
)

In [46]:
# Show the first two exploded rows (one word per row).
ahmeddf.show(2)

+--------------------+---------+---------+
|         Description|StockCode|explodded|
+--------------------+---------+---------+
|WHITE HANGING HEA...|   85123A|    WHITE|
|WHITE HANGING HEA...|   85123A|  HANGING|
+--------------------+---------+---------+
only showing top 2 rows



## working with maps to be in format key -value pairs

In [48]:
from pyspark.sql.functions import create_map
# Build a map column with Description as the key and InvoiceNo as the value.
df.select(create_map(df.Description , df.InvoiceNo).alias("complex_map")).show(2)

+--------------------+
|         complex_map|
+--------------------+
|[WHITE HANGING HE...|
|[WHITE METAL LANT...|
+--------------------+
only showing top 2 rows



In [None]:
#df.select(create_map(df.Description , df.InvoiceNo).alias("complex_map"))\
#.selectExpr("complex_map['any text here to be matched']").show(2)

In [47]:
# List the column names of df.
df.columns

['InvoiceNo',
 'StockCode',
 'Description',
 'Quantity',
 'InvoiceDate',
 'UnitPrice',
 'CustomerID',
 'Country']

# working with user defined function UDF

In [52]:
udfexampledf = spark.range(5).toDF("num") # DataFrame with a single column named "num" holding the values 0 to 4
def power3 (double_value):
    """Return ``double_value`` raised to the third power."""
    cubed = double_value ** 3
    return cubed
power3(3)  # sanity check with a plain Python number (evaluates to 27)

27

In [50]:
# Display the rows of the example frame.
udfexampledf.show()

+---+
|num|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+

