### Spark SQL Pre-Defined Functions
- String Manipulating
- Date Manipulating
- Numeric Functions
- Type Conversion
- and more
- CASE and WHEN
- Keep in mind that aggregate functions cannot be used in the `WHERE` or `GROUP BY` clauses in Spark SQL; to filter on an aggregated value, use `HAVING`

In [0]:
%sql
-- SHOW FUNCTIONS

In [0]:
%sql
-- Show the implementing class and usage text of the built-in `substr` function.
DESCRIBE FUNCTION substr

function_desc
Function: substr
Class: org.apache.spark.sql.catalyst.expressions.Substring
"Usage: substr(str, pos[, len]) - Returns the substring of `str` that starts at `pos` and is of length `len`, or the slice of byte array that starts at `pos` and is of length `len`.  substr(str FROM pos[ FOR len]]) - Returns the substring of `str` that starts at `pos` and is of length `len`, or the slice of byte array that starts at `pos` and is of length `len`."


In [0]:
%sql
-- First five characters of the literal: start position 1, length 5.
SELECT
    substr('Hello world', 1, 5)

"substr(Hello world, 1, 5)"
Hello


#### STRING MANIPULATING FUNCTIONS
- Case Conversion: `upper`, `lower`, `initcap`
- Size: `length`
- Extract: `substr`, `split`
- Trimming & Padding: `trim`, `rtrim`, `ltrim`, `rpad`, `lpad`
- Reverse: `reverse`
- Concatenate: `concat`, `concat_ws`
- Other useful: `lit`, `col`, `cast`, `explode`

In [0]:
%sql
-- Case conversion: all lower case, all upper case, and first letter of each word capitalised
SELECT
      lower('hEllo wOrlD')
    , upper('hEllo wOrlD')
    , initcap('hEllo wOrlD')

lower(hEllo wOrlD),upper(hEllo wOrlD),initcap(hEllo wOrlD)
hello world,HELLO WORLD,Hello World


In [0]:
%sql
-- Number of characters in a string value (works the same on a column)
SELECT
    length('hEllo wOrlD')

length(hEllo wOrlD)
11


In [0]:
%sql
-- Extract: substr(string, start[, length]); a negative start counts back from the end
SELECT
      substr('123456789', -4)           AS result   -- last four characters
    , substr('123456789', 6, 2)         AS result2  -- two characters starting at position 6
    , substr('ITVersity', 1, 3)         AS result3  -- first three characters
    , substr(current_timestamp, 1, 10)  AS result4  -- date part of the timestamp's text form; date_format is the usual tool for this

result,result2,result3,result4
6789,67,ITV,2023-10-03


In [0]:
%sql
-- Make itversity_retail_db the current database for the session, then list its tables.
-- Later queries refer to tables such as `orders` without a database prefix.
USE itversity_retail_db;
SHOW TABLES;

database,tableName,isTemporary
itversity_retail_db,order_items,False
itversity_retail_db,orders,False


In [0]:
%sql
-- split(string, pattern) returns an array; the pattern is a regular expression.
-- ', *' consumes the comma together with any spaces after it, so the second
-- element does not keep the blank that follows the comma in the input.
SELECT
      split('1234567890, 1234567891', ', *')    AS result   -- the whole array
    , split('1234567890, 1234567891', ', *')[0] AS result2  -- first element (index 0)

result,result.1
"List(1234567890, 1234567891)",1234567890


In [0]:
%sql
-- explode flattens the array into one row per element. Splitting on the pattern
-- ', *' (comma plus any following spaces) keeps each row free of a leading blank.
SELECT explode(split('1234567890, 1234567891', ', *')) AS result

result
1234567890
1234567891


In [0]:
%sql
-- Trimming: ltrim / rtrim / trim strip spaces when given only a string; the
-- trim(<side> <chars> FROM <string>) form strips any of the listed characters instead.
SELECT
      ltrim('    Hello World')                          AS result
    , rtrim('Hello World    ')                          AS result2
    , trim('    Hello World    ')                       AS result3
    , trim(LEADING   'x'     FROM  'xxxxHello Worldyyyy')  AS result4  -- LEADING: strip from the beginning (left) only
    , trim(TRAILING  'y'     FROM  'xxxxHello Worldyyyy')  AS result5  -- TRAILING: strip from the end (right) only
    , trim(BOTH      'xy'    FROM  'xxxxHello Worldyyyy')  AS result6  -- BOTH: strip from both ends
    , trim(BOTH      'xyXY'  FROM  'xYxYHello WorldyXyX')  AS result7  -- the characters act as a set, so mixed runs are removed
     


result,result2,result3,result4,result5,result6,result7
Hello World,Hello World,Hello World,Hello Worldyyyy,xxxxHello World,Hello World,Hello World


In [0]:
%sql
-- Year, month and day held as separate integers; together they stand for 2013-07-25
SELECT
      2013 AS year
    , 7    AS month
    , 25   AS my_date

year,month,my_date
2013,7,25


In [0]:
%sql
-- Setup statements for the sales_fact demo table, left commented out
-- (presumably already executed once; uncomment to recreate the data).
-- CREATE TABLE sales_fact(
--   sale_year INT,
--   sale_month INT,
--   sale_day INT,
--   order_revenue FLOAT,
--   order_count INT
-- );

-- INSERT INTO sales_fact values
-- (2022, 1, 1, 1000.0, 3),
-- (2022, 1, 10, 1250.0, 2),
-- (2022, 2, 5, 1300.00, 5);

-- Columns are listed explicitly rather than with SELECT * so the result keeps
-- this shape and order even if the table definition changes later.
SELECT
      sale_year
    , sale_month
    , sale_day
    , order_revenue
    , order_count
FROM sales_fact;

sale_year,sale_month,sale_day,order_revenue,order_count
2022,1,1,1000.0,3
2022,1,10,1250.0,2
2022,2,5,1300.0,5


In [0]:
 %sql
 -- Assemble a yyyy-MM-dd string from the integer date parts.
 -- The parts are cast to STRING explicitly and padded with the string '0', instead of
 -- handing integers to the string functions and relying on implicit coercion.
 SELECT
       concat_ws(
           '-'
         , CAST(sale_year AS STRING)
         , lpad(CAST(sale_month AS STRING), 2, '0')   -- 1 -> 01
         , lpad(CAST(sale_day AS STRING), 2, '0')     -- 5 -> 05
       ) AS date
     , order_revenue
     , order_count
 FROM sales_fact;


date,order_revenue,order_count
2022-01-01,1000.0,3
2022-01-10,1250.0,2
2022-02-05,1300.0,5


In [0]:
# %sql
# DROP TABLE sales_fact

In [0]:
%sql
-- Concat & Reverse
-- Every column has its own alias, numbered in order; reusing an alias yields
-- duplicate column names in the result set.
SELECT
      reverse('Hello World')                                                AS result
    , concat('Hello', ' ', 'World', ' ', 'how', ' ', 'are', ' ', 'you')     AS result2  -- separators passed as ordinary arguments
    , concat_ws(' ', 'Hello', 'World', 'how', 'are', 'you')                 AS result3  -- separator given once, as the first argument
    , array('123456789', '987654321', '345678912')                          AS result4
    , concat_ws(', ', array('123456789', '987654321', '345678912'))         AS result5  -- concat_ws also joins the elements of an array
      


result,result2,result2.1,result4,result3
dlroW olleH,Hello World how are you,Hello World how are you,"List(123456789, 987654321, 345678912)","123456789, 987654321, 345678912"


In [0]:
%sql
-- Prefix each distinct order status with a fixed label; at most 10 rows are returned
SELECT DISTINCT
    concat('Order Status is ', order_status) AS result
FROM orders
LIMIT 10

result
Order Status is PROCESSING
Order Status is PENDING_PAYMENT
Order Status is CLOSED
Order Status is SUSPECTED_FRAUD
Order Status is CANCELED
Order Status is COMPLETE
Order Status is ON_HOLD
Order Status is PENDING
Order Status is PAYMENT_REVIEW


### DATE MANIPULATION FUNCTIONS
- Current Date & Time stamp: `current_date`, `current_timestamp`  yyyy-MM-ddTHH:mm:ss.SS
- Date & Time arithmetic: `date_add`, `datediff`, `add_months`, `months_between`
- Beginning Date or Time: `trunc`, `date_trunc`
- Date & Time Extract: `date_format` covers all (`year`, `month`,  `dayofyear`, `dayofmonth`, `hour`, `minute`, `second`),
 `weekofyear`, `dayofweek`
- Non-standard Date & Time to Standard Date & Time: `to_date`, `to_timestamp`
- Dealing with Unix timestamps: `from_unixtime`, `unix_timestamp`, `to_unix_timestamp`

In [0]:
%sql
-- Date arithmetic, together with current_date and current_timestamp
SELECT
      current_date                                  AS res1
    , current_timestamp                             AS res2
    , date_add('2023-10-03', 5)                     AS res3  -- five days later
    , date_add('2023-10-03', -5)                    AS res4  -- a negative offset makes date_add act like date_sub
    , datediff('2023-10-03', '2023-09-28')          AS res5  -- days between the two dates (first minus second)
    , add_months('2023-10-03', 2)                   AS res6
    , months_between('2023-12-03', '2023-10-03')    AS res7

-- MySQL expresses the same arithmetic with INTERVAL:
-- SELECT 
--        date_add('2023-10-03', interval 5 day) as res3
--      , date_add('2023-10-03', interval -5 day) as res4 
--      , datediff('2023-10-03', '2023-09-28') as res5
  

res1,res2,res3,res4,res5,res6,res7
2023-10-03,2023-10-03T06:25:41.163+0000,2023-10-08,2023-09-28,5,2023-12-03,2.0


In [0]:
%sql
-- Beginning of a date / time period
-- trunc(date, unit) is shown for month and year; date_trunc(unit, timestamp) takes the
-- unit first and is shown for hour, minute and day.
-- NOTE(review): recent Spark versions also accept 'QUARTER' and 'WEEK' for trunc; confirm for the runtime in use.
SELECT
      trunc(current_date, 'MM')                 AS begining_date_of_month
    , trunc(current_date, 'yy')                 AS begining_date_of_year
    , date_trunc('HOUR', current_timestamp)     AS begining_hour
    , date_trunc('MINUTE', current_timestamp)   AS begining_minute
    , date_trunc('DAY', current_timestamp)      AS begining_day

begining_date_of_month,begining_date_of_year,begining_hour,begining_minute,begining_day
2023-10-01,2023-01-01,2023-10-03T06:00:00.000+0000,2023-10-03T06:36:00.000+0000,2023-10-03T00:00:00.000+0000


In [0]:
%sql
-- Extracting date / time parts with date_format.
-- Note: date_format always returns a string.
SELECT
      current_timestamp                         AS res1
    , date_format(current_timestamp, 'yyyy')    AS yr          -- or year()
    , date_format(current_timestamp, 'MM')      AS mon         -- or month()
    , date_format(current_timestamp, 'dd')      AS day         -- or dayofmonth()
    , date_format(current_timestamp, 'DDD')     AS day_of_yr   -- or dayofyear(); Julian-style day number
    , date_format(current_timestamp, 'MMM')     AS abb_month   -- abbreviated month name
    , date_format(current_timestamp, 'MMMM')    AS full_month  -- full month name
    , date_format(current_timestamp, 'EE')      AS abb_day     -- abbreviated day name
    , date_format(current_timestamp, 'EEEE')    AS full_day    -- full day name
    , date_format(current_timestamp, 'HH')      AS hr24        -- 24-hour clock; or hour()
    , date_format(current_timestamp, 'hh')      AS hr12        -- 12-hour clock
    , date_format(current_timestamp, 'mm')      AS mins        -- or minute()
    , date_format(current_timestamp, 'ss')      AS secs        -- or second()
    , date_format(current_timestamp, 'SS')      AS millis      -- first two digits of the fractional second
    , date_format(current_timestamp, 'SSS')     AS millis3     -- three fraction digits, i.e. milliseconds


res1,yr,mon,day,day_of_yr,abb_month,full_month,abb_day,full_day,hr24,hr12,mins,secs,millis,millis3
2023-10-03T07:36:08.987+0000,2023,10,3,276,Oct,October,Tue,Tuesday,7,7,36,8,98,987


In [0]:
%sql
-- Calendar date functions (note: most other date parts are covered by date_format)
SELECT
      weekofyear(current_timestamp) AS wk_of_yr
    , dayofweek(current_timestamp)  AS day_of_wk   -- numbered from Sunday = 1, so Tuesday is 3
   

wk_of_yr,day_of_wk
40,3


In [0]:
%sql
-- The same timestamp rendered through several date_format patterns
SELECT
      date_format(current_timestamp, 'yyyyMMdd')        AS cur_date                     -- compact year-month-day
    , date_format(current_timestamp, 'yyyyDDD')         AS cur_yr_and_day_of_the_year   -- year followed by day of year
    , date_format(current_timestamp, 'dd MMMM, yyyy')   AS cur_date2                    -- day, full month name, year
    , date_format(current_timestamp, 'yyyy/MM/dd')      AS cur_date3                    -- slash-separated

cur_date,cur_yr_and_day_of_the_year,cur_date2,cur_date3
20231003,2023276,"03 October, 2023",2023/10/03


In [0]:
%sql
-- Converting non-standard date / timestamp text into proper DATE / TIMESTAMP values
-- with to_date and to_timestamp, so that functions such as trunc and date_format can
-- then be applied to the result.
-- Every input is written as a string literal: the value is parsed as text against the
-- pattern, so nothing here depends on an implicit number-to-string cast.
SELECT
      to_date('2022/1/16', 'yyyy/M/dd')                     AS res1  -- the pattern must mirror the layout of the input
    , to_timestamp('2022/1/16 18:24', 'yyyy/M/dd HH:mm')    AS res2
    , to_date('20221015', 'yyyyMMdd')                       AS res3
    , to_date('2022090', 'yyyyDDD')                         AS res4  -- year + day of year (Julian-style representation)
    , trunc(to_date('2022090', 'yyyyDDD'), 'MM')            AS res5  -- first day of that month
    , date_format(to_date('2022090', 'yyyyDDD'), 'EEEE')    AS res6  -- weekday name of that date

res1,res2,res3,res4,res5,res6
2022-01-16,2022-01-16T18:24:00.000+0000,2022-10-15,2022-03-31,2022-03-01,Thursday


#### Dealing with Unix Timestamp
* `from_unixtime` : can be used to convert unix epoch to readable date or timestamp based on specified format
* `unix_timestamp` or `to_unix_timestamp` : can be used to convert date or timestamp based on specified format to unix epoch.
* What is the Unix epoch? : it started on 1 January 1970 at 00:00:00 UTC; a Unix timestamp is the number of seconds elapsed since then, increasing by 1 every second
* To get the Unix epoch for the current time, run `date '+%s'` in a Linux/Unix terminal

In [0]:
# Print the current time as a Unix epoch: a date/timestamp represented as a whole number of seconds
!date +%s

1696320372


In [0]:
%sql
-- Converting between Unix epoch seconds and readable date / time text
SELECT
      from_unixtime(1696320372)                 AS res1  -- epoch seconds -> 'yyyy-MM-dd HH:mm:ss' text
    , from_unixtime(1696320372, 'yyyyMM')       AS res2  -- epoch seconds -> text in the given pattern
    , to_unix_timestamp('2023-10-03 08:06:12')  AS res3  -- text -> epoch seconds; the input must be in the standard format

res1,res2,res3
2023-10-03 08:06:12,202310,1696320372
