# Functions, columns, rows

### 0 - Getting lab data

In [4]:
import pandas as pd 
# The lab files are tab-separated text; each one is read with pandas first
# and converted to a Spark DataFrame in the following cells.
pandas_df_transactions, pandas_df_magasins = (
    pd.read_csv(path, sep="\t")
    for path in ('transactions.txt', 'magasins.txt')
)

In [5]:
# Turn the pandas transactions frame into a Spark DataFrame, then inspect the
# inferred schema and the first Row.
df_transactions = spark.createDataFrame(data=pandas_df_transactions)
df_transactions.printSchema()
df_transactions.first()

root
 |-- transaction_id: long (nullable = true)
 |-- date: string (nullable = true)
 |-- magasin: long (nullable = true)
 |-- quantity: long (nullable = true)
 |-- unit_price: long (nullable = true)



Row(transaction_id=0, date=u'2016-06-01', magasin=39, quantity=23, unit_price=12)

In [6]:
# Same conversion for the magasins frame: build the Spark DataFrame, then
# inspect the inferred schema and the first Row.
df_magasins = spark.createDataFrame(data=pandas_df_magasins)
df_magasins.printSchema()
df_magasins.first()

root
 |-- magasin: long (nullable = true)
 |-- pays: string (nullable = true)
 |-- ville: string (nullable = true)



Row(magasin=0, pays=u'allemagne', ville=u'munich')

### a - functions

In [7]:
import pyspark.sql.functions as F 

In [9]:
# Short alias for the transactions DataFrame; the following cells work on df_trs.
df_trs=df_transactions
df_trs

DataFrame[transaction_id: bigint, date: string, magasin: bigint, quantity: bigint, unit_price: bigint]

#### a - 1 - aggregate functions

In [11]:
#### aggregate functions
## They can follow a groupBy (one result per group of values) or, as below,
## be applied to the whole dataset to generate a one-line DataFrame.

# CA is the per-transaction product quantity * unit_price.
ca_expr = F.col('quantity') * F.col('unit_price')

# One aliased aggregate per statistic computed over CA.
ca_aggregates = [
    F.count('*').alias('nb_transactions'),
    F.countDistinct('magasin').alias('nb_magasins'),
    F.min('CA').alias('min_CA'),
    F.max('CA').alias('max_CA'),
    F.sum('CA').alias('sum_CA'),
    F.mean('CA').alias('mean_CA'),
    F.stddev('CA').alias('stdv_CA'),
    F.variance('CA').alias('variance_CA'),
    F.skewness('CA').alias('skewness_CA'),
    F.kurtosis('CA').alias('kurtosis_CA'),
    F.first('CA').alias('first_CA'),
    F.last('CA').alias('last_CA'),
]

stats_df = (
    df_trs
    .withColumn('CA', ca_expr)
    .where('CA>0')          # keep only transactions with a positive CA
    .select(*ca_aggregates)
)

In [12]:
# Aggregating without a groupBy collapses the dataset to a single row.
stats_df.count()

1

In [13]:
# Display the single Row holding all aggregated statistics.
stats_df.first()

Row(nb_transactions=434, nb_magasins=50, min_CA=8, max_CA=1862, sum_CA=223268, mean_CA=514.442396313364, stdv_CA=437.77632793736973, variance_CA=191648.11330232746, skewness_CA=0.9628570179886349, kurtosis_CA=0.08442603809537541, first_CA=252, last_CA=224)

In [14]:
#### a - 2 - other helpful functions

In [15]:
# F.array packs several columns into one array column named "arr"; the
# printed schema shows which element type Spark chose for the mixed inputs.
df2 = df_trs.withColumn("arr", F.array(F.col('date'), F.col('magasin')))
df2.printSchema()

root
 |-- transaction_id: long (nullable = true)
 |-- date: string (nullable = true)
 |-- magasin: long (nullable = true)
 |-- quantity: long (nullable = true)
 |-- unit_price: long (nullable = true)
 |-- arr: array (nullable = false)
 |    |-- element: string (containsNull = true)



In [16]:
# Single-argument print(...) produces the same output on Python 2 and Python 3.
print(df2.first())

Row(transaction_id=0, date=u'2016-06-01', magasin=39, quantity=23, unit_price=12, arr=[u'2016-06-01', u'39'])


In [17]:
# Keep only the rows whose "arr" array contains the given date string.
has_date = F.array_contains(F.col('arr'), '2016-06-01')
df2.where(has_date).take(5)

[Row(transaction_id=0, date=u'2016-06-01', magasin=39, quantity=23, unit_price=12, arr=[u'2016-06-01', u'39']),
 Row(transaction_id=234, date=u'2016-06-01', magasin=12, quantity=61, unit_price=4, arr=[u'2016-06-01', u'12'])]

In [18]:
# collect_set gathers the distinct values of a column, collect_list keeps
# every occurrence (duplicates included).
(
    df2
    .where(F.array_contains(F.col('arr'), '2016-06-01'))
    .select(
        F.collect_set('date').alias('date_set'),
        F.collect_list('date').alias('date_list'),
    )
    .first()
)

Row(date_set=[u'2016-06-01'], date_list=[u'2016-06-01', u'2016-06-01'])

In [19]:
# coalesce: per row, the first non-null value among the given columns
cDf = spark.createDataFrame([(None, None), (1, None), (None, 2)], ["a", "b"])
first_non_null = F.coalesce(cDf.a, cDf.b)
cDf.select('*', first_non_null).show()

+----+----+--------------+
|   a|   b|coalesce(a, b)|
+----+----+--------------+
|null|null|          null|
|   1|null|             1|
|null|   2|             2|
+----+----+--------------+



In [20]:
# concat_ws: join several columns into one string, separated by '____'
df_trs.withColumn(
    'concat_col',
    F.concat_ws('____', 'date', 'magasin', 'quantity'),
).first()

Row(transaction_id=0, date=u'2016-06-01', magasin=39, quantity=23, unit_price=12, concat_col=u'2016-06-01____39____23')

In [21]:
# lit: constant column; rand / randn: uniform and standard-normal random columns.
# Explicit seeds keep the random columns reproducible between runs (for an
# unchanged partitioning of df_trs); unseeded, every run gives new values.
df_trs.select(
    '*',
    F.lit(0.5).alias('constant'),
    F.rand(seed=42).alias('rand_uniform'),
    F.randn(seed=43).alias('rand_normal'),
).take(2)

[Row(transaction_id=0, date=u'2016-06-01', magasin=39, quantity=23, unit_price=12, constant=0.5, rand_uniform=0.6888080587964535, rand_normal=-0.2179185506838038),
 Row(transaction_id=1, date=u'2016-06-02', magasin=2, quantity=9, unit_price=13, constant=0.5, rand_uniform=0.23323297345010197, rand_normal=-0.7093073445840071)]

In [22]:
# create_map (Spark 2): build a map column from a key column and a value column.
# The projection is built once and reused for both the schema and the first Row.
df_map = df_trs.select(F.create_map('date', 'magasin').alias("map"))
df_map.printSchema()
print(df_map.first())

root
 |-- map: map (nullable = false)
 |    |-- key: string
 |    |-- value: long (valueContainsNull = true)

Row(map={u'2016-06-01': 39})


In [23]:
# hashing: md5 returns the digest as a 32-character hex string
(
    spark.createDataFrame([('ABC',)], ['a'])
    .withColumn('hash', F.md5('a'))
    .collect()
)

[Row(a=u'ABC', hash=u'902fbdd2b1df0c4f70b4a5d23525e932')]

In [24]:
# substring(col, pos, len): pos is 1-based, so (6, 2) extracts the 'MM' part
# of a 'yyyy-MM-dd' string
df_trs.withColumn('month', F.substring('date', 6, 2)).first()

Row(transaction_id=0, date=u'2016-06-01', magasin=39, quantity=23, unit_price=12, month=u'06')

In [25]:
# date functions 

In [26]:
# Date helpers applied to the 'date' string column (formatted 'yyyy-MM-dd').
df_trs_times = (
    df_trs
    .select(
        'date',
        F.to_date('date').alias('date_as_dt'),
        # four-letter 'yyyy' is the conventional full-year pattern
        F.date_format('date', 'MM/dd/yyyy').alias('date_reformated'),
        F.unix_timestamp('date', 'yyyy-MM-dd').alias('tm_unix_date'),
        # hour / second / minute are only informative when the value carries a
        # time part, e.g. '2015-04-08 13:08:15'; for date-only strings they are 0
        F.hour('date').alias('hour'),
        F.second('date').alias('second'),
        F.minute('date').alias('minute'),
        F.dayofmonth('date').alias('dayofmonth'),
        F.dayofyear('date').alias('dayofyear'),
    )
    # second projection: convert the unix timestamp back to a formatted string
    .select(
        '*',
        F.from_unixtime('tm_unix_date', 'yyyy-MM-dd HH:mm:ss').alias('date_from_unix_tm'),
    )
)


In [27]:
# Check the type produced by each date helper (date, string, long, integer).
df_trs_times.printSchema()

root
 |-- date: string (nullable = true)
 |-- date_as_dt: date (nullable = true)
 |-- date_reformated: string (nullable = true)
 |-- tm_unix_date: long (nullable = true)
 |-- hour: integer (nullable = true)
 |-- second: integer (nullable = true)
 |-- minute: integer (nullable = true)
 |-- dayofmonth: integer (nullable = true)
 |-- dayofyear: integer (nullable = true)
 |-- date_from_unix_tm: string (nullable = true)



In [28]:
# Look at the derived date columns for the first transaction.
df_trs_times.first()

Row(date=u'2016-06-01', date_as_dt=datetime.date(2016, 6, 1), date_reformated=u'06/01/2016', tm_unix_date=1464739200, hour=0, second=0, minute=0, dayofmonth=1, dayofyear=153, date_from_unix_tm=u'2016-06-01 00:00:00')

In [29]:
# "Now"-based helpers next to the transaction date.
df_trs_times2 = (
    df_trs.select(
        'date',
        F.current_date().alias('dt_now'),            # today, as a date
        F.current_timestamp().alias('tm_now'),       # now, as a timestamp
        F.unix_timestamp().alias('tm_unix_now'),     # now, as seconds since the epoch
        F.unix_timestamp('date', 'yyyy-MM-dd').alias('tm_unix_date'),
        # days elapsed between the transaction date and today; current_date()
        # avoids formatting the timestamp to a string only to re-parse it as a date
        F.datediff(F.current_date(), 'date').alias('delta_days'),
    )
)


In [30]:
# Note the difference between a date held as a "string", as a "date" and as a "timestamp".
df_trs_times2.printSchema()

root
 |-- date: string (nullable = true)
 |-- dt_now: date (nullable = false)
 |-- tm_now: timestamp (nullable = false)
 |-- tm_unix_now: long (nullable = true)
 |-- tm_unix_date: long (nullable = true)
 |-- delta_days: integer (nullable = true)



In [31]:
# Look at the "now"-based columns for the first transaction.
df_trs_times2.first()

Row(date=u'2016-06-01', dt_now=datetime.date(2018, 1, 25), tm_now=datetime.datetime(2018, 1, 25, 22, 45, 50, 672000), tm_unix_now=1516920350, tm_unix_date=1464739200, delta_days=603)

In [32]:
# monotonically_increasing_id: add an 'id2' column to the same data laid out
# on one partition (df0) and on two partitions (df1), to compare the ids
df0 = df_trs.repartition(1).withColumn('id2', F.monotonically_increasing_id())
df1 = df_trs.repartition(2).withColumn('id2', F.monotonically_increasing_id())


In [33]:
# Number of partitions behind df0 (built with repartition(1)).
df0.rdd.getNumPartitions()

1

In [34]:
# Number of partitions behind df1 (built with repartition(2)).
df1.rdd.getNumPartitions()

2

In [35]:
# Total number of transactions, for comparison with the generated id2 values.
df_trs.count()

468

In [36]:
# Range of the generated ids when the data sits on a single partition.
print(df0.select(F.max('id2'), F.min('id2'), F.avg('id2')).collect())


[Row(max(id2)=467, min(id2)=0, avg(id2)=233.5)]


In [37]:
# Range of the generated ids when the data is spread over two partitions.
print(df1.select(F.max('id2'), F.min('id2'), F.avg('id2')).collect())


[Row(max(id2)=8589934825, min(id2)=0, avg(id2)=4294967412.5)]


In [38]:
# explode (setup): first gather, for each magasin, the set of its distinct
# transaction dates into one array column
dates_per_magasin = F.collect_set(F.col('date')).alias('dates')
df_trs_g = df_trs.groupBy('magasin').agg(dates_per_magasin)

In [39]:
# One row per magasin, each carrying its array of dates.
print(df_trs_g.count())
print(df_trs_g.first())

50
Row(magasin=29, dates=[u'2016-09-17', u'2016-11-12', u'2016-07-01', u'2016-07-12', u'2017-01-01', u'2016-06-12', u'2017-01-15', u'2017-01-06', u'2016-11-28', u'2016-07-05', u'2016-08-16', u'2016-09-05', u'2016-11-27', u'2016-08-30', u'2017-01-04'])


In [40]:
# explode emits one output row per element of the 'dates' array.
# The exploded DataFrame is built once and reused for both prints.
df_trs_exploded = df_trs_g.select('magasin', F.explode('dates').alias('date'))
print(df_trs_exploded.count())
print(df_trs_exploded.take(3))

460
[Row(magasin=29, date=u'2016-09-17'), Row(magasin=29, date=u'2016-11-12'), Row(magasin=29, date=u'2016-07-01')]


In [41]:
# cos, sin, cosh, sinh, tan, tanh
# length(string_col), size(array_col)

### b - rows

In [94]:
from pyspark.sql import Row
# Build a Row from keyword arguments and display it.
r=Row(name="Alice", age=1)
r

Row(age=1, name='Alice')

In [95]:
# A list of Rows is enough to create a DataFrame; the schema is inferred from the Row fields.
d =[r]
df=spark.createDataFrame(d)
df.printSchema()


root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [96]:
# Rebind r to the first Row of the transactions DataFrame.
r=df_trs.first()
r

Row(transaction_id=0, date=u'2016-06-01', magasin=39, quantity=23, unit_price=12)

In [92]:
# Row helpers: asDict() maps field name -> value; index() and count() operate
# on the Row's values (position of the value 39, occurrences of the value 12).
print(r.asDict())
print(r.index(39))
print(r.count(12))

{'date': u'2016-06-01', 'quantity': 23, 'unit_price': 12, 'transaction_id': 0, 'magasin': 39}
2
1


In [100]:
import numpy as np

25.0

In [106]:
# Drop down to the RDD API: turn each Row into a dict, then pair it with
# unit_price cubed. The builtin float replaces np.float, which was only an
# alias for it and has been removed from recent NumPy releases.
(
    df_trs.filter('date="2016-06-01"')
    .rdd
    .map(lambda x: x.asDict())
    .map(lambda x: (x, float(x['unit_price']) ** 3))
    .first()
)

({'date': u'2016-06-01',
  'magasin': 39,
  'quantity': 23,
  'transaction_id': 0,
  'unit_price': 12},
 1728.0)

In [107]:
# Same computation in a single map, reading the field as a Row attribute.
# The builtin float replaces np.float, which was only an alias for it and has
# been removed from recent NumPy releases.
(
    df_trs.filter('date="2016-06-01"')
    .rdd
    .map(lambda x: (x.asDict(), float(x.unit_price) ** 3))
    .first()
)

({'date': u'2016-06-01',
  'magasin': 39,
  'quantity': 23,
  'transaction_id': 0,
  'unit_price': 12},
 1728.0)

### c - columns

In [42]:
# Two ways to reference a column: by name with F.col, or through the DataFrame.
print(F.col('quantity'))
print(df_trs['quantity'])

Column<quantity>
Column<quantity>


In [43]:
# F.col builds the column expression without reference to any DataFrame, so a
# name that matches no column is accepted at this point.
F.col('non_existing_column')

Column<non_existing_column>

In [44]:
######  this raises error 
#df_trs['non_existing_column']

In [45]:
# Arithmetic between Columns yields a new Column expression; alias names the result.
(F.col('quantity')*F.col('unit_price')).alias('CA')

Column<(quantity * unit_price) AS `CA`>

In [47]:

# CA expression cast to int; reused by the following cells.
col = (F.col('quantity') * F.col('unit_price')).cast('int').alias('CA')
# Prints the expression followed by ' \n', exactly as `print col, '\n'` did on
# Python 2, in a form that also runs on Python 3.
print('{0} \n'.format(col))
print(df_trs.select('*', col).first())

Column<CAST((quantity * unit_price) AS INT) AS `CA`> 

Row(transaction_id=0, date=u'2016-06-01', magasin=39, quantity=23, unit_price=12, CA=276)


In [68]:
# isin: boolean expression that is true when CA equals one of the listed values
col_boolean_1 = col.isin([1, 2, 3, 4, 5, 6, 7, 8, 9]).alias('boolean_ca_filter_0_10')
print(col_boolean_1)

Column<(CAST((quantity * unit_price) AS INT) AS `CA` IN (1, 2, 3, 4, 5, 6, 7, 8, 9)) AS `boolean_ca_filter_0_10`>


In [69]:

# The boolean expression can be used as a filter whether or not CA has been
# selected as a column of the DataFrame.
print(df_trs.select('*', col).filter(col_boolean_1).first())
print(df_trs.filter(col_boolean_1).first())

Row(transaction_id=4, date=u'2016-06-05', magasin=37, quantity=4, unit_price=2, CA=8)
Row(transaction_id=4, date=u'2016-06-05', magasin=37, quantity=4, unit_price=2)


In [70]:
# between: boolean expression for 10 <= CA <= 20 (both bounds included)
col_boolean_2 = col.between(10, 20).alias('boolean_ca_filter_10_20')
print(col_boolean_2)

Column<((CAST((quantity * unit_price) AS INT) AS `CA` >= 10) AND (CAST((quantity * unit_price) AS INT) AS `CA` <= 20)) AS `boolean_ca_filter_10_20`>


In [71]:

# Same filter applied with and without CA selected as a column.
print(df_trs.select('*', col).filter(col_boolean_2).first())
print(df_trs.filter(col_boolean_2).first())

Row(transaction_id=98, date=u'2016-09-07', magasin=49, quantity=6, unit_price=3, CA=18)
Row(transaction_id=98, date=u'2016-09-07', magasin=49, quantity=6, unit_price=3)


In [72]:
# desc: turn the CA expression into a descending sort expression
col_sorting_expression = col.desc()

# Prints the expression followed by ' \n', exactly as the Python 2 print
# statement did, in a form that also runs on Python 3.
print('{0} \n'.format(col_sorting_expression))

Column<CAST((quantity * unit_price) AS INT) AS `CA` DESC NULLS LAST> 



In [73]:

# Ordering by CA descending puts the transaction with the largest CA first.
print(df_trs.select('*', col).orderBy(col_sorting_expression).first())

Row(transaction_id=409, date=u'2016-11-23', magasin=48, quantity=98, unit_price=19, CA=1862)


In [74]:
# Dict form of agg: maps a column name to the name of the aggregate to apply.
df_trs.select('*',col).agg({'CA':'max'}).first()

Row(max(CA)=1862)

In [75]:
####other column functions
# alias , asc , astype , cast, between , bitwiseAND, bitwiseOR , desc , getField , endswith  , isNotNull , isNull , isin
# like, startswith , substr