In [1]:
import findspark
# Initialise findspark first; the pyspark imports follow in the next cell.
findspark.init()

In [2]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
# Use getOrCreate() so that re-running this cell reuses the live context and
# session instead of failing because a SparkContext already exists.
sc = SparkContext.getOrCreate()
spark = SparkSession.builder.getOrCreate()

## Aggregate functions:
* aggregate()
* aggregateByKey()

### aggregate(zeroValue, seqOp, combOp)
* zeroValue is the initial value of the accumulator. Its structure must match the structure of the value returned by the seqOp function.
* seqOp is a function that takes two arguments: the first is the accumulator (initially the zeroValue) and the second is an element from the RDD. The accumulator is replaced by the returned value after every call.
* combOp is a function that takes two arguments: the first is the final accumulator from one partition, and the second is the final accumulator from another partition. It merges them into one.

In [4]:
# Load only the two columns used below. Schema inference is not enabled,
# so both columns are read as strings and must be cast before arithmetic.
mtcars_df = (
    spark.read
    .csv("../../data/mtcars.csv", header=True)
    .select(['mpg', 'disp'])
)
mtcars_df.take(5)

[Row(mpg='21', disp='160'),
 Row(mpg='21', disp='160'),
 Row(mpg='22.8', disp='108'),
 Row(mpg='21.4', disp='258'),
 Row(mpg='18.7', disp='360')]

In [5]:
## Calculate averages of mpg and disp

In [9]:
# The columns hold strings, so cast every value to float before averaging.
mpg_mean, disp_mean = (
    mtcars_df.select(column).rdd.map(lambda row: float(row[0])).mean()
    for column in ('mpg', 'disp')
)
print(mpg_mean, disp_mean, sep='\n')

20.090625000000003
230.721875


In [10]:
## Build zeroValue, seqOp and combOp

In [11]:
# Initial accumulator: one running total per column (mpg, disp).
zeroValue = (0, 0)

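In seqOp below, z is the running accumulator: it starts as zeroValue and is replaced by seqOp's return value after each element. x is one element (a row) of an RDD partition. Here seqOp accumulates the squared deviations of mpg and disp from their means.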

In [12]:
def seqOp(z, x):
    """Fold one row into the running sums of squared deviations.

    z -- accumulator tuple (mpg total, disp total); starts as zeroValue.
    x -- one row holding the mpg and disp values at positions 0 and 1.

    Returns a new accumulator tuple with the squared deviation of each
    value from its column mean (mpg_mean, disp_mean) added in.
    """
    # The CSV columns are read as strings, so cast before subtracting;
    # without the cast the subtraction raises TypeError.
    mpg_dev = float(x[0]) - mpg_mean
    disp_dev = float(x[1]) - disp_mean
    return (z[0] + mpg_dev ** 2, z[1] + disp_dev ** 2)

combOp simply adds the accumulators from all partitions, element by element, into a single result.

In [13]:
def combOp(px, py):
    """Merge two partition accumulators by adding them position by position.

    px, py -- accumulator tuples of the form (mpg total, disp total).
    """
    mpg_total = px[0] + py[0]
    disp_total = px[1] + py[1]
    return (mpg_total, disp_total)

In [None]:
# Run the aggregation: seqOp folds the rows of each partition into an
# accumulator, then combOp merges the per-partition accumulators.
mtcars_df.rdd.aggregate(zeroValue=zeroValue, seqOp=seqOp, combOp=combOp)

## aggregateByKey(zeroValue,seqOp,combOp)

In [16]:
# Read the raw CSV as an RDD of text lines; the header line is still included.
iris_rdd = sc.textFile(name='../../data/iris.csv', use_unicode=True)
iris_rdd.take(num=2)

['sepal_length,sepal_width,petal_length,petal_width,species',
 '5.1,3.5,1.4,0.2,setosa']

In [17]:
# Transform data to a tuple RDD

In [18]:
# Split each CSV line, drop the header row, and key every record by species
# (the last field) with the remaining fields converted to floats.
iris_rdd_2 = (
    iris_rdd
    .map(lambda line: line.split(','))
    .filter(lambda fields: fields[0] != 'sepal_length')
    .map(lambda fields: (fields[-1], [float(value) for value in fields[:-1]]))
)

In [19]:
# Preview the first few (species, measurements) pairs.
iris_rdd_2.take(num=5)

[('setosa', [5.1, 3.5, 1.4, 0.2]),
 ('setosa', [4.9, 3.0, 1.4, 0.2]),
 ('setosa', [4.7, 3.2, 1.3, 0.2]),
 ('setosa', [4.6, 3.1, 1.5, 0.2]),
 ('setosa', [5.0, 3.6, 1.4, 0.2])]

In [20]:
# Define initial values, seqOp and combOp

In [21]:
# NOTE: this cell rebinds seqOp and combOp, replacing the versions defined
# earlier for the mtcars example.

# Initial accumulator: one running total for each of the first two measurements.
zero_Value = (0, 0)


def seqOp(x, y):
    """Add the squares of the first two values of y to the accumulator x."""
    first_total = x[0] + (y[0]) ** 2
    second_total = x[1] + (y[1]) ** 2
    return (first_total, second_total)


def combOp(x, y):
    """Merge two accumulators by adding them position by position."""
    return (x[0] + y[0], x[1] + y[1])

In [22]:
# Per species: sums of the squared first and second measurements.
(
    iris_rdd_2
    .aggregateByKey(zero_Value, seqOp, combOp)
    .collect()
)

[('setosa', (1259.0899999999997, 591.2500000000002)),
 ('versicolor', (1774.8600000000001, 388.47)),
 ('virginica', (2189.9000000000005, 447.33))]