In [1]:
# findspark is initialised before any pyspark import in this notebook.
# NOTE(review): per findspark's documentation, init() locates the Spark
# installation and adds pyspark to sys.path — confirm SPARK_HOME is set
# (or discoverable) in the environment where this notebook runs.
import findspark
findspark.init()

In [2]:
from pyspark import SparkConf,SparkContext
from pyspark.sql import SparkSession

In [3]:
# Obtain the Spark entry points idempotently.
# A bare `SparkContext()` raises "Cannot run multiple SparkContexts at once"
# when this cell is re-run in a live kernel (or when a context already
# exists), so use the getOrCreate variants: they return the active
# context/session if there is one and create it otherwise.
sc = SparkContext.getOrCreate()
# The builder attaches to the SparkContext obtained above instead of
# starting a second one.
spark = SparkSession.builder.getOrCreate()

## Map Function
The most frequently used map-style transformations are:
* map()
* mapValues()
* flatMap()
* flatMapValues()

In [5]:
# Create an RDD of raw text lines from the mtcars CSV file.
# Each element is one unparsed line of the file; the header line is
# included as the first element (see the take(5) preview).
# NOTE(review): the path is relative, so it presumably resolves against the
# notebook's working directory — confirm before running elsewhere.
map_exp_rdd = sc.textFile('../../data/mtcars.csv')
map_exp_rdd.take(5)

[',mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb',
 'Mazda RX4,21,6,160,110,3.9,2.62,16.46,0,1,4,4',
 'Mazda RX4 Wag,21,6,160,110,3.9,2.875,17.02,0,1,4,4',
 'Datsun 710,22.8,4,108,93,3.85,2.32,18.61,1,1,4,1',
 'Hornet 4 Drive,21.4,6,258,110,3.08,3.215,19.44,1,0,3,1']

In [7]:
# Turn every CSV line into a (model name, remaining fields) pair:
# first split the line on commas, then peel off the leading field as the key
# and keep the rest as a list of (still string-typed) feature values.
map_exp_rdd_1 = (
    map_exp_rdd
    .map(lambda line: line.split(","))
    .map(lambda fields: (fields[0], fields[1:]))
)
map_exp_rdd_1.take(5)

[('',
  ['mpg',
   'cyl',
   'disp',
   'hp',
   'drat',
   'wt',
   'qsec',
   'vs',
   'am',
   'gear',
   'carb']),
 ('Mazda RX4',
  ['21', '6', '160', '110', '3.9', '2.62', '16.46', '0', '1', '4', '4']),
 ('Mazda RX4 Wag',
  ['21', '6', '160', '110', '3.9', '2.875', '17.02', '0', '1', '4', '4']),
 ('Datsun 710',
  ['22.8', '4', '108', '93', '3.85', '2.32', '18.61', '1', '1', '4', '1']),
 ('Hornet 4 Drive',
  ['21.4', '6', '258', '110', '3.08', '3.215', '19.44', '1', '0', '3', '1'])]

In [9]:
# Drop the header record.
# The first record of the RDD is the parsed header row, so capture it and
# keep only the records that are not equal to it.
header = map_exp_rdd_1.first()

map_exp_rdd_2 = map_exp_rdd_1.filter(lambda record: not (record == header))
map_exp_rdd_2.take(5)

[('Mazda RX4',
  ['21', '6', '160', '110', '3.9', '2.62', '16.46', '0', '1', '4', '4']),
 ('Mazda RX4 Wag',
  ['21', '6', '160', '110', '3.9', '2.875', '17.02', '0', '1', '4', '4']),
 ('Datsun 710',
  ['22.8', '4', '108', '93', '3.85', '2.32', '18.61', '1', '1', '4', '1']),
 ('Hornet 4 Drive',
  ['21.4', '6', '258', '110', '3.08', '3.215', '19.44', '1', '0', '3', '1']),
 ('Hornet Sportabout',
  ['18.7', '8', '360', '175', '3.15', '3.44', '17.02', '0', '0', '3', '2'])]

In [12]:
# Cast every feature value from string to float, leaving the model-name key
# untouched.
map_exp_rdd_3 = map_exp_rdd_2.map(
    lambda record: (record[0], [float(value) for value in record[1]])
)
map_exp_rdd_3.take(5)

[('Mazda RX4',
  [21.0, 6.0, 160.0, 110.0, 3.9, 2.62, 16.46, 0.0, 1.0, 4.0, 4.0]),
 ('Mazda RX4 Wag',
  [21.0, 6.0, 160.0, 110.0, 3.9, 2.875, 17.02, 0.0, 1.0, 4.0, 4.0]),
 ('Datsun 710',
  [22.8, 4.0, 108.0, 93.0, 3.85, 2.32, 18.61, 1.0, 1.0, 4.0, 1.0]),
 ('Hornet 4 Drive',
  [21.4, 6.0, 258.0, 110.0, 3.08, 3.215, 19.44, 1.0, 0.0, 3.0, 1.0]),
 ('Hornet Sportabout',
  [18.7, 8.0, 360.0, 175.0, 3.15, 3.44, 17.02, 0.0, 0.0, 3.0, 2.0])]

## mapValues
`mapValues()` requires the RDD to have a key/value pair structure. It applies a function to each value and leaves the keys unchanged.

In [13]:
# Reuse the (model name, list of float features) pairs from map_exp_rdd_3 as
# the key/value input for the mapValues example. This is an alias, not a copy.
mapValues_exp_rdd = map_exp_rdd_3
mapValues_exp_rdd.take(5)

[('Mazda RX4',
  [21.0, 6.0, 160.0, 110.0, 3.9, 2.62, 16.46, 0.0, 1.0, 4.0, 4.0]),
 ('Mazda RX4 Wag',
  [21.0, 6.0, 160.0, 110.0, 3.9, 2.875, 17.02, 0.0, 1.0, 4.0, 4.0]),
 ('Datsun 710',
  [22.8, 4.0, 108.0, 93.0, 3.85, 2.32, 18.61, 1.0, 1.0, 4.0, 1.0]),
 ('Hornet 4 Drive',
  [21.4, 6.0, 258.0, 110.0, 3.08, 3.215, 19.44, 1.0, 0.0, 3.0, 1.0]),
 ('Hornet Sportabout',
  [18.7, 8.0, 360.0, 175.0, 3.15, 3.44, 17.02, 0.0, 0.0, 3.0, 2.0])]

In [14]:
import numpy as np

# Replace each car's feature list with the arithmetic mean of that list;
# mapValues only touches the value, so the model-name key is preserved.
mapValues_exp_rdd_1 = mapValues_exp_rdd.mapValues(
    lambda features: np.mean(features)
)
mapValues_exp_rdd_1.take(5)

[('Mazda RX4', 29.90727272727273),
 ('Mazda RX4 Wag', 29.98136363636364),
 ('Datsun 710', 23.59818181818182),
 ('Hornet 4 Drive', 38.73954545454546),
 ('Hornet Sportabout', 53.66454545454546)]

## flatMap
`flatMap()` first applies a function to each element of an RDD and then flattens the results into a single RDD.

In [16]:
# Sample input for flatMap: a list of tuples of differing lengths.
x = [('a','b','c'),('a','a'),('c','c','c','d')]

# Distribute the list as an RDD; each tuple becomes one element.
flatMap_exp_rdd = sc.parallelize(x)
flatMap_exp_rdd.collect()

[('a', 'b', 'c'), ('a', 'a'), ('c', 'c', 'c', 'd')]

In [17]:
# Flatten with the identity function: every tuple is expanded so that its
# items become individual elements of the resulting RDD.
flatMap_exp_rdd_1 = flatMap_exp_rdd.flatMap(lambda group: group)
flatMap_exp_rdd_1.collect()

['a', 'b', 'c', 'a', 'a', 'c', 'c', 'c', 'd']

## flatMapValues
`flatMapValues()` requires the RDD to have a key/value pair structure. It applies a function to the value of each element and flattens the results, pairing every produced item with the original key.

In [18]:
# Sample input for flatMapValues: each record is a two-item list whose first
# item acts as the key and whose second item is a 3-tuple of values.
my_data = [
    [1,(23,28,32)],
    [2,(18,29,31)],
    [3,(34,21,18)]
]

# Distribute the records as an RDD; collect() shows them unchanged.
flatMapValues_exp_rdd = sc.parallelize(my_data)
flatMapValues_exp_rdd.collect()

[[1, (23, 28, 32)], [2, (18, 29, 31)], [3, (34, 21, 18)]]

In [19]:
# Merge the A, B, and C value columns into a single value column, and add a column recording which type (A, B, or C) each value came from

In [21]:
# Label the three values of each record with "A", "B" and "C" and flatten,
# so every key yields one (key, (label, value)) element per value.
flatMapValues_exp_rdd_1 = flatMapValues_exp_rdd.flatMapValues(
    lambda values: list(zip("ABC", values))
)
flatMapValues_exp_rdd_1.collect()

[(1, ('A', 23)),
 (1, ('B', 28)),
 (1, ('C', 32)),
 (2, ('A', 18)),
 (2, ('B', 29)),
 (2, ('C', 31)),
 (3, ('A', 34)),
 (3, ('B', 21)),
 (3, ('C', 18))]

In [22]:
# Flatten each (key, (label, value)) element into a single
# [key, label, value] list.
flatMapValues_exp_rdd_2 = flatMapValues_exp_rdd_1.map(
    lambda pair: [pair[0], *pair[1]]
)
flatMapValues_exp_rdd_2.collect()

[[1, 'A', 23],
 [1, 'B', 28],
 [1, 'C', 32],
 [2, 'A', 18],
 [2, 'B', 29],
 [2, 'C', 31],
 [3, 'A', 34],
 [3, 'B', 21],
 [3, 'C', 18]]