### Pair RDD
* mapValues()
* countByKey()
* groupByKey()
* reduceByKey()
* aggregateByKey()


In [1]:
# Load the cars CSV as an RDD of raw text lines (one string per line,
# header line included).
carros = sc.textFile("data/carros.csv")

In [2]:
# Preview the first three lines: the CSV header plus two records.
carros.take(3)

['MAKE,FUELTYPE,ASPIRE,DOORS,BODY,DRIVE,CYLINDERS,HP,RPM,MPG-CITY,MPG-HWY,PRICE',
 'subaru,gas,std,two,hatchback,fwd,four,69,4900,31,36,5118',
 'chevrolet,gas,std,two,hatchback,fwd,three,48,5100,47,53,5151']

In [3]:
# Build the key-value (pair) RDD: key = MAKE (column 0), value = HP (column 7).
# Each line is split only once; the second stage picks the two columns from
# the already-split field list instead of re-splitting the same string.
# The header line is still present at this point, and HP is still a string.
carrosPairRDD = (
    carros
    .map(lambda line: line.split(","))
    .map(lambda fields: (fields[0], fields[7]))
)
carrosPairRDD.take(3)

[('MAKE', 'HP'), ('subaru', '69'), ('chevrolet', '48')]

In [4]:
# Remove the header from the pair RDD: the first element is the
# ('MAKE', 'HP') tuple produced from the CSV header line, so every pair
# equal to it is filtered out.
header = carrosPairRDD.first()
carrosPairRDD2 = carrosPairRDD.filter(
    lambda pair: pair != header
)
carrosPairRDD2.take(2)

[('subaru', '69'), ('chevrolet', '48')]

In [5]:
# Attach a counter to every value. mapValues() transforms only the value of
# each pair and leaves the key untouched: 'HP' string -> (HP as int, 1).
carrosPairRDD3 = carrosPairRDD2.mapValues(
    lambda hp: (int(hp), 1)
)
carrosPairRDD3.take(3)

[('subaru', (69, 1)), ('chevrolet', (48, 1)), ('mazda', (68, 1))]

In [6]:
# Combine the (HP, count) tuples of each make element-wise:
#   result[0] = total HP       = left[0] + right[0]
#   result[1] = number of cars = left[1] + right[1]

fabricantes = carrosPairRDD3.reduceByKey(
    lambda left, right: (left[0] + right[0], left[1] + right[1])
)
fabricantes.collect()

[('chevrolet', (188, 3)),
 ('mazda', (1390, 16)),
 ('mitsubishi', (1353, 13)),
 ('nissan', (1846, 18)),
 ('dodge', (675, 8)),
 ('plymouth', (607, 7)),
 ('saab', (760, 6)),
 ('volvo', (1408, 11)),
 ('alfa-romero', (376, 3)),
 ('mercedes-benz', (1170, 8)),
 ('jaguar', (614, 3)),
 ('subaru', (1035, 12)),
 ('toyota', (2969, 32)),
 ('honda', (1043, 13)),
 ('isuzu', (168, 2)),
 ('volkswagen', (973, 12)),
 ('peugot', (1098, 11)),
 ('audi', (687, 6)),
 ('bmw', (1111, 8)),
 ('mercury', (175, 1)),
 ('porsche', (764, 4))]

In [7]:
# Average HP per make: total HP divided by the number of cars,
# rounded to an integer.
(
    fabricantes
    .mapValues(lambda totals: round(totals[0] / totals[1]))
    .collect()
)

[('chevrolet', 63),
 ('mazda', 87),
 ('mitsubishi', 104),
 ('nissan', 103),
 ('dodge', 84),
 ('plymouth', 87),
 ('saab', 127),
 ('volvo', 128),
 ('alfa-romero', 125),
 ('mercedes-benz', 146),
 ('jaguar', 205),
 ('subaru', 86),
 ('toyota', 93),
 ('honda', 80),
 ('isuzu', 84),
 ('volkswagen', 81),
 ('peugot', 100),
 ('audi', 114),
 ('bmw', 139),
 ('mercury', 175),
 ('porsche', 191)]

In [8]:
# Persist the per-make (total HP, car count) pairs using Spark's pickle
# serialization.
# NOTE(review): Spark is expected to create a directory of part files at this
# path (not a single .pkl file) and to fail if the path already exists --
# confirm before re-running this cell.
fabricantes.saveAsPickleFile("data/fabricantes.pkl")