## CARLOS SÁNCHEZ VEGA

# RDD EXERCISES

We import libraries:

In [2]:
from pyspark import SparkConf, SparkContext
import collections

We create the spark configuration:

In [21]:
conf = SparkConf().setMaster("local").setAppName("RDDsExercises")
sc = SparkContext(conf = conf)

In [23]:
lines = sc.textFile("u.data")

In [24]:
for i in lines.take(10):print(i)

196	242	3	881250949
186	302	3	891717742
22	377	1	878887116
244	51	2	880606923
166	346	1	886397596
298	474	4	884182806
115	265	2	881171488
253	465	5	891628467
305	451	3	886324817
6	86	3	883603013


1. <strong>Question: taking into account the previous RDD, get the number of times a mark (3 column) is given</strong>

In [27]:
ratings = lines.map(lambda l: int(l.split()[2]))

The are asking us to get the number of occurrences of a mark in the RDD, so we could use, for example:

In [28]:
results = ratings.countByValue()

In [32]:
print(results.values())

dict_values([27145, 6110, 11370, 34174, 21201])


Result is provided as a dict. We format the output:

In [44]:
for key, value in sortedResults.items():
    print("%s %i" % (key, value))

1 6110
2 11370
3 27145
4 34174
5 21201


In [45]:
sc.stop()

2. <strong>Get the average number of friends broken down by age</strong>

In [46]:
conf = SparkConf().setMaster("local").setAppName("RDD2")
sc = SparkContext(conf = conf)

we load data into an RDD:

In [47]:
lines = sc.textFile("fakefriends.csv")


In [48]:
for i in lines.take(10):print(i)

0,Will,33,385
1,Jean-Luc,26,2
2,Hugh,55,221
3,Deanna,40,465
4,Quark,68,21
5,Weyoun,59,318
6,Gowron,37,220
7,Will,54,307
8,Jadzia,38,380
9,Hugh,27,181


<ul>
<li>First column: identifier</li>
<li>Second column: name</li>
<li>Third column: age</li>
<li>Fourth column: number of friends</li>
</ul> 

We create a function to parse data:

In [49]:
def parseLine(line):
    fields = line.split(',')
    age = int(fields[2])
    numFriends = int(fields[3])
    return (age, numFriends)

In [50]:
rdd = lines.map(parseLine)

We create a list of (age#friends) in our rdd to solve the problem:
We are going to split up the solution in two parts:
1) for every value, we will create a counter:
Say we have (33, 385) =>  For each pair, we will take (numberOfFriends, 1) being "1" somehow a counter . So, we will create the next structure:
(33, 385) =>  (33, (385,1))
So, the first part is:
rdd.mapValues(lambda x(x,1)

2) For the rdd, we will create a data structure (totalFriendsForThatAge, numberOfPeopleOfThatAge). Example:
(33,385) => (33, (385,1))

For every tuple:
(33,385) => (33, (385,1))
(33, (385,1)) + (33,2) => (385 +2, 1+1)   => (33, (387,2))
so we have to do (x[0] + y[0], x[1]+y[1]))

As we are grouping by this defined function, we must use the reduceByKey function

In [51]:
totalsByAge = rdd.mapValues(lambda x: (x,1)).reduceByKey(lambda x,y: (x[0] + y[0], x[1] + x[1]))

In [52]:
for i in totalsByAge.take(10):print(i)

(33, (3904, 2048))
(26, (4115, 65536))
(55, (3842, 4096))
(40, (4264, 65536))
(68, (2696, 512))
(59, (1980, 256))
(37, (2244, 256))
(54, (3615, 4096))
(38, (2903, 16384))
(27, (1825, 128))


Having this tuple:
    (33, (3904, 2048))
to calculate the average we must divide the number of total friends (3904) / total number of persons of that age (2048)
    

In [54]:
averagesByAge = totalsByAge.mapValues(lambda x: x[0] / x[1])
results = averagesByAge.collect()
for result in results:
    print(result)

(33, 1.90625)
(26, 0.0627899169921875)
(55, 0.93798828125)
(40, 0.0650634765625)
(68, 5.265625)
(59, 7.734375)
(37, 8.765625)
(54, 0.882568359375)
(38, 0.17718505859375)
(27, 14.2578125)
(53, 24.375)
(57, 1.5166015625)
(56, 57.5)
(43, 25.21875)
(36, 4.81640625)
(22, 22.578125)
(35, 13.2265625)
(45, 0.982421875)
(60, 22.171875)
(67, 0.10479736328125)
(19, 2.291015625)
(30, 2.533203125)
(51, 33.046875)
(25, 2.12109375)
(21, 21.9296875)
(42, 56.90625)
(49, 34.625)
(48, 5.49609375)
(50, 79.5625)
(39, 18.515625)
(32, 2.2333984375)
(58, 1.251953125)
(64, 1.6484375)
(31, 16.703125)
(52, 3.6591796875)
(24, 73.0625)
(20, 51.5625)
(62, 0.70068359375)
(41, 9.44140625)
(44, 1.6533203125)
(69, 4.59375)
(65, 93.1875)
(61, 9.0078125)
(28, 4.083984375)
(66, 9.71875)
(46, 0.7099609375)
(29, 1.26513671875)
(18, 21.4609375)
(47, 8.19921875)
(34, 46.03125)
(63, 192.0)
(23, 4.810546875)


Meaning that:
    (33, 1.90625)
For the age of 33, the average number of friends is 1.90625

In [6]:
sc.stop()

3. <strong>Get the min temperature for each station</strong>

In [7]:
conf = SparkConf().setMaster("local").setAppName("MinTemperatures")
sc = SparkContext(conf = conf)

In [8]:
lines = sc.textFile("1800.csv")

In [9]:
for i in lines.take(10):print(i)

ITE00100554,18000101,TMAX,-75,,,E,
ITE00100554,18000101,TMIN,-148,,,E,
GM000010962,18000101,PRCP,0,,,E,
EZE00100082,18000101,TMAX,-86,,,E,
EZE00100082,18000101,TMIN,-135,,,E,
ITE00100554,18000102,TMAX,-60,,I,E,
ITE00100554,18000102,TMIN,-125,,,E,
GM000010962,18000102,PRCP,0,,,E,
EZE00100082,18000102,TMAX,-44,,,E,
EZE00100082,18000102,TMIN,-130,,,E,


<ul>
<li>First column: station</li>
<li>Second column: code of the station</li>
<li>Third column: entry type (TMIN, TMAX...)</li>
<li>Fourth column: temperature</li>
</ul> 

With the purpose of formatting the data entry, we create the next function:

In [10]:
def parseLine(line):
    fields = line.split(',')
    stationID = fields[0]
    entryType = fields[2]
    temperature = float(fields[3]) * 0.1 * (9.0 / 5.0) + 32.0
    return (stationID, entryType, temperature)

In [11]:
parsedLines = lines.map(parseLine)

As we have to get the min temperature, we filter entry type as "TMIN"

In [12]:
minTemps = parsedLines.filter(lambda x: "TMIN" in x[1])

Now, we will use the reduceByKey as the aggregate function to calculate the min:

In [66]:
stationTemps = minTemps.map(lambda x: (x[0], x[2]))

In [69]:
minTemps = stationTemps.reduceByKey(lambda x, y: max(x,y))
results = minTemps.collect()

for result in results:
    print(result[0] + "\t{:.2f}F".format(result[1]))

ITE00100554	90.14F
EZE00100082	90.14F


In [70]:
sc.stop()

4. <strong>Get the max temperature for each station</strong>

In [4]:
conf = SparkConf().setMaster("local").setAppName("MaxTemperatures")
sc = SparkContext(conf = conf)

In [13]:
maxTemps = parsedLines.filter(lambda x: "TMAX" in x[1])

In [14]:
stationMaxTemps = maxTemps.map(lambda m: (m[0],m[2]))

In [15]:
maxTemps = stationMaxTemps.reduceByKey(lambda x,y: max(x,y))

In [18]:
for i in maxTemps.take(10):print(i)

('ITE00100554', 90.14000000000001)
('EZE00100082', 90.14000000000001)


We format the result

In [21]:
results = maxTemps.collect()
for temp in results:
    print(temp[0] + "\t{:.2f}F".format(temp[1]))

ITE00100554	90.14F
EZE00100082	90.14F


In [22]:
sc.stop()

5. <strong>Get the word count in a book</strong>

In [4]:
conf = SparkConf().setMaster("local").setAppName("wordCount")
sc = SparkContext(conf = conf)

In [5]:
lines = sc.textFile("Book")

In [6]:
wordCount = lines.flatMap(lambda w: w.split()).map(lambda w: (w,1)).reduceByKey(lambda x,y:x+y)

In [7]:
for i in wordCount.take(10):print(i)

('Self-Employment:', 1)
('Building', 5)
('an', 172)
('Internet', 13)
('Business', 19)
('of', 941)
('One', 12)
('Achieving', 1)
('Financial', 3)
('and', 901)


In [15]:
for word, count in wordCount.collect()[:30]:
    cleanWord = word.encode('ascii', 'ignore')
    if (cleanWord):
        print(cleanWord.decode() + " " + str(count))

Self-Employment: 1
Building 5
an 172
Internet 13
Business 19
of 941
One 12
Achieving 1
Financial 3
and 901
Personal 3
Freedom 7
through 55
a 1148
Lifestyle 5
Technology 2
By 9
Frank 10
Kane 7
Copyright 1
2015 3
Kane. 1
All 13
rights 3
reserved 2
worldwide. 2
CONTENTS 1
Disclaimer 1
Preface 1


We format the putput:

We could also choose the next option to solve the problem:

In [22]:
words = lines.flatMap(lambda w: w.split()).countByValue()

In [23]:
for word, count in list(words.items())[0:30]:
    cleanWord = word.encode('ascii', 'ignore')
    if (cleanWord):
        print(cleanWord.decode() + " " + str(count))

Self-Employment: 1
Building 5
an 172
Internet 13
Business 19
of 941
One 12
Achieving 1
Financial 3
and 901
Personal 3
Freedom 7
through 55
a 1148
Lifestyle 5
Technology 2
By 9
Frank 10
Kane 7
Copyright 1
2015 3
Kane. 1
All 13
rights 3
reserved 2
worldwide. 2
CONTENTS 1
Disclaimer 1
Preface 1


As we showed in the output above, the result show was not totally correct aswords in capital letters are considered different from those in lower case letter. We could improve the result by creating a regular expression filtering the result we don't want to show

In [24]:
def normalizeWords(text):
    return re.compile(r'\W+', re.UNICODE).split(text.lower())

And we pass the previous function to the flatMap function

In [25]:
import re

words = lines.flatMap(normalizeWords)
wordCounts = words.countByValue()

5. <strong>Sort the result shown from the word count above</strong>

In [26]:
results = words.map(lambda w: (w,1)).reduceByKey(lambda x,y:x+y)

In [27]:
for i in results.take(10):print(i)

('self', 111)
('employment', 75)
('building', 33)
('an', 178)
('internet', 26)
('business', 383)
('of', 970)
('one', 100)
('achieving', 1)
('financial', 17)


In [28]:
results = words.map(lambda w: (w,1)).reduceByKey(lambda x,y:x+y).map(lambda w: (w[1],w[0])).sortByKey(False).map(lambda w: (w[1],w[0]))

Firstly, we create a map function so as to count the number of elements (reduceByKey). Then, we change the order of the elements in the tuple to use the "sortByKey" function. Finally, we re-order the elements (word, numberOccurrences)

In [29]:
for i in results.take(10):print(i)

('you', 1878)
('to', 1828)
('your', 1420)
('the', 1292)
('a', 1191)
('of', 970)
('and', 934)
('', 772)
('that', 747)
('it', 649)


We format the output

In [30]:
for word, count in results.collect()[:30]:
    cleanWord = word.encode('ascii', 'ignore')
    if (cleanWord):
        print(cleanWord.decode() + " " + str(count))

you 1878
to 1828
your 1420
the 1292
a 1191
of 970
and 934
that 747
it 649
in 616
is 560
for 537
on 428
are 424
if 411
s 391
i 387
business 383
can 376
be 369
as 343
have 321
with 315
t 301
this 280
or 278
time 255
but 242
they 234


In [31]:
sc.stop()

6. <strong>Get the total amount per customer</strong>

In [32]:
conf = SparkConf().setMaster("local").setAppName("AmountPerCustomer")
sc = SparkContext(conf = conf)

Taking into account the next data input

In [35]:
input = sc.textFile("customer-orders.csv")

In [36]:
for i in input.take(10):print(i)

44,8602,37.19
35,5368,65.89
2,3391,40.64
47,6694,14.98
29,680,13.08
91,8900,24.59
70,3959,68.68
85,1733,28.53
53,9900,83.55
14,1505,4.32


<ul>
<li>First column: customerId</li>
<li>Second column: orderId</li>
<li>Third column: price</li>
</ul> 

We create a function to parse input data

In [37]:
def extractCustomerPricePairs(line):
    fields = line.split(',')
    return (int(fields[0]), float(fields[2]))

In [38]:
mappedInput = input.map(extractCustomerPricePairs)

In [39]:
totalByCustomer = mappedInput.reduceByKey(lambda x, y: x + y)

In [40]:
flipped = totalByCustomer.map(lambda x: (x[1], x[0]))

In [41]:
totalByCustomerSorted = flipped.sortByKey()

In [42]:
results = totalByCustomerSorted.collect()[:30];
for result in results:
    print(str(result[1]) + "\t{:.2f}".format(result[0]))

45	3309.38
79	3790.57
96	3924.23
23	4042.65
99	4172.29
75	4178.50
36	4278.05
98	4297.26
47	4316.30
77	4327.73
13	4367.62
48	4384.33
49	4394.60
94	4475.57
67	4505.79
50	4517.27
78	4524.51
5	4561.07
57	4628.40
83	4635.80
91	4642.26
74	4647.13
84	4652.94
3	4659.63
12	4664.59
66	4681.92
56	4701.02
21	4707.41
80	4727.86
14	4735.03
