In [1]:
# Check if SparkContext is running.
print sc.version

2.0.1


In [2]:
# Get Spark configuration
sc._conf.getAll()

[(u'spark.driver.host', u'10.44.111.48'),
 (u'hive.metastore.warehouse.dir',
  u'/Users/adarshnair/spark-2.0.1-bin-hadoop2.7/spark_projects/MovieLens/spark-warehouse'),
 (u'spark.sql.catalogImplementation', u'hive'),
 (u'spark.rdd.compress', u'True'),
 (u'spark.serializer.objectStreamReset', u'100'),
 (u'spark.master', u'local[*]'),
 (u'spark.driver.port', u'52003'),
 (u'spark.executor.id', u'driver'),
 (u'spark.submit.deployMode', u'client'),
 (u'spark.app.name', u'PySparkShell'),
 (u'spark.app.id', u'local-1479407390195')]

In [13]:
'''
ratings_counter.py

This creates a histogram of the ratings data
'''
from pyspark import SparkConf, SparkContext
import collections

# conf = SparkConf().setMaster("local").setAppName("RatingsHistogram")

PATH = "/Users/adarshnair/spark-2.0.1-bin-hadoop2.7/spark_projects/MovieLens"

# Store the u.user data in user_data RDD
user_data = sc.textFile("%s/ml-100k/u.data" % PATH)
user_data.first()

u'196\t242\t3\t881250949'

In [15]:
'''
Get ratings info from u.data
'''
ratings = user_data.map(lambda x: x.split()[2])
result = ratings.countByValue()
result

defaultdict(int,
            {u'1': 6110, u'2': 11370, u'3': 27145, u'4': 34174, u'5': 21201})

In [16]:
'''
Order the ratings from 1 to 5
'''
sortedResults = collections.OrderedDict(sorted(result.items()))

# print out the values
for key, value in sortedResults.items():
    print("%s %i" % (key, value))

1 6110
2 11370
3 27145
4 34174
5 21201


In [20]:
'''
Map example
'''
rdd = sc.parallelize([1,2,3,4])
rdd = rdd.map(lambda x: x*x)
print rdd.collect()
'''
RDD actions

collect
count
countByValue
take
top

reduceByKey(): combine values with the same key using a fucntion
rdd.reduceByKey(lambda x,y: x + y) will add the values for a given key.

groupByKey(): group values with the same key
sortByKey(): sort RDD by key values
keys(), values(): create an RDD of just the keys, or just the values

mapValues()
flatMapValues()

'''

[1, 4, 9, 16]


### Getting the average number of friends by age for a fake social network ###

* mapValues
* reduceByKeys

In [21]:
'''
Read data in from CSV
'''
PATH = "/Users/adarshnair/spark-2.0.1-bin-hadoop2.7/spark_projects/MovieLens"
lines = sc.textFile("%s/fakefriends.csv" % PATH)
lines.first()

u'0,Will,33,385'

In [22]:
'''
Helper function to seperate out values and return key, value pair
'''
def parseLine(line):
    fields = line.split(',')
    age  = int(fields[2])
    numFriends = int(fields[3])
    return (age, numFriends)

In [24]:
'''
Get totals by age
'''
rdd = lines.map(parseLine)

totalsByAge = rdd.mapValues(lambda x: (x, 1)) \ # (33, (385, 1)), (33, (2, 1))
                 .reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))  # (33, (387, 2))
#reduceByKey(): combine values with the same key using a function
    
totalsByAge.collect()

[(18, (2747, 8)),
 (20, (825, 5)),
 (22, (1445, 7)),
 (24, (1169, 5)),
 (26, (4115, 17)),
 (28, (2091, 10)),
 (30, (2594, 11)),
 (32, (2287, 11)),
 (34, (1473, 6)),
 (36, (2466, 10)),
 (38, (2903, 15)),
 (40, (4264, 17)),
 (42, (1821, 6)),
 (44, (3386, 12)),
 (46, (2908, 13)),
 (48, (2814, 10)),
 (50, (1273, 5)),
 (52, (3747, 11)),
 (54, (3615, 13)),
 (56, (1840, 6)),
 (58, (1282, 11)),
 (60, (1419, 7)),
 (62, (2870, 13)),
 (64, (3376, 12)),
 (66, (2488, 9)),
 (68, (2696, 10)),
 (19, (2346, 11)),
 (21, (2807, 8)),
 (23, (2463, 10)),
 (25, (2172, 11)),
 (27, (1825, 8)),
 (29, (2591, 12)),
 (31, (2138, 8)),
 (33, (3904, 12)),
 (35, (1693, 8)),
 (37, (2244, 9)),
 (39, (1185, 7)),
 (41, (2417, 9)),
 (43, (1614, 7)),
 (45, (4024, 13)),
 (47, (2099, 9)),
 (49, (1108, 6)),
 (51, (2115, 7)),
 (53, (1560, 7)),
 (55, (3842, 13)),
 (57, (3106, 12)),
 (59, (1980, 9)),
 (61, (2306, 9)),
 (63, (1536, 4)),
 (65, (1491, 5)),
 (67, (3434, 16)),
 (69, (2352, 10))]

In [25]:
'''
Get average friends for each age
'''
averagesByAge = totalsByAge.mapValues(lambda x: x[0] / x[1])
averagesByAge.collect()

[(18, 343),
 (20, 165),
 (22, 206),
 (24, 233),
 (26, 242),
 (28, 209),
 (30, 235),
 (32, 207),
 (34, 245),
 (36, 246),
 (38, 193),
 (40, 250),
 (42, 303),
 (44, 282),
 (46, 223),
 (48, 281),
 (50, 254),
 (52, 340),
 (54, 278),
 (56, 306),
 (58, 116),
 (60, 202),
 (62, 220),
 (64, 281),
 (66, 276),
 (68, 269),
 (19, 213),
 (21, 350),
 (23, 246),
 (25, 197),
 (27, 228),
 (29, 215),
 (31, 267),
 (33, 325),
 (35, 211),
 (37, 249),
 (39, 169),
 (41, 268),
 (43, 230),
 (45, 309),
 (47, 233),
 (49, 184),
 (51, 302),
 (53, 222),
 (55, 295),
 (57, 258),
 (59, 220),
 (61, 256),
 (63, 384),
 (65, 298),
 (67, 214),
 (69, 235)]

### Getting the minimum temperature observed from each weather station ###

* filter()

In [30]:
'''
Load data
'''
PATH = "/Users/adarshnair/spark-2.0.1-bin-hadoop2.7/spark_projects/MovieLens"
lines = sc.textFile("%s/1800.csv" % PATH)
lines.collect()

[u'ITE00100554,18000101,TMAX,-75,,,E,',
 u'ITE00100554,18000101,TMIN,-148,,,E,',
 u'GM000010962,18000101,PRCP,0,,,E,',
 u'EZE00100082,18000101,TMAX,-86,,,E,',
 u'EZE00100082,18000101,TMIN,-135,,,E,',
 u'ITE00100554,18000102,TMAX,-60,,I,E,',
 u'ITE00100554,18000102,TMIN,-125,,,E,',
 u'GM000010962,18000102,PRCP,0,,,E,',
 u'EZE00100082,18000102,TMAX,-44,,,E,',
 u'EZE00100082,18000102,TMIN,-130,,,E,',
 u'ITE00100554,18000103,TMAX,-23,,,E,',
 u'ITE00100554,18000103,TMIN,-46,,I,E,',
 u'GM000010962,18000103,PRCP,4,,,E,',
 u'EZE00100082,18000103,TMAX,-10,,,E,',
 u'EZE00100082,18000103,TMIN,-73,,,E,',
 u'ITE00100554,18000104,TMAX,0,,,E,',
 u'ITE00100554,18000104,TMIN,-13,,,E,',
 u'GM000010962,18000104,PRCP,0,,,E,',
 u'EZE00100082,18000104,TMAX,-55,,,E,',
 u'EZE00100082,18000104,TMIN,-74,,,E,',
 u'ITE00100554,18000105,TMAX,10,,,E,',
 u'ITE00100554,18000105,TMIN,-6,,,E,',
 u'GM000010962,18000105,PRCP,0,,,E,',
 u'EZE00100082,18000105,TMAX,-40,,,E,',
 u'EZE00100082,18000105,TMIN,-58,,,E,',
 u'ITE00

In [28]:
'''
Helper function to split data and take 3 values from each row 
'''
def parseLine(line):
    fields = line.split(',')
    stationID = fields[0]
    entryType = fields[2]
    temperature = float(fields[3]) * 0.1 * (9.0/5.0) + 32.0
    return (stationID, entryType, temperature)

In [31]:
parsedLines = lines.map(parseLine)
parsedLines.collect()

[(u'ITE00100554', u'TMAX', 18.5),
 (u'ITE00100554', u'TMIN', 5.359999999999999),
 (u'GM000010962', u'PRCP', 32.0),
 (u'EZE00100082', u'TMAX', 16.52),
 (u'EZE00100082', u'TMIN', 7.699999999999999),
 (u'ITE00100554', u'TMAX', 21.2),
 (u'ITE00100554', u'TMIN', 9.5),
 (u'GM000010962', u'PRCP', 32.0),
 (u'EZE00100082', u'TMAX', 24.08),
 (u'EZE00100082', u'TMIN', 8.599999999999998),
 (u'ITE00100554', u'TMAX', 27.86),
 (u'ITE00100554', u'TMIN', 23.72),
 (u'GM000010962', u'PRCP', 32.72),
 (u'EZE00100082', u'TMAX', 30.2),
 (u'EZE00100082', u'TMIN', 18.86),
 (u'ITE00100554', u'TMAX', 32.0),
 (u'ITE00100554', u'TMIN', 29.66),
 (u'GM000010962', u'PRCP', 32.0),
 (u'EZE00100082', u'TMAX', 22.1),
 (u'EZE00100082', u'TMIN', 18.68),
 (u'ITE00100554', u'TMAX', 33.8),
 (u'ITE00100554', u'TMIN', 30.919999999999998),
 (u'GM000010962', u'PRCP', 32.0),
 (u'EZE00100082', u'TMAX', 24.8),
 (u'EZE00100082', u'TMIN', 21.56),
 (u'ITE00100554', u'TMAX', 34.34),
 (u'ITE00100554', u'TMIN', 34.34),
 (u'GM000010962', u

In [32]:
'''
Use filter to only consider TMIN values
'''
minTemps = parsedLines.filter(lambda x: 'TMIN' in x[1])
minTemps.collect()

[(u'ITE00100554', u'TMIN', 5.359999999999999),
 (u'EZE00100082', u'TMIN', 7.699999999999999),
 (u'ITE00100554', u'TMIN', 9.5),
 (u'EZE00100082', u'TMIN', 8.599999999999998),
 (u'ITE00100554', u'TMIN', 23.72),
 (u'EZE00100082', u'TMIN', 18.86),
 (u'ITE00100554', u'TMIN', 29.66),
 (u'EZE00100082', u'TMIN', 18.68),
 (u'ITE00100554', u'TMIN', 30.919999999999998),
 (u'EZE00100082', u'TMIN', 21.56),
 (u'ITE00100554', u'TMIN', 34.34),
 (u'EZE00100082', u'TMIN', 21.740000000000002),
 (u'ITE00100554', u'TMIN', 33.8),
 (u'EZE00100082', u'TMIN', 23.0),
 (u'ITE00100554', u'TMIN', 34.52),
 (u'EZE00100082', u'TMIN', 26.42),
 (u'ITE00100554', u'TMIN', 36.14),
 (u'EZE00100082', u'TMIN', 23.72),
 (u'ITE00100554', u'TMIN', 37.58),
 (u'EZE00100082', u'TMIN', 18.5),
 (u'ITE00100554', u'TMIN', 39.38),
 (u'EZE00100082', u'TMIN', 20.84),
 (u'ITE00100554', u'TMIN', 37.22),
 (u'EZE00100082', u'TMIN', 21.2),
 (u'ITE00100554', u'TMIN', 34.34),
 (u'EZE00100082', u'TMIN', 21.2),
 (u'ITE00100554', u'TMIN', 36.14),


In [33]:
'''
Create RDD of the form (stationID, TMIN)
'''
stationTemps = minTemps.map(lambda x: (x[0], x[2]))
stationTemps.collect()

[(u'ITE00100554', 5.359999999999999),
 (u'EZE00100082', 7.699999999999999),
 (u'ITE00100554', 9.5),
 (u'EZE00100082', 8.599999999999998),
 (u'ITE00100554', 23.72),
 (u'EZE00100082', 18.86),
 (u'ITE00100554', 29.66),
 (u'EZE00100082', 18.68),
 (u'ITE00100554', 30.919999999999998),
 (u'EZE00100082', 21.56),
 (u'ITE00100554', 34.34),
 (u'EZE00100082', 21.740000000000002),
 (u'ITE00100554', 33.8),
 (u'EZE00100082', 23.0),
 (u'ITE00100554', 34.52),
 (u'EZE00100082', 26.42),
 (u'ITE00100554', 36.14),
 (u'EZE00100082', 23.72),
 (u'ITE00100554', 37.58),
 (u'EZE00100082', 18.5),
 (u'ITE00100554', 39.38),
 (u'EZE00100082', 20.84),
 (u'ITE00100554', 37.22),
 (u'EZE00100082', 21.2),
 (u'ITE00100554', 34.34),
 (u'EZE00100082', 21.2),
 (u'ITE00100554', 36.14),
 (u'EZE00100082', 25.7),
 (u'ITE00100554', 39.38),
 (u'EZE00100082', 27.86),
 (u'ITE00100554', 39.38),
 (u'EZE00100082', 25.34),
 (u'ITE00100554', 40.64),
 (u'EZE00100082', 25.7),
 (u'ITE00100554', 40.28),
 (u'EZE00100082', 33.62),
 (u'ITE0010

In [34]:
'''
Get min temperature recorded at each station
'''
minTempStation = stationTemps.reduceByKey(lambda x,y: min(x,y))
minTempStation.collect()

[(u'ITE00100554', 5.359999999999999), (u'EZE00100082', 7.699999999999999)]

In [36]:
'''
Get max temperature recorded at each station
'''
maxTemps = parsedLines.filter(lambda x: 'TMAX' in x[1])
stationTemps = maxTemps.map(lambda x: (x[0], x[2]))
maxTempStation = stationTemps.reduceByKey(lambda x,y: max(x,y))
maxTempStation.collect()

[(u'ITE00100554', 90.14000000000001), (u'EZE00100082', 90.14000000000001)]

### A: Counting the number of words in a text file ###

* map(): will transform each element in the RDD to one new element, there is a 1:1 correspondence. 
* flatMap(): will transform each element in the RDD to multiple elements

In [37]:
'''
Read data in from CSV
'''
PATH = "/Users/adarshnair/spark-2.0.1-bin-hadoop2.7/spark_projects/MovieLens"
lines = sc.textFile("%s/book.txt" % PATH)
lines.first()

u'Self-Employment: Building an Internet Business of One'

In [38]:
'''
Split on words
'''
words = lines.flatMap(lambda x: x.split())
words.collect()

[u'Self-Employment:',
 u'Building',
 u'an',
 u'Internet',
 u'Business',
 u'of',
 u'One',
 u'Achieving',
 u'Financial',
 u'and',
 u'Personal',
 u'Freedom',
 u'through',
 u'a',
 u'Lifestyle',
 u'Technology',
 u'Business',
 u'By',
 u'Frank',
 u'Kane',
 u'Copyright',
 u'\ufffd',
 u'2015',
 u'Frank',
 u'Kane.',
 u'All',
 u'rights',
 u'reserved',
 u'worldwide.',
 u'CONTENTS',
 u'Disclaimer',
 u'Preface',
 u'Part',
 u'I:',
 u'Making',
 u'the',
 u'Big',
 u'Decision',
 u'Overcoming',
 u'Inertia',
 u'Fear',
 u'of',
 u'Failure',
 u'Career',
 u'Indoctrination',
 u'The',
 u'Carrot',
 u'on',
 u'a',
 u'Stick',
 u'Ego',
 u'Protection',
 u'Your',
 u'Employer',
 u'as',
 u'a',
 u'Security',
 u'Blanket',
 u'Why',
 u'it\ufffds',
 u'Worth',
 u'it',
 u'Unlimited',
 u'Growth',
 u'Potential',
 u'Investing',
 u'in',
 u'Yourself,',
 u'Not',
 u'Someone',
 u'Else',
 u'No',
 u'Dependencies',
 u'No',
 u'Commute',
 u'Freedom',
 u'to',
 u'Live',
 u'Where',
 u'You',
 u'Want',
 u'Freedom',
 u'to',
 u'Work',
 u'When',
 u

In [39]:
'''
Count words
'''
wordCounts = words.countByValue()
for word, count in wordCounts.items():
    cleanWord = word.encode('ascii', 'ignore')
    if cleanWord:
        print cleanWord.decode() + '-' + str(count) 

considered.-1
considered?-1
Does-2
ocean,-1
Dartmouth,-1
hanging-1
payoff-2
increase-9
entirely-1
LAST-1
Discarding-1
oceans-1
advice.-1
advice,-7
ever-improving-1
writing,-1
basics-2
job),-1
TELLING-1
AdSense,-1
non-compete-2
Conventional-1
27,-1
second-3
sales?-1
Kane,-2
Kane.-1
(Obamacare,)-1
sales.-6
sales,-9
high-dollar-1
designing-2
"moderately-1
evaluating-2
resignation.-1
leaning-2
here-11
careers.-1
Rumsey,-1
affiliated-1
Insider11.-1
reasons:-1
separation-1
kids-5
reports-2
Choosing-1
NOT-2
military-2
interesting?-1
criticism-1
Three-2
replace-4
brought-1
remnant-1
anywhere.-1
stern-1
"The-4
cheating-1
revenue,-2
concept!-1
revenue.-4
catchy-1
revenue!-1
therefore-2
strike-1
until-15
ability.-1
paperwork-2
relax-2
successful-14
brings-3
tying-1
hold-1
circumstances-1
pursue-1
liability,-1
placements.-1
marketer,-1
placements,-1
example-3
minimize.-1
household-1
Reality-1
caution-2
reviewing-1
want-99
minimum,-4
fares?-1
minimum.-3
medication?-1
statistics,-1
DPI-1
around.-1
a

### B: Counting the words in a text file removing punctuation ###

In [43]:
'''
Helper function to normalize words
'''
import re
def normalizeWords(text):
    return re.compile(r'\W+', re.UNICODE).split(text.lower())

In [44]:
words = lines.flatMap(normalizeWords)
wordCounts = words.countByValue()

for word, count in wordCounts.items():
    cleanWord = word.encode('ascii', 'ignore')
    if (cleanWord):
        print(cleanWord.decode() + " " + str(count))

limited 12
transitions 1
similarity 1
hats 1
child 1
desirable 1
chaos 2
four 1
immersive 1
sleep 5
preface 2
ridiculous 1
saved 6
payoff 2
relationships 1
looking 13
violate 1
crises 1
disability 2
extrapolate 1
swat 1
under 8
worth 39
saves 1
risk 18
unsold 1
blanket 2
oceans 4
handling 2
every 32
updates 3
kick 1
affect 9
adaptive 1
vast 2
school 3
basics 12
frankly 1
skills 17
fondly 1
companies 31
unrelated 1
math 3
shows 3
straight 2
whirlwind 1
enjoy 13
force 1
disciplines 2
consistent 1
hanging 2
estimates 3
direct 13
complement 1
budget 12
surrounding 3
electricians 1
street 2
iraq 2
estimated 2
even 104
established 2
errors 1
decisions 5
selected 1
near 5
asia 1
bonuses 2
above 5
designing 4
new 153
net 13
conjure 1
ever 10
told 6
specialist 1
niche 20
evaluating 4
mailings 1
never 27
disposable 1
here 29
hundreds 5
refinement 2
protection 7
china 4
pursuit 1
active 4
path 19
100 10
cardboard 1
104 1
obtained 1
michal 3
kids 7
forum 1
items 6
employees 10
changed 2
completely

### C: Counting the words in a text file removing punctuation and sorting by key###

* sortByKey()

In [54]:
words = lines.flatMap(normalizeWords)
wordCounts = words.map(lambda x: (x,1)).reduceByKey(lambda x, y: x + y)
'''
Flip (key, value) -> (word, count) pairs to (value, key) -> (count, word) so we can use sortByKey()
'''
wordCountsSorted = wordCounts.map(lambda (x,y): (y,x)).sortByKey()
result = wordCountsSorted.collect()
result

[(1, u'hats'),
 (1, u'desirable'),
 (1, u'four'),
 (1, u'relationships'),
 (1, u'extrapolate'),
 (1, u'adaptive'),
 (1, u'fondly'),
 (1, u'announced'),
 (1, u'force'),
 (1, u'consistent'),
 (1, u'electricians'),
 (1, u'asia'),
 (1, u'mailings'),
 (1, u'disposable'),
 (1, u'104'),
 (1, u'obtained'),
 (1, u'forum'),
 (1, u'harass'),
 (1, u'calculations'),
 (1, u'astute'),
 (1, u'criticism'),
 (1, u'psychological'),
 (1, u'conflicts'),
 (1, u'achievable'),
 (1, u'counts'),
 (1, u'landscape'),
 (1, u'catchy'),
 (1, u'hospital'),
 (1, u'strike'),
 (1, u'breathe'),
 (1, u'award'),
 (1, u'93'),
 (1, u'outdated'),
 (1, u'divorced'),
 (1, u'pursue'),
 (1, u'roof'),
 (1, u'exceptions'),
 (1, u'addictive'),
 (1, u'household'),
 (1, u'appointment'),
 (1, u'machine'),
 (1, u'hot'),
 (1, u'fudge'),
 (1, u'mouths'),
 (1, u'curiosity'),
 (1, u'chump'),
 (1, u'types'),
 (1, u'attempt'),
 (1, u'attracts'),
 (1, u'teammates'),
 (1, u'goodbye'),
 (1, u'realizing'),
 (1, u'salon'),
 (1, u'rewarded'),
 (1, 

## Exercise ##

Find amount spent by each customer(group by customerID). Data of the form:

customerID, itemID, amount

In [55]:
'''
Load data
'''
PATH = "/Users/adarshnair/spark-2.0.1-bin-hadoop2.7/spark_projects/MovieLens"
lines = sc.textFile("%s/customer-orders.csv" % PATH)
lines.first()

u'44,8602,37.19'

In [65]:
# price_count = lines.map(lambda x: (x,1))
# price_count.collect()
'''
Get relevant data
'''
def extract(line):
    fields = line.split(',')
    return (int(fields[0]), float(fields[2]))

In [66]:
data = lines.map(extract)
data.collect()

[(44, 37.19),
 (35, 65.89),
 (2, 40.64),
 (47, 14.98),
 (29, 13.08),
 (91, 24.59),
 (70, 68.68),
 (85, 28.53),
 (53, 83.55),
 (14, 4.32),
 (51, 19.8),
 (42, 57.77),
 (2, 55.77),
 (79, 33.17),
 (50, 23.57),
 (20, 6.49),
 (15, 65.53),
 (44, 99.19),
 (5, 64.18),
 (48, 32.42),
 (47, 25.66),
 (31, 4.16),
 (4, 24.13),
 (36, 88.64),
 (57, 57.91),
 (12, 72.62),
 (22, 56.06),
 (54, 28.01),
 (0, 97.22),
 (88, 80.7),
 (86, 71.9),
 (13, 26.47),
 (88, 29.76),
 (42, 22.98),
 (40, 72.95),
 (48, 27.06),
 (98, 86.56),
 (2, 20.31),
 (51, 84.57),
 (91, 64.42),
 (14, 77.77),
 (55, 40.78),
 (95, 60.54),
 (61, 3.92),
 (98, 30.71),
 (54, 30.76),
 (27, 1.17),
 (78, 16.96),
 (83, 56.68),
 (6, 42.3),
 (26, 24.07),
 (75, 61.0),
 (25, 59.98),
 (71, 93.3),
 (39, 60.62),
 (47, 2.86),
 (60, 41.99),
 (4, 24.66),
 (50, 12.99),
 (86, 34.38),
 (97, 9.35),
 (7, 40.34),
 (21, 45.77),
 (75, 56.05),
 (69, 0.82),
 (40, 85.49),
 (37, 91.75),
 (1, 9.99),
 (64, 52.91),
 (82, 87.17),
 (72, 30.65),
 (36, 53.92),
 (78, 19.87),
 (9

In [69]:
'''
Get total spent grouped by customerID
'''
total = data.reduceByKey(lambda x, y: x + y)
total.collect()

[(0, 5524.949999999999),
 (2, 5994.59),
 (4, 4815.050000000001),
 (6, 5397.879999999999),
 (8, 5517.24),
 (10, 4819.7),
 (12, 4664.589999999999),
 (14, 4735.030000000001),
 (16, 4979.06),
 (18, 4921.269999999999),
 (20, 4836.860000000001),
 (22, 5019.449999999999),
 (24, 5259.92),
 (26, 5250.4),
 (28, 5000.709999999999),
 (30, 4990.72),
 (32, 5496.05),
 (34, 5330.8),
 (36, 4278.049999999999),
 (38, 4898.460000000001),
 (40, 5186.429999999999),
 (42, 5696.840000000002),
 (44, 4756.890000000001),
 (46, 5963.109999999999),
 (48, 4384.33),
 (50, 4517.2699999999995),
 (52, 5245.0599999999995),
 (54, 6065.390000000001),
 (56, 4701.02),
 (58, 5437.73),
 (60, 5040.709999999999),
 (62, 5253.3200000000015),
 (64, 5288.689999999999),
 (66, 4681.92),
 (68, 6375.450000000001),
 (70, 5368.249999999999),
 (72, 5337.439999999999),
 (74, 4647.130000000001),
 (76, 4904.210000000001),
 (78, 4524.51),
 (80, 4727.86),
 (82, 4812.49),
 (84, 4652.9400000000005),
 (86, 4908.810000000001),
 (88, 4830.55),
 (90

In [70]:
'''
Sort values by amount spent. First flip the (id, amount) to (amount, id)
'''
flipped = total.map(lambda x: (x[1], x[0]))
flipped.collect()

[(5524.949999999999, 0),
 (5994.59, 2),
 (4815.050000000001, 4),
 (5397.879999999999, 6),
 (5517.24, 8),
 (4819.7, 10),
 (4664.589999999999, 12),
 (4735.030000000001, 14),
 (4979.06, 16),
 (4921.269999999999, 18),
 (4836.860000000001, 20),
 (5019.449999999999, 22),
 (5259.92, 24),
 (5250.4, 26),
 (5000.709999999999, 28),
 (4990.72, 30),
 (5496.05, 32),
 (5330.8, 34),
 (4278.049999999999, 36),
 (4898.460000000001, 38),
 (5186.429999999999, 40),
 (5696.840000000002, 42),
 (4756.890000000001, 44),
 (5963.109999999999, 46),
 (4384.33, 48),
 (4517.2699999999995, 50),
 (5245.0599999999995, 52),
 (6065.390000000001, 54),
 (4701.02, 56),
 (5437.73, 58),
 (5040.709999999999, 60),
 (5253.3200000000015, 62),
 (5288.689999999999, 64),
 (4681.92, 66),
 (6375.450000000001, 68),
 (5368.249999999999, 70),
 (5337.439999999999, 72),
 (4647.130000000001, 74),
 (4904.210000000001, 76),
 (4524.51, 78),
 (4727.86, 80),
 (4812.49, 82),
 (4652.9400000000005, 84),
 (4908.810000000001, 86),
 (4830.55, 88),
 (52

In [71]:
'''
Use sortByKey()
'''
result = flipped.sortByKey()
result.collect()

[(3309.3799999999997, 45),
 (3790.5699999999997, 79),
 (3924.2300000000005, 96),
 (4042.65, 23),
 (4172.29, 99),
 (4178.5, 75),
 (4278.049999999999, 36),
 (4297.259999999999, 98),
 (4316.299999999998, 47),
 (4327.73, 77),
 (4367.619999999999, 13),
 (4384.33, 48),
 (4394.6, 49),
 (4475.570000000001, 94),
 (4505.79, 67),
 (4517.2699999999995, 50),
 (4524.51, 78),
 (4561.07, 5),
 (4628.4, 57),
 (4635.8, 83),
 (4642.26, 91),
 (4647.130000000001, 74),
 (4652.9400000000005, 84),
 (4659.63, 3),
 (4664.589999999999, 12),
 (4681.92, 66),
 (4701.02, 56),
 (4707.41, 21),
 (4727.86, 80),
 (4735.030000000001, 14),
 (4735.200000000001, 37),
 (4755.069999999999, 7),
 (4756.890000000001, 44),
 (4765.049999999999, 31),
 (4812.49, 82),
 (4815.050000000001, 4),
 (4819.7, 10),
 (4830.55, 88),
 (4836.860000000001, 20),
 (4851.48, 89),
 (4876.839999999998, 95),
 (4898.460000000001, 38),
 (4904.210000000001, 76),
 (4908.810000000001, 86),
 (4915.89, 27),
 (4921.269999999999, 18),
 (4945.300000000001, 53),
 (