### Verify Spark Context object
Check whether spark context is available. If there is an exception, you have to troubleshoot. Otherwise, good to go ahead.

In [1]:
sc

<pyspark.context.SparkContext at 0x7fe705e26828>

### Build a rdd from a python collection

In [2]:
distData = sc.parallelize([1, 2, 3, 4])
distData

ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:475

In [3]:
distData.collect()

[1, 2, 3, 4]

### Basic Statistics using RDD of numbers

In [4]:
from random import random

In [5]:
l = [random() for _ in range(10)]
l

[0.09126739620741864,
 0.6088649788783818,
 0.14085507465130842,
 0.036607534023739174,
 0.8389527552169032,
 0.6471827609841071,
 0.5432516459707021,
 0.24845442208483226,
 0.7483825380122531,
 0.30687619311500813]

In [6]:
rdd = sc.parallelize(l, numSlices=4) # no of partitions = 4

In [7]:
rdd.max(), rdd.sum()

(0.8389527552169032, 4.210695299144654)

In [8]:
rdd.stats()

(count: 10, mean: 0.42106952991446545, stdev: 0.27586585452, max: 0.838952755217, min: 0.0366075340237)

### Exercise 1:

1. Create a directory in HDFS called movielens and load movies.csv and ratings.csv from ~/Downloads/datasets/movie-lens directory.  
2. Load movies.csv into in moviesRdd
3. Load ratings.csv into ratingsRdd
4. Find out 10 top movies based on the highest average rating. Consider only those movies that have got at least 100 ratings.


In [9]:
!ls -l ~/Downloads/datasets/movie-lens

total 3424
-rw-r--r-- 1 cloudera cloudera  207997 Jul  2 20:49 links.csv
-rw-r--r-- 1 cloudera cloudera  515700 Jul  2 20:49 movies.csv
-rw-r--r-- 1 cloudera cloudera 2580392 Jul  2 20:49 ratings.csv
-rw-r--r-- 1 cloudera cloudera  199073 Jul  2 20:49 tags.csv


In [10]:
movies = sc.textFile("/user/cloudera/movielens/movies")
movies.count()

10330

In [11]:
for r in movies.take(10):
    print(r)

movieId,title,genres
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy
6,Heat (1995),Action|Crime|Thriller
7,Sabrina (1995),Comedy|Romance
8,Tom and Huck (1995),Adventure|Children
9,Sudden Death (1995),Action


In [12]:
moviesData = movies.filter(lambda line: not line.startswith("movieId"))
for r in moviesData.take(10):
    print(r)

1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy
6,Heat (1995),Action|Crime|Thriller
7,Sabrina (1995),Comedy|Romance
8,Tom and Huck (1995),Adventure|Children
9,Sudden Death (1995),Action
10,GoldenEye (1995),Action|Adventure|Thriller


In [13]:
ratings = sc.textFile("/user/cloudera/movielens/ratings")
ratings.count()

105340

In [14]:
for r in ratings.take(10):
    print(r)

userId,movieId,rating,timestamp
1,16,4.0,1217897793
1,24,1.5,1217895807
1,32,4.0,1217896246
1,47,4.0,1217896556
1,50,4.0,1217896523
1,110,4.0,1217896150
1,150,3.0,1217895940
1,161,4.0,1217897864
1,165,3.0,1217897135


In [15]:
ratingsData = ratings.filter(lambda line: not line.startswith("userId"))
for r in ratingsData.take(10):
    print(r)

1,16,4.0,1217897793
1,24,1.5,1217895807
1,32,4.0,1217896246
1,47,4.0,1217896556
1,50,4.0,1217896523
1,110,4.0,1217896150
1,150,3.0,1217895940
1,161,4.0,1217897864
1,165,3.0,1217897135
1,204,0.5,1217895786


In [16]:
movies.first()

'movieId,title,genres'

In [17]:
ratings.first()

'userId,movieId,rating,timestamp'

In [18]:
movies_by_movieid = moviesData\
.map(lambda line: line.split(","))\
.map(lambda tokens: (int(tokens[0]), tokens[1]))

movies_by_movieid.first()

(1, 'Toy Story (1995)')

In [19]:
ratings_by_movieid = ratingsData\
.map(lambda line: line.split(","))\
.map(lambda tokens: (int(tokens[1]), float(tokens[2])))

ratings_by_movieid.first()

(16, 4.0)

In [20]:
top_10 = movies_by_movieid.join(ratings_by_movieid)\
.map(lambda p: ((p[0], p[1][0]), p[1][1]))\
.groupByKey().mapValues(lambda values: (sum(values)/len(values), len(values)))\
.filter(lambda p: p[1][1] >= 100)\
.sortBy(lambda p: p[1], False)\
.take(10)

for m in top_10:
    print(m)

((318, '"Shawshank Redemption'), (4.454545454545454, 308))
((858, '"Godfather'), (4.392857142857143, 210))
((50, '"Usual Suspects'), (4.328947368421052, 228))
((1136, 'Monty Python and the Holy Grail (1975)'), (4.3019480519480515, 154))
((527, "Schindler's List (1993)"), (4.296370967741935, 248))
((1193, "One Flew Over the Cuckoo's Nest (1975)"), (4.2727272727272725, 143))
((608, 'Fargo (1996)'), (4.2711442786069655, 201))
((2571, '"Matrix'), (4.264367816091954, 261))
((1221, '"Godfather: Part II'), (4.260714285714286, 140))
((1213, 'Goodfellas (1990)'), (4.2592592592592595, 135))


In [21]:
movies_by_movieid.join(ratings_by_movieid)\
.first()


(4096, ('"Curse', 4.0))

In [22]:
movies_by_movieid.join(ratings_by_movieid)\
.map(lambda p: ((p[0], p[1][0]), p[1][1]))\
.first()


((4096, '"Curse'), 4.0)

In [23]:
top_10 = movies_by_movieid.join(ratings_by_movieid)\
.map(lambda p: ((p[0], p[1][0]), p[1][1]))\
.groupByKey().mapValues(lambda values: (sum(values)/len(values), len(values)))\
.filter(lambda p: p[1][1] >= 100)

top_10.first()

((1036, 'Die Hard (1988)'), (3.918181818181818, 165))

In [24]:
top_10 = movies_by_movieid.join(ratings_by_movieid)\
.map(lambda p: ((p[0], p[1][0]), p[1][1]))\
.groupByKey().mapValues(lambda values: (sum(values)/len(values), len(values)))\
.filter(lambda p: p[1][1] >= 100)\
.sortBy(lambda p: p[1], False)\

for m in top_10.take(10):
    print(m)

((318, '"Shawshank Redemption'), (4.454545454545454, 308))
((858, '"Godfather'), (4.392857142857143, 210))
((50, '"Usual Suspects'), (4.328947368421052, 228))
((1136, 'Monty Python and the Holy Grail (1975)'), (4.3019480519480515, 154))
((527, "Schindler's List (1993)"), (4.296370967741935, 248))
((1193, "One Flew Over the Cuckoo's Nest (1975)"), (4.2727272727272725, 143))
((608, 'Fargo (1996)'), (4.2711442786069655, 201))
((2571, '"Matrix'), (4.264367816091954, 261))
((1221, '"Godfather: Part II'), (4.260714285714286, 140))
((1213, 'Goodfellas (1990)'), (4.2592592592592595, 135))


## Exercise 2: use stocks.csv file for the this exercise
 
- Load the stocks.csv file in a new Rdd 
- Find stocks records in 2016 
- Find stocks top 10 records based on the trading volume in 2016 
- Find average volume per stock traded in 2016 
- Find top 10 stocks based on highest trading volume in 2016  
 

Load the stocks.csv file in a new Rdd

In [25]:
stocks = sc.textFile("stocks")

In [26]:
stocks.count()

1857093

In [27]:
stocks.first()

'date,open,high,low,close,volume,adjclose,symbol'

In [28]:
stocks.cache()

stocks MapPartitionsRDD[84] at textFile at NativeMethodAccessorImpl.java:0

Find stocks records in 2016

In [29]:
stocks2016 = stocks.filter(lambda line: line.startswith("2016"))
stocks2016.count()

78041

Find stocks top 10 records based on the trading volume in 2016.

In [30]:
stocks2016.first()

'2016-01-04,46.119999,46.130001,45.360001,45.799999,3472200.0,44.870315,XLNX'

In [31]:
stocks2016.sortBy(lambda line: line.split(",")[5], False).take(10)

['2016-01-26,61.740002,62.509998,61.509998,62.349998,999900.0,61.430663,AWK',
 '2016-08-03,75.599998,76.099998,75.559998,76.089996,999900.0,75.520442,KLAC',
 '2016-05-23,12.48,12.56,12.31,12.48,9997600.0,12.307377,KEY',
 '2016-02-22,40.139999,40.93,40.07,40.84,9997300.0,40.60075,AAL',
 '2016-07-12,38.470001,38.91,38.240002,38.790001,9997200.0,38.790001,PYPL',
 '2016-04-11,104.059998,104.059998,102.300003,102.620003,999700.0,102.350541,INTU',
 '2016-02-01,48.939999,49.990002,48.790001,49.900002,9996700.0,48.283396,SO',
 '2016-06-24,41.869999,42.73,40.630001,40.970001,9996400.0,40.970001,VIAB',
 '2016-01-04,117.25,117.730003,115.870003,117.580002,9995000.0,115.844428,MCD',
 '2016-02-23,131.289993,131.910004,130.910004,131.529999,999500.0,129.687866,KMB']

Find average volume per stock traded in 2016

In [32]:
stocks2016.map(lambda line: line.split(","))\
.map(lambda tokens: (tokens[7], float(tokens[5])))\
.first()

('XLNX', 3472200.0)

In [33]:
stocks2016.map(lambda line: line.split(","))\
.map(lambda tokens: (tokens[7], float(tokens[5])))\
.groupByKey().mapValues(lambda volumes: sum(volumes)/ len(volumes))\
.first()

('EW', 1808258.9743589743)

Find top 10 stocks based on highest trading volume in 2016

In [34]:
top10 = stocks2016.map(lambda line: line.split(","))\
.map(lambda tokens: (tokens[7], float(tokens[5])))\
.groupByKey().mapValues(lambda volumes: sum(volumes)/ len(volumes))\
.sortBy(lambda p: p[1], False)\
.take(10)

for r in top10:
    print(r)


('BAC', 109953689.74358974)
('FCX', 47979558.333333336)
('CHK', 41622735.256410256)
('AAPL', 40944183.974358976)
('GE', 37751663.461538464)
('F', 37432197.43589743)
('PFE', 35777183.974358976)
('MSFT', 34194448.07692308)
('FB', 28902566.025641024)
('MU', 27260807.692307692)


In [35]:
stocksData = stocks.filter(lambda line: not line.startswith("date"))
stocksData.first()

'2000-07-17,95.4375,97.5,92.75,96.625,3508100.0,74.269199,XLNX'

### Parse the date field as datetime object

In [36]:
from datetime import datetime

In [37]:
d = datetime.strptime("2016-07-17", "%Y-%m-%d")

In [38]:
type(d)

datetime.datetime

In [39]:
d.year

2016

In [40]:
stocksData.filter(lambda line: datetime.strptime(line.split(",")[0],"%Y-%m-%d").year == 2016).first()

'2016-01-04,46.119999,46.130001,45.360001,45.799999,3472200.0,44.870315,XLNX'

### Call foreachPartition or mapPartition to operate over entire partition

In [41]:
def save_to_db(c):
    """
    More practitical use cases: 
        A. open database connection from each partition and save the record in bulk
        B. make web service calls and send multiple records, if web service calls support that
    
    Steps: 
        1. open db connection or web service connection
        2. send all c in bulk operation 
        3. close the connection
    """
    print(" ".join(c))

rdd = sc.parallelize([str(random()) for _ in range(10)], 3)
print(rdd.collect())
rdd.foreachPartition(save_to_db)
# Check jupyter launching console to view the numbers

['0.6688504710050863', '0.5298388694075957', '0.04317059809978663', '0.43160772581803797', '0.13492859253903766', '0.3782786241535713', '0.20573063094285304', '0.563796213609239', '0.716333149130251', '0.07209716251966614']


#### Collect each partition 

In [42]:
rdd = sc.parallelize(list("ABCEFGH"), 3)
rdd.collect()

['A', 'B', 'C', 'E', 'F', 'G', 'H']

In [43]:
rdd.glom().collect()

[['A', 'B'], ['C', 'E'], ['F', 'G', 'H']]

### Save stocks records in multiple files - one for each symbol

In [44]:
symbols = stocks.map(lambda l: l.split(",")[7]).distinct().collect() # Find disinct stock symbols
symbols = sorted(symbols)  # Sort the symbols alphabatically 

# Add an index for each symbol and convert the collection to Map for lookup
symbols = dict(list(zip(symbols, range(len(symbols)))))
print(symbols)

{'FTR': 190, 'EW': 164, 'M': 284, 'DFS': 122, 'ALB': 26, 'SIG': 401, 'WMT': 483, 'CBS': 78, 'D': 118, 'LNC': 276, 'GT': 205, 'NTRS': 333, 'SWKS': 419, 'TGNA': 430, 'SYMC': 423, 'PH': 359, 'HOT': 221, 'AVB': 46, 'AFL': 19, 'EFX': 147, 'BLL': 66, 'PFG': 356, 'HCP': 214, 'LMT': 275, 'HD': 215, 'CMA': 92, 'MYL': 317, 'CMI': 96, 'MPC': 308, 'UA.C': 448, 'NEM': 322, 'DNB': 133, 'DG': 123, 'FAST': 170, 'YUM': 498, 'GIS': 195, 'MMM': 302, 'LB': 266, 'EIX': 148, 'URI': 458, 'FOX': 185, 'GE': 192, 'TDG': 428, 'UAL': 449, 'SYK': 422, 'ESS': 160, 'AIZ': 23, 'HSIC': 228, 'AET': 18, 'COH': 102, 'AGN': 20, 'CPB': 106, 'GPN': 201, 'WHR': 479, 'GM': 197, 'CAG': 73, 'CI': 88, 'TEL': 429, 'EA': 143, 'WDC': 475, 'GOOGL': 199, 'TIF': 432, 'KEY': 253, 'XRX': 495, 'FBHS': 172, 'UHS': 451, 'ES': 158, 'AMZN': 37, 'DISCK': 129, 'IRM': 241, 'PCAR': 349, 'HBI': 211, 'BHI': 62, 'HPE': 223, 'NWL': 336, 'MRK': 309, 'WMB': 482, 'FTI': 189, 'CME': 94, 'CB': 76, 'FLS': 183, 'ALXN': 30, 'HRL': 226, 'PEG': 353, 'ADI': 10

In [45]:
len(symbols)

502

In [46]:
stocks_partitioned = (stocks
.keyBy(lambda r: symbols.get(r.split(",")[7]))
.partitionBy(len(symbols), lambda i: i)
.map(lambda p: p[1], preservesPartitioning = True))

stocks_partitioned.saveAsTextFile("stocks_by_symbol")

In [47]:
print("partitioner:", stocks_partitioned.partitioner,
      ", num_partitions: ", stocks_partitioned.partitioner.numPartitions)

partitioner: <pyspark.rdd.Partitioner object at 0x7fe70502b9b0> , num_partitions:  502
