In [0]:
from pyspark import SparkContext
# Reuse the already-running SparkContext when there is one, otherwise create it.
# A bare SparkContext() raises "Cannot run multiple SparkContexts at once" if
# this cell is executed a second time in the same kernel.
sc = SparkContext.getOrCreate()
sc

In [0]:
# Find the most frequent word. Output this word and its frequency.
from operator import add

# NOTE(review): this cell reads 'data/README.md' while the following cell reads
# 'README.md' -- confirm which location is the intended one.
lines = sc.textFile('data/README.md')

# Word count: split each line on whitespace, emit (word, 1) for every token,
# then sum the ones per distinct word.
counts = (
    lines.flatMap(lambda line: line.split())
         .map(lambda word: (word, 1))
         .reduceByKey(add)
)

# Solution 1: order the (word, count) pairs by descending count and keep the head.
counts.sortBy(lambda word_count: word_count[1], ascending=False).first()

# Solution 2: ask for the maximum directly, using the count as the key.
# counts.max(lambda x: x[1])

In [3]:
# Count the frequencies of those words consisting of 5 or more characters
from operator import add

lines = sc.textFile('README.md')

# Tokenize on whitespace, drop tokens shorter than 5 characters, then do a
# regular (word, 1) -> sum-per-word count on what is left.
counts = (
    lines.flatMap(lambda line: line.split())
         .filter(lambda word: len(word) >= 5)
         .map(lambda word: (word, 1))
         .reduceByKey(add)
)

# Only a sample of 10 (word, count) pairs is printed.
print(counts.take(10))

[('Apache', 1), ('Spark', 16), ('provides', 1), ('high-level', 1), ('Scala,', 1), ('Java,', 1), ('optimized', 1), ('engine', 1), ('supports', 2), ('computation', 1)]


In [4]:
# What's its output?
# Demonstration of a pitfall: both filter predicates read the global variable
# `t` instead of holding a copy of the value it had when the lambda was written.
A = sc.parallelize(range(1, 100))
t = 50
B = A.filter(lambda x: x < t)
# First job: t is 50 at this point, so the values 1..49 pass -> prints 49.
print (B.count())
t = 10
C = B.filter(lambda x: x > t)
# Second job: prints 0, not 39 (see the explanation below).
print (C.count())
# Why 0: B was never cached, so computing C re-runs B's filter starting from
# its parent A. By then t is 10, so B's predicate is effectively x < 10 and
# C's predicate is x > 10 -- no element can satisfy both.

49
0


In [5]:
# Fix the bug
A = sc.parallelize(range(1, 100))
t = 50
# Bind the current value of t as a default argument so that the predicate keeps
# using 50 even after the global t is reassigned further down.
B = A.filter(lambda x, t=t: x < t)
# Mark B for caching *before* the first action: cache() is lazy, so the count()
# below is what actually materializes B, and the second job can then reuse it
# instead of recomputing B from A.
B.cache()
print(B.count())
t = 10
# Same binding trick: this predicate is pinned to t == 10.
C = B.filter(lambda x, t=t: x > t)
# Elements with 10 < x < 50 remain, i.e. 11..49.
print(C.count())

49
39


In [6]:
# Modify the PMI example by sending a_dict and n_dict inside the closure. Do not use broadcast variables.
from math import *

lines = sc.textFile('data/adj_noun_pairs.txt')

# Turn every line into a tuple of its words. The data is dirty, so any tuple
# that does not contain exactly 2 words is dropped. The result is cached
# because several jobs below are derived from it.
pairs = (
    lines.map(lambda line: tuple(line.split()))
         .filter(lambda words: len(words) == 2)
         .cache()
)
N = pairs.count()

# Frequency of each distinct pair; pairs seen fewer than 100 times are ignored.
pair_freqs = (
    pairs.map(lambda pair: (pair, 1))
         .reduceByKey(lambda left, right: left + right)
         .filter(lambda pair_freq: pair_freq[1] >= 100)
)

# Frequencies of the adjectives (first word) and of the nouns (second word).
a_freqs = (
    pairs.map(lambda pair: (pair[0], 1))
         .reduceByKey(lambda left, right: left + right)
)
n_freqs = (
    pairs.map(lambda pair: (pair[1], 1))
         .reduceByKey(lambda left, right: left + right)
)

# Collect both frequency tables to the driver as dictionaries.
a_dict = a_freqs.collectAsMap()
n_dict = n_freqs.collectAsMap()

# Computing the PMI for a pair.
def pmi_score(pair_freq, a_dict, n_dict):
    """Score one word pair by its pointwise mutual information (base 2).

    Args:
        pair_freq: a ``((w1, w2), f)`` record, where ``f`` is how often the
            pair ``(w1, w2)`` occurs.
        a_dict: mapping from a first word ``w1`` to its frequency.
        n_dict: mapping from a second word ``w2`` to its frequency.

    Returns:
        The tuple ``(pmi, (w1, w2))`` with
        ``pmi = log2(f * N / (a_dict[w1] * n_dict[w2]))``.

    Raises:
        KeyError: if ``w1`` is missing from ``a_dict`` or ``w2`` from ``n_dict``.

    Note:
        ``N`` is not a parameter; it is read from the enclosing module scope.
    """
    words = pair_freq[0]
    freq = pair_freq[1]
    first, second = words
    # float() forces true division regardless of the numeric type of freq.
    joint = float(freq) * N
    marginal = a_dict[first] * n_dict[second]
    score = log(joint / marginal, 2)
    return score, (first, second)

# Note:
# Here a_dict and n_dict are plain <class 'dict'> objects. Had they been
# broadcast, they would be <class 'pyspark.broadcast.Broadcast'> instead.

# Version without broadcast variables:
# compute the PMI for all pairs, using a lambda to hand a_dict and n_dict to
# pmi_score, so both dictionaries travel inside the closure.
scored_pairs = pair_freqs.map(
    lambda pair_freq: pmi_score(pair_freq, a_dict, n_dict)
)

# Show the 10 highest-scoring pairs
scored_pairs.top(10)

[(14.41018838546462, ('magna', 'carta')),
 (13.071365888694997, ('polish-lithuanian', 'Commonwealth')),
 (12.990597616733414, ('nitrous', 'oxide')),
 (12.64972604311254, ('latter-day', 'Saints')),
 (12.50658937509916, ('stainless', 'steel')),
 (12.482331020687814, ('pave', 'runway')),
 (12.19140721768055, ('corporal', 'punishment')),
 (12.183248694293388, ('capital', 'punishment')),
 (12.147015483562537, ('rush', 'yard')),
 (12.109945794428935, ('globular', 'cluster'))]

In [18]:
# For each item in the RDD, add its partition number to it, and write the results to another RDD
# The input RDD holds the integers 0..99 spread over 4 partitions.
A = sc.parallelize(range(100), numSlices=4)
def add_index(index, part):
    """Lazily yield each element of ``part`` increased by ``index``.

    Args:
        index: the number to add to every element (the partition number when
            used with ``mapPartitionsWithIndex``).
        part: an iterable of the elements to shift.

    Yields:
        ``element + index`` for every element of ``part``, in order.
    """
    for element in part:
        yield element + index
# Run add_index once per partition; it is given the partition index together
# with an iterator over that partition's items. The result is a new RDD.
B = A.mapPartitionsWithIndex(add_index)

# Bring the shifted values back to the driver and show them.
print(B.collect())
    

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102]
