# Spark Transformations through Wordcount 

In [13]:
# Distribute the integers 0..9 as an RDD; the second argument (3) is the
# requested number of partitions.
integer_RDD = sc.parallelize(range(10), 3)
# collect() pulls the elements of every partition back to the driver and
# returns them as a single Python list.
integer_RDD.collect()

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [11]:
# Build an RDD from the contents of textfile_1.txt; the word-count steps
# below treat each element of this RDD as one line of text.
text_RDD = sc.textFile('./textfile_1.txt')

In [12]:
# Inspect the Python type of the object returned by sc.textFile.
type(text_RDD)

pyspark.rdd.RDD

### The `map` step 

In [3]:
def split_word(line):
    """Break a line of text into its whitespace-separated words.

    Args:
        line: The string to tokenize.

    Returns:
        A list of the words in ``line``; empty when ``line`` contains no
        non-whitespace characters.
    """
    words = line.split()
    return words

def create_pair(word):
    """Turn a word into a ``(word, 1)`` key/value pair.

    Args:
        word: The key to emit.

    Returns:
        A 2-tuple whose first item is ``word`` and whose second item is the
        integer 1 (a single occurrence).
    """
    single_occurrence = 1
    return word, single_occurrence

# Split every line into words (flattening the per-line lists into one RDD),
# then pair each word with an initial count of 1.
pairs_RDD = text_RDD.flatMap(split_word).map(create_pair)
# Call print with a single parenthesized argument: this prints the same text
# under Python 2 and Python 3, whereas the bare `print x` statement is a
# SyntaxError on Python 3.
print(pairs_RDD.collect())

[(u'A', 1), (u'quick', 1), (u'brown', 1), (u'fox', 1), (u'jumped', 1), (u'over', 1), (u'a', 1), (u'lazy', 1), (u'dog.', 1), (u'A', 1), (u'quick', 1), (u'brown', 1), (u'dog', 1), (u'jumped', 1), (u'over', 1), (u'a', 1), (u'lazy', 1), (u'fox.', 1)]


### The `reduce` step

In [4]:
def sum_counts(a, b):
    """Combine two partial counts into one.

    Args:
        a: First partial count.
        b: Second partial count.

    Returns:
        The result of ``a + b``.
    """
    total = a + b
    return total
# Merge the values of all pairs that share a key using sum_counts, leaving one
# (word, total) pair per distinct word.
wordcounts_RDD = pairs_RDD.reduceByKey(sum_counts)
# Bring the per-word totals back as a Python list for display.
wordcounts_RDD.collect()

[(u'A', 2),
 (u'dog.', 1),
 (u'lazy', 2),
 (u'over', 2),
 (u'fox', 1),
 (u'a', 2),
 (u'quick', 2),
 (u'brown', 2),
 (u'dog', 1),
 (u'jumped', 2),
 (u'fox.', 1)]

### Transformations

#### `flatMap`

In [7]:
# flatMap applies split_word to every line and flattens the resulting
# per-line word lists into a single RDD of words.
words_RDD = text_RDD.flatMap(split_word)
words_RDD.collect()

[u'A',
 u'quick',
 u'brown',
 u'fox',
 u'jumped',
 u'over',
 u'a',
 u'lazy',
 u'dog.',
 u'A',
 u'quick',
 u'brown',
 u'dog',
 u'jumped',
 u'over',
 u'a',
 u'lazy',
 u'fox.']

#### `filter`

In [8]:
def starts_with_a(word):
    """Report whether a word begins with the letter "a", ignoring case.

    Args:
        word: The string to test.

    Returns:
        True if the lower-cased ``word`` starts with "a", otherwise False.
    """
    lowered = word.lower()
    return lowered.startswith("a")
# Keep only the words for which starts_with_a returns True, then collect them.
words_RDD.filter(starts_with_a).collect()

[u'A', u'a', u'A', u'a']

#### `groupByKey`

In [9]:
# Group the values of all pairs sharing a key. Each collected value is a
# ResultIterable, so the displayed repr shows object addresses rather than
# the grouped values themselves.
pairs_RDD.groupByKey().collect()

[(u'A', <pyspark.resultiterable.ResultIterable at 0x7fcde8bc51d0>),
 (u'dog.', <pyspark.resultiterable.ResultIterable at 0x7fcde8bc5650>),
 (u'lazy', <pyspark.resultiterable.ResultIterable at 0x7fcde8bc56d0>),
 (u'over', <pyspark.resultiterable.ResultIterable at 0x7fcde8bc5710>),
 (u'fox', <pyspark.resultiterable.ResultIterable at 0x7fcde8bc5750>),
 (u'a', <pyspark.resultiterable.ResultIterable at 0x7fcde8bc5790>),
 (u'quick', <pyspark.resultiterable.ResultIterable at 0x7fcde8bc57d0>),
 (u'brown', <pyspark.resultiterable.ResultIterable at 0x7fcde8bc5810>),
 (u'dog', <pyspark.resultiterable.ResultIterable at 0x7fcde8bc5850>),
 (u'jumped', <pyspark.resultiterable.ResultIterable at 0x7fcde8bc5890>),
 (u'fox.', <pyspark.resultiterable.ResultIterable at 0x7fcde8bc58d0>)]

In [10]:
# Walk the grouped pairs and turn each group's ResultIterable into a list so
# the grouped values are actually visible.
for k, v in pairs_RDD.groupByKey().collect():
    # A single %-formatted argument prints the same text under Python 2 and
    # Python 3; the original multi-argument `print a, b` statement is a
    # SyntaxError on Python 3. The spacing reproduces the original
    # "Key: <k> , Values: <list>" output exactly.
    print("Key: %s , Values: %s" % (k, list(v)))

Key: A , Values: [1, 1]
Key: dog. , Values: [1]
Key: lazy , Values: [1, 1]
Key: over , Values: [1, 1]
Key: fox , Values: [1]
Key: a , Values: [1, 1]
Key: quick , Values: [1, 1]
Key: brown , Values: [1, 1]
Key: dog , Values: [1]
Key: jumped , Values: [1, 1]
Key: fox. , Values: [1]
