# Notebook for Getting Top Tokens

* Use a small sample set of (key, value) pairs to work out how to count the top tokens

In [1]:
%run '../spark_variables.ipynb'

In [2]:
# Start the SparkContext
from pyspark import SparkContext

# getOrCreate() returns the already-active context when there is one, so
# re-running this cell does not fail with
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()

### Create a Sample Dataset

* Something simpler than the tweets
* Mimic what comes out of the tokenizer: a Python `set` of tokens, e.g. `{'a', 'b', 'c'}`
* The set will automatically remove duplicate tokens / words in each tweet

In [3]:
# Sample (key, token-set) pairs; note that key '111' occurs twice.
sample_pairs = [
    ('111', {'a', 'b', 'c'}),
    ('111', {'a', 'b', 'd'}),
    ('222', {'a', 'b', 'e'}),
    ('333', {'a', 'd', 'g'}),
]
rdd = sc.parallelize(sample_pairs)
rdd.collect()

[('111', {'a', 'b', 'c'}),
 ('111', {'a', 'b', 'd'}),
 ('222', {'a', 'b', 'e'}),
 ('333', {'a', 'd', 'g'})]

### Create a new RDD that has the count of each token

* a token is counted once per key, no matter how many of that key's records contain it
* when the same token appears under a second key (a second user), it is counted again

In [4]:
token_counts = (
    rdd
    # Union the token sets that share a key, so each token is counted
    # at most once per key.
    .reduceByKey(lambda left, right: left | right)
    # Emit one (token, 1) pair for every distinct token of each key.
    .flatMap(lambda kv: ((token, 1) for token in kv[1]))
    # Sum the ones to get the number of keys each token appears under.
    .reduceByKey(lambda m, n: m + n)
)

In [5]:
# Show every (token, count) pair, then the number of distinct tokens.
all_counts = token_counts.collect()
distinct_tokens = token_counts.count()
print(all_counts)
print(distinct_tokens)

[('d', 2), ('c', 1), ('b', 2), ('g', 1), ('a', 3), ('e', 1)]
6


### Count the Top Tokens

* Because this is a small dataset, we'll treat a token as "top" when its count is >= 2


In [6]:
MIN_COUNT = 2  # smallest count a token needs to be kept
TOP_N = 5      # how many of the kept tokens to pull back to the driver

# Keep tokens whose count reaches MIN_COUNT, highest count first.
top_tokens = (
    token_counts
    .filter(lambda pair: pair[1] >= MIN_COUNT)
    .sortBy(lambda pair: pair[1], ascending=False)
    .cache()  # the RDD is used by both count() and take() below
)

frequent_tokens = top_tokens.count()
top_x = top_tokens.take(TOP_N)

print(frequent_tokens)
print(top_x)

3
[('a', 3), ('d', 2), ('b', 2)]


In [7]:
# Shut down the SparkContext now that the notebook is finished with it.
sc.stop()