# Name: Ziling Huang
# username: zhu51

In [3]:
# start the Spark context
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!pip install -q pyspark


import os
os.environ['PYTHONHASHSEED']="0"
os.environ["PYSPARK_PYTHON"]="python3"
os.environ["JAVA_HOME"]="/usr/lib/jvm/java-8-openjdk-amd64/"
# A few additional libraries we will need
from math import sqrt

import pyspark
from pyspark import SparkConf, SparkContext
from pyspark.sql import *

try:
  conf = SparkConf().setMaster("local[*]").set("spark.executor.memory", "1g").set("spark.executorEnv.PYTHONHASHSEED","0").set("spark.ui.port", "4050")
  sc = SparkContext(conf = conf)
  spark = SparkSession.builder.getOrCreate()
except ValueError:
  #it's ok if the server is already started
  pass

def dbg(x):
  """ A helper function to print debugging information on RDDs """
  if isinstance(x, pyspark.RDD):
    print([(t[0], list(t[1]) if 
            isinstance(t[1], pyspark.resultiterable.ResultIterable) else t[1])
           if isinstance(t, tuple) else t
           for t in x.take(100)])
  else:
    print(x)
    

import unittest
Test = unittest.TestCase()

[K     |████████████████████████████████| 281.4 MB 31 kB/s 
[K     |████████████████████████████████| 198 kB 44.0 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [2]:
pair = ('a','b') 
print(pair[0], pair[1])

a b


In [4]:
#Let’s consider a quick example
pets = sc.parallelize([('cat',1), ('dog',3), ('cat',2),('dog',1),('hamster',1)]) 
dbg(pets.reduceByKey(lambda x,y: x+y))
dbg(pets.groupByKey())
dbg(pets.sortByKey())

[('cat', 3), ('dog', 4), ('hamster', 1)]
[('cat', [1, 2]), ('dog', [3, 1]), ('hamster', [1])]
[('cat', 1), ('cat', 2), ('dog', 3), ('dog', 1), ('hamster', 1)]


In [5]:
#Let’s try for a more complex example, word count and working with files. 
#First use a shell command to download the text of Peter Pan from the Guttenberg project
!wget -q -O peterpan.txt https://www.gutenberg.org/files/16/16-0.txt
# load the file into a distributed dataset of lines
file = sc.textFile("peterpan.txt")
# split each line into (word, 1) tuples
words = file.flatMap(lambda line: [(word.lower(), 1) for word in line.split(" ")])
# reduce by key (the word) the counts and sort descending
counts = words.reduceByKey(lambda a, b: a + b).sortBy(lambda x: x[1], False) 
dbg(counts.collect())



# 3a. [20 points] Compute the TFij for each term i with j as Peter Pan

In [81]:
import time
time_start = time.time()

In [82]:
high = counts.take(1)[0][1]
TFij = counts.map(lambda x:(x[0], x[1]/high)).cache()
dbg(TFij)

[('the', 1.0), ('', 0.7872596153846154), ('and', 0.5677083333333334), ('to', 0.49399038461538464), ('he', 0.41225961538461536), ('a', 0.390625), ('of', 0.3870192307692308), ('was', 0.36217948717948717), ('in', 0.28725961538461536), ('it', 0.23798076923076922), ('that', 0.2327724358974359), ('she', 0.2299679487179487), ('they', 0.2263621794871795), ('had', 0.20192307692307693), ('his', 0.18870192307692307), ('you', 0.18028846153846154), ('but', 0.17708333333333334), ('for', 0.16266025641025642), ('not', 0.15865384615384615), ('her', 0.1482371794871795), ('with', 0.1482371794871795), ('is', 0.13661858974358973), ('at', 0.1346153846153846), ('as', 0.1346153846153846), ('on', 0.13421474358974358), ('i', 0.10136217948717949), ('have', 0.10056089743589744), ('be', 0.09975961538461539), ('were', 0.09735576923076923), ('peter', 0.09695512820512821), ('all', 0.09495192307692307), ('this', 0.09134615384615384), ('said', 0.08733974358974358), ('their', 0.08693910256410256), ('so', 0.0861378205128

# 3b. [10 points] Load the following 3 documents into RDDs
### The Adventures of Sherlock Holmes by Arthur Conan Doyle
http://www.gutenberg.org/files/1661/1661-0.txt
### Alice's Adventures in Wonderland by Lewis Carroll
http://www.gutenberg.org/files/11/11-0.txt
### Adventures of Huckleberry Finn by Mark Twain
http://www.gutenberg.org/files/76/76-0.txt


In [83]:
!wget -q -O SherlockHolmes.txt http://www.gutenberg.org/files/1661/1661-0.txt
!wget -q -O Alice.txt http://www.gutenberg.org/files/11/11-0.txt
!wget -q -O Huckleberry.txt http://www.gutenberg.org/files/76/76-0.txt

# 3c. [20 points] Compute IDFi for each term i over all N=4 documents in our RDDs

expected output for IDF_the (the inverse document frequency of the word ‘the’) is 0 because the word ‘the’ appears in 4 out of 4 documents and log2(4 / 4) == 0
expected output for some other words:
[('frightfully', 2.0), ('wendy,”', 2.0), ('wendy?”', 2.0), ('peter?”', 2.0), ('nibs,', 2.0), … (‘dogs’, 0.4150374992788437)...]


In [84]:
from math import log2
from typing import Union

# IDFi = log2(appear in how many/total num of doc)
PeterPanWord = file.flatMap(lambda words:[(word.lower(), 1) for word in words.split(" ")]).distinct()
PeterPanCount = PeterPanWord.reduceByKey(lambda a, b:a+b).sortBy(lambda i:i[1], False).cache()

# load the file into a distributed dataset of lines
Sherlockfile = sc.textFile("SherlockHolmes.txt")
SherlockWord = Sherlockfile.flatMap(lambda words:[(word.lower(), 1) for word in words.split(" ")]).distinct()
SherlockCount = SherlockWord.reduceByKey(lambda a, b:a+b).sortBy(lambda i:i[1], False).cache()

Alicefile = sc.textFile("Alice.txt")
# print(Alicefile.collect())
AliceWord = Alicefile.flatMap(lambda words:[(word.lower(), 1) for word in words.split(" ")]).distinct()
AliceCount = AliceWord.reduceByKey(lambda a, b:a+b).sortBy(lambda i:i[1], False).cache()

Hucklefile = sc.textFile("Huckleberry.txt")
HucklekWord = Hucklefile.flatMap(lambda words:[(word.lower(), 1) for word in words.split(" ")]).distinct()
HucklekCount = HucklekWord.reduceByKey(lambda a, b:a+b).sortBy(lambda i:i[1], False).cache()


In [85]:
result = (PeterPanCount
        .union(SherlockCount)
        .union(AliceCount)
        .union(HucklekCount)
        .groupByKey()
        .mapValues(len)
        .map(lambda a : (a[0], log2(4/a[1])))
        .cache())
dbg(result)
result.lookup("dogs")[0]

[('gutenberg', 0.0), ('of', 0.0), ('james', 0.4150374992788437), ('', 0.0), ('other', 0.0), ('are', 0.0), ('check', 0.0), ('where', 0.0), ('using', 0.0), ('author:', 0.0), ('date:', 0.0), ('february', 0.4150374992788437), ('set', 0.0), ('produced', 0.0), ('research', 0.0), ('start', 0.0), ('fulcrum', 1.0), ('current', 0.0), ('publication,', 2.0), ('ii.', 0.0), ('iv.', 0.0), ('v.', 0.0), ('vii.', 0.0), ('ix.', 0.0), ('bird', 0.0), ('x.', 0.0), ('believe', 0.0), ('xvi.', 1.0), ('when', 0.0), ('one,', 0.0), ('two', 0.0), ('garden,', 0.4150374992788437), ('plucked', 1.0), ('flower', 1.0), ('i', 0.0), ('rather', 0.0), ('heart', 0.0), ('cried,', 0.4150374992788437), ('why', 0.0), ('like', 0.0), ('them', 0.0), ('but', 0.0), ('came', 0.0), ('romantic', 2.0), ('mocking', 1.0), ('boxes,', 1.0), ('however', 1.0), ('mouth', 0.0), ('corner.', 0.4150374992788437), ('he', 0.0), ('trying', 0.0), ('thought', 0.0), ('stocks', 2.0), ('really', 0.0), ('seemed', 0.0), ('would', 0.0), ('married', 0.41503749

0.4150374992788437

# 3d. [20 points] Compute TF.IDF score for each term in Peter Pan and collect/display the top 100 terms by TF.IDF score

some expected output:
[('wendy', 0.16185897435897437), ('darling', 0.07532051282051282),...

In [86]:
TFIDF = TFij.union(result).reduceByKey(lambda a, b: a * b).sortBy(lambda a: a[0], False).cache()
dbg(TFIDF.lookup('wendy'))
dbg(TFIDF)

[0.16185897435897437]
[('“’twas', 0.0008012820512820513), ('“’tis', 2.0), ('“’s’death', 0.0008012820512820513), ('“’pon', 2.0), ('“‘—found', 2.0), ('“‘you', 2.0), ('“‘yes.’', 2.0), ('“‘yes,', 2.0), ('“‘why,’', 2.0), ('“‘why,', 2.0), ('“‘why', 2.0), ('“‘which', 2.0), ('“‘where', 2.0), ('“‘whatever', 2.0), ('“‘what,', 2.0), ('“‘what', 2.0), ('“‘well,’', 2.0), ('“‘well,', 2.0), ('“‘we', 2.0), ('“‘very', 2.0), ('“‘undoubtedly', 2.0), ('“‘tut,', 2.0), ('“‘to', 2.0), ('“‘this', 2.0), ('“‘they', 2.0), ('“‘there', 2.0), ('“‘then,', 2.0), ('“‘then', 2.0), ('“‘the', 2.0), ('“‘that’s', 2.0), ('“‘that', 2.0), ('“‘thank', 2.0), ('“‘ten', 2.0), ('“‘tell', 2.0), ('“‘surely', 2.0), ('“‘stolen!’', 2.0), ('“‘some', 2.0), ('“‘sold', 2.0), ('“‘so,’', 2.0), ('“‘she', 2.0), ('“‘see,', 0.0008012820512820513), ('“‘robert', 2.0), ('“‘quite', 2.0), ('“‘put', 2.0), ('“‘precisely.’', 2.0), ('“‘precisely', 2.0), ('“‘pooh!’', 2.0), ('“‘photography', 2.0), ('“‘perhaps', 2.0), ('“‘or', 2.0), ('“‘only', 2.0), ('“‘one'

# 3e. [10 points] 
Add overall process timing output to your code (try to include all processing but be careful not to print/collect any data except for the final top 100) to compute TF.IDF top 100 for Peter Pan:

import time

time_start = time.time()
…
print(rdd.take(100))

time_end = time.time()
print("elapsed time is %s" % str(time_end-time_start))

NOTE: my reference implementation takes around 4-6 seconds including all steps


In [87]:
print(TFIDF.take(100))
time_end = time.time()
print("Elapsed time is %s" % str(time_end-time_start))

[('“’twas', 0.0008012820512820513), ('“’tis', 2.0), ('“’s’death', 0.0008012820512820513), ('“’pon', 2.0), ('“‘—found', 2.0), ('“‘you', 2.0), ('“‘yes.’', 2.0), ('“‘yes,', 2.0), ('“‘why,’', 2.0), ('“‘why,', 2.0), ('“‘why', 2.0), ('“‘which', 2.0), ('“‘where', 2.0), ('“‘whatever', 2.0), ('“‘what,', 2.0), ('“‘what', 2.0), ('“‘well,’', 2.0), ('“‘well,', 2.0), ('“‘we', 2.0), ('“‘very', 2.0), ('“‘undoubtedly', 2.0), ('“‘tut,', 2.0), ('“‘to', 2.0), ('“‘this', 2.0), ('“‘they', 2.0), ('“‘there', 2.0), ('“‘then,', 2.0), ('“‘then', 2.0), ('“‘the', 2.0), ('“‘that’s', 2.0), ('“‘that', 2.0), ('“‘thank', 2.0), ('“‘ten', 2.0), ('“‘tell', 2.0), ('“‘surely', 2.0), ('“‘stolen!’', 2.0), ('“‘some', 2.0), ('“‘sold', 2.0), ('“‘so,’', 2.0), ('“‘she', 2.0), ('“‘see,', 0.0008012820512820513), ('“‘robert', 2.0), ('“‘quite', 2.0), ('“‘put', 2.0), ('“‘precisely.’', 2.0), ('“‘precisely', 2.0), ('“‘pooh!’', 2.0), ('“‘photography', 2.0), ('“‘perhaps', 2.0), ('“‘or', 2.0), ('“‘only', 2.0), ('“‘one', 2.0), ('“‘on', 2.0),