In [None]:
from pyspark.sql import  SparkSession, SQLContext
# Build (or reuse) a local SparkSession running with two worker threads.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("SDG_Chapter12")
    .getOrCreate()
)
sc = spark.sparkContext
# SQLContext has to be instantiated with the SparkContext; assigning the bare
# class would leave `sqlContext` unusable as a context object.
sqlContext = SQLContext(sc)

In [2]:
# Build a 300-row DataFrame with spark.range and drop down to its underlying
# RDD; the elements are Row objects rather than bare integers.
r = spark.range(300).rdd

In [3]:
# Action: count the elements in the RDD.
r.count()

300

In [4]:
# Action: fetch the first element (a Row, because the RDD came from a DataFrame).
r.first()

Row(id=0)

In [5]:
# Unwrap every Row to its first field. map is a lazy transformation, so the
# cell only displays the resulting RDD's repr, not any data.
spark.range(10).toDF("id").rdd.map(lambda row: row[0])

PythonRDD[12] at RDD at PythonRDD.scala:53

In [6]:
# 11-row single-column DataFrame with the column named "id1".
df = spark.range(11).toDF("id1")

In [7]:
# Confirm the column name assigned by toDF.
df.columns

['id1']

In [8]:
# With a single bucket the result is ([min, max], [count]). The elements are
# Row objects, so the bucket boundaries come back as Rows too.
r.histogram(1)

([Row(id=0), Row(id=299)], [300])

In [9]:
# toDF with a single name renames the lone column; nothing is computed yet,
# so only the DataFrame schema is displayed.
spark.range(10).toDF("id2")

DataFrame[id2: bigint]

In [10]:
# Give the RDD a human-readable name; setName returns the RDD itself, whose
# repr now starts with that name.
r.setName("r")

r MapPartitionsRDD[4] at javaToPython at NativeMethodAccessorImpl.java:0

In [11]:
# Read back the name assigned with setName.
r.name()

'r'

In [12]:
# Tokenize the title on single spaces (the ":" becomes its own token) ...
myCollection = "Spark The Definite Guide : Big Data Processing Made Simple".split(" ")
# ... and distribute the tokens as an RDD with 2 partitions.
words = sc.parallelize(myCollection, 2)

In [13]:
# Name the RDD and read the name straight back; setName returns the RDD
# itself, so the two calls can be chained.
words.setName("myWords").name()

'myWords'

In [14]:
# Number of unique tokens in the RDD.
words.distinct().count()

10

In [15]:
def startsWithS(individual):
    """Return True when the given string begins with an upper-case "S"."""
    begins_with_s = individual.startswith("S")
    return begins_with_s

In [16]:
# Keep only the words accepted by the startsWithS predicate; the function is
# passed directly instead of being wrapped in a lambda.
words.filter(startsWithS).collect()

['Spark', 'Simple']

In [17]:
# Project each word to a (word, first character, starts-with-"S" flag) tuple.
# NOTE(review): word[0] assumes every token is non-empty — confirm if the
# input text can contain consecutive spaces.
words2 = words.map(lambda word: (word, word[0], word.startswith("S")))

In [18]:
# Materialize the projected tuples on the driver for inspection.
words2.collect()

[('Spark', 'S', True),
 ('The', 'T', False),
 ('Definite', 'D', False),
 ('Guide', 'G', False),
 (':', ':', False),
 ('Big', 'B', False),
 ('Data', 'D', False),
 ('Processing', 'P', False),
 ('Made', 'M', False),
 ('Simple', 'S', True)]

In [19]:
# Keep the records whose third field (the starts-with-"S" flag) is truthy and
# bring back at most five of them.
words2.filter(lambda record: record[2]).take(5)

[('Spark', 'S', True), ('Simple', 'S', True)]

### `filter` on an RDD is the equivalent of `where` on a DataFrame


In [20]:
# Explode every word into its characters: list(word) yields the characters
# and flatMap flattens them into a single RDD.
words.flatMap(list).collect()

['S',
 'p',
 'a',
 'r',
 'k',
 'T',
 'h',
 'e',
 'D',
 'e',
 'f',
 'i',
 'n',
 'i',
 't',
 'e',
 'G',
 'u',
 'i',
 'd',
 'e',
 ':',
 'B',
 'i',
 'g',
 'D',
 'a',
 't',
 'a',
 'P',
 'r',
 'o',
 'c',
 'e',
 's',
 's',
 'i',
 'n',
 'g',
 'M',
 'a',
 'd',
 'e',
 'S',
 'i',
 'm',
 'p',
 'l',
 'e']

In [21]:
# Sort by negated length so the longest words come first, then take two.
words.sortBy(lambda word: -len(word)).take(2)

['Processing', 'Definite']

In [22]:
# Order the words from shortest to longest, using the builtin len as the key.
words.sortBy(len).collect()

[':',
 'The',
 'Big',
 'Data',
 'Made',
 'Spark',
 'Guide',
 'Simple',
 'Definite',
 'Processing']

In [23]:
# Randomly split the words into two RDDs with equal weights. The seed is
# fixed so that Restart & Run All produces the same split every time; without
# it the membership of each half changes from run to run.
fiftyFiftySplit = words.randomSplit([0.5, 0.5], seed=42)

In [26]:
# randomSplit hands back a plain Python list (one RDD per weight).
type(fiftyFiftySplit)

list

In [27]:
# Show the two RDDs produced by the split (reprs only; nothing is collected).
fiftyFiftySplit

[PythonRDD[40] at RDD at PythonRDD.scala:53,
 PythonRDD[41] at RDD at PythonRDD.scala:53]

In [28]:
# Contents of the first split.
fiftyFiftySplit[0].collect()

['Definite', 'Guide', 'Data', 'Made', 'Simple']

In [29]:
# Contents of the second split.
fiftyFiftySplit[1].collect()

['Spark', 'The', ':', 'Big', 'Processing']

# Actions

In [34]:
# Sum the integers 1..20 with a pairwise reduce.
sc.parallelize(range(1, 21)).reduce(lambda x, y: x+y)

210

In [35]:
# The elements are Rows, and Row is a tuple subclass, so `+` concatenates:
# the reduce yields one flat tuple of all id1 values, not a numeric sum.
df.rdd.reduce(lambda x,y: x+y)

(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10)

In [36]:
def wordLengthReducer(leftWord, rightWord):
    """Pick the longer of two words; on equal length the right-hand word wins."""
    return leftWord if len(leftWord) > len(rightWord) else rightWord

In [38]:
# Reduce the RDD to its longest word using wordLengthReducer.
words.reduce(wordLengthReducer)

'Processing'

In [39]:
# Approximate count: return whatever estimate is available once the timeout
# (in milliseconds) expires, at the requested confidence level.
confidence = 0.95
timeoutMilliseconds = 400
words.countApprox(timeout=timeoutMilliseconds, confidence=confidence)

10

In [40]:
# Approximate distinct count; the single argument is the relative standard
# deviation (accuracy) of the estimate.
words.countApproxDistinct(0.05)

10

In [41]:
# PySpark's countApproxDistinct accepts only relativeSD; the two-argument
# (p, sp) precision form belongs to the Scala/Java API. The failure is caught
# and printed so this demonstration does not abort a Restart & Run All.
try:
    words.countApproxDistinct(4, 10)
except TypeError as err:
    print("TypeError:", err)

TypeError: countApproxDistinct() takes from 1 to 2 positional arguments but 3 were given

In [42]:
# Count occurrences of each distinct word; the result is a dict on the driver.
words.countByValue()

defaultdict(int,
            {'Spark': 1,
             'The': 1,
             'Definite': 1,
             'Guide': 1,
             ':': 1,
             'Big': 1,
             'Data': 1,
             'Processing': 1,
             'Made': 1,
             'Simple': 1})

In [43]:
# countByKey takes element[0] as the key. The elements here are plain strings
# rather than (key, value) pairs, so the "key" is each word's first character
# and the result is a count per initial letter.
words.countByKey()

defaultdict(int,
            {'S': 2, 'T': 1, 'D': 2, 'G': 1, ':': 1, 'B': 1, 'P': 1, 'M': 1})

In [44]:
# Write the RDD out as text files under a local directory.
# NOTE(review): no compression codec is passed despite the directory name, and
# Spark normally refuses to write into an existing directory, so re-running
# this cell likely fails unless the path is removed first — confirm.
words.saveAsTextFile("file:/tmp/bookTitleCompressed")

In [45]:
# Mark the RDD for caching at the default storage level; cache returns the
# RDD itself, which is why its repr is displayed.
words.cache()

myWords ParallelCollectionRDD[15] at parallelize at PythonRDD.scala:195

In [46]:
# Inspect the storage level currently assigned to the RDD.
words.getStorageLevel()

StorageLevel(False, True, False, False, 1)

In [48]:
# Tell Spark where checkpoint files go, then mark the RDD for checkpointing.
# NOTE(review): this appears to be the same local directory the
# saveAsTextFile cell wrote to; a dedicated checkpoint directory would keep
# checkpoint data apart from the text output — confirm intent.
sc.setCheckpointDir("/tmp/bookTitleCompressed")
words.checkpoint()

In [49]:
# Pipe each partition's elements through the external command `wc -l`;
# one line count comes back per partition.
words.pipe("wc -l").collect()

['5', '5']

In [50]:
# The function runs once per partition and emits a single 1 each time, so the
# sum equals the number of partitions.
words.mapPartitions(lambda part: [1]).sum()

2