Working through the Spark RDD Programming Guide: <https://spark.apache.org/docs/latest/rdd-programming-guide.html>

In [1]:
# Locate the local Spark installation and make pyspark importable.
# This has to run before the first `import pyspark` in the notebook.
import findspark
findspark.init()

In [2]:
from pyspark import SparkContext, SparkConf

In [3]:
# Configuration for a Spark instance running locally inside this process.
conf = SparkConf().setAppName("foo").setMaster("local")
# getOrCreate() returns the context already running in this kernel if there is
# one, so this cell can be re-run safely; calling SparkContext(conf=conf) a
# second time raises ValueError ("Cannot run multiple SparkContexts at once").
# Note: when a context already exists, `conf` is ignored and the existing
# context is returned unchanged.
sc = SparkContext.getOrCreate(conf=conf)

# Creating RDDs: parallelized collections and external datasets

In [4]:
# A small driver-side collection (the integers 0..4) ...
data = range(5)
# ... and the RDD created from it via the SparkContext.
distData = sc.parallelize(data)

In [6]:
# collect() brings every element of the RDD back to the driver as a list.
distData.collect()

[0, 1, 2, 3, 4]

In [7]:
# Fold the RDD down to a single value by pairwise addition.
distData.reduce(lambda total, value: total + value)

10

In [8]:
# Create an RDD from a text file.
# NOTE(review): the absolute path assumes Spark is installed under /opt/spark
# on the machine running this notebook -- confirm before sharing.
distFile = sc.textFile("/opt/spark/README.md")

In [9]:
# Sum of the lengths of all records in the file: map each record to its
# length, then add the lengths together.
distFile.map(len).reduce(lambda total, n: total + n)

3706

# RDD operations

In [10]:
# Step-by-step version: load the file, map every record to its length,
# then reduce the lengths to their sum.
lines = sc.textFile("/opt/spark/README.md")
lineLengths = lines.map(len)
totalLengths = lineLengths.reduce(lambda total, n: total + n)

In [11]:
# Display the summed record lengths of the README.
totalLengths

3706

In [12]:
# Setup for the closure experiment: an RDD built from the local collection,
# and a plain driver-side integer that the experiment will try to update.
rdd = sc.parallelize(data)
counter = 0

In [20]:
# WRONG -- the behavior of this code is undefined; use an Accumulator instead.
# The counter that each node sees is not the same as the counter that the
# driver sees (and that we read once the job is done), so the increments
# performed inside foreach() are not reflected in the driver's variable.
# The same thing is why something like rdd.foreach(print) won't show anything
# on the driver of a cluster: the print happens on each executor node, not on
# the driver.
# (You can print the result of rdd.collect() on the driver instead, but this
# requires that the whole RDD be copied to the single driver machine.)
def increment_counter(x):
    """Add x to the global `counter` of whichever process executes the call."""
    global counter
    counter += x

rdd.foreach(increment_counter)

# Displayed on the driver: still 0 in this run, the increments were lost.
counter

0

In [21]:
# The driver-side source collection is untouched: still the range object.
data

range(0, 5)

In [39]:
# Count how many times each distinct line occurs in the README:
# pair every line with 1, then add up the 1s per identical line.
lines = sc.textFile("/opt/spark/README.md")
pairs = lines.map(lambda r: (r, 1))
counts = pairs.reduceByKey(lambda a, b: a + b)

# Optional sorting -- the result is unsorted while both alternatives below
# stay commented out.

# Sort by the key, which is the text of the line here.
# (Sort `counts`, not `pairs`: sorting `pairs` would discard the reduction.)
#counts = counts.sortByKey()

# Sort by the value, which is here the number of occurrences of the line.
#counts = counts.sortBy(lambda kv: kv[1])

In [33]:
# Bring the (line, count) pairs back to the driver for display.
# NOTE(review): this cell's execution count (33) is lower than that of the
# cell defining `counts` (39), so the output shown may predate the current
# definition -- re-run the notebook top to bottom to confirm.
counts.collect()

[('# Apache Spark', 1),
 ('', 39),
 ('Spark is a fast and general cluster computing system for Big Data. It provides',
  1),
 ('high-level APIs in Scala, Java, Python, and R, and an optimized engine that',
  1),
 ('supports general computation graphs for data analysis. It also supports a',
  1),
 ('rich set of higher-level tools including Spark SQL for SQL and DataFrames,',
  1),
 ('MLlib for machine learning, GraphX for graph processing,', 1),
 ('and Spark Streaming for stream processing.', 1),
 ('<http://spark.apache.org/>', 1),
 ('## Online Documentation', 1),
 ('You can find the latest Spark documentation, including a programming', 1),
 ('guide, on the [project web page](http://spark.apache.org/documentation.html).',
  1),
 ('This README file only contains basic setup instructions.', 1),
 ('## Building Spark', 1),
 ('Spark is built using [Apache Maven](http://maven.apache.org/).', 1),
 ('To build Spark and its example programs, run:', 1),
 ('    build/mvn -DskipTests clean package'