# Setup

In [117]:
# Common imports
import os
import tarfile
from six.moves import urllib
from pyspark.context import SparkContext

# Initializing Spark
sc = SparkContext.getOrCreate()

# Get the data

We first use the text of the [Spark Wikipedia page (English version)](https://en.wikipedia.org/wiki/Apache_Spark)

Then, we consider a publication dataset containing information of over 3 million papers published in computer science conferences and journals (this data was derived from the DBLP system, maintained by Michael Ley at http://www.informatik.uni-trier.de/_ley/db/).

In [122]:
SPARK_WIKI_URL = "https://www.dropbox.com/s/5ctx25rm2xtls30/spark_wiki.txt?raw=1"
SPARK_WIKI_PATH = "/tmp"
SPARK_WIKI_FILE_NAME = "spark_wiki.txt"

def fetch_spark_wiki(url=SPARK_WIKI_URL, path=SPARK_WIKI_PATH, file_name=SPARK_WIKI_FILE_NAME):
    if not os.path.isdir(path):
        os.makedirs(path)
    full_path = os.path.join(path, file_name)
    urllib.request.urlretrieve(url, full_path)

In [138]:
DBLP_TSV_URL = "https://www.dropbox.com/s/4s7do56blmf8cz8/dblp_tsv.tar.gz?raw=1"
DBLP_TSV_PATH = "/tmp"

def fetch_dblp_tsv(url=DBLP_TSV_URL, path=DBLP_TSV_PATH):
    if not os.path.isdir(path):
        os.makedirs(path)
    tgz_path = os.path.join(path, "dblp_tsv.tar.gz")
    urllib.request.urlretrieve(url, tgz_path)
    dblp_tsv_tar = tarfile.open(tgz_path)
    dblp_tsv_tar.extractall(path=path)
    dblp_tsv_tar.close()

In [123]:
fetch_spark_wiki()

In [139]:
fetch_dblp_tsv()

# Create RDD

## By loading an external dataset

In [124]:
textFile = sc.textFile("/tmp/spark_wiki.txt")

In [125]:
textFile.count()

101

In [17]:
textFile.first()

'Apache Spark'

In [19]:
textFile.take(5)

['Apache Spark',
 'From Wikipedia, the free encyclopedia',
 "Apache Spark is an open-source distributed general-purpose cluster-computing framework. Originally developed at the University of California, Berkeley's AMPLab, the Spark codebase was later donated to the Apache Software Foundation, which has maintained it since. Spark provides an interface for programming entire clusters with implicit data parallelism and fault tolerance.",
 '',
 'Overview']

In [34]:
textFile.collect()

['Apache Spark',
 'From Wikipedia, the free encyclopedia',
 "Apache Spark is an open-source distributed general-purpose cluster-computing framework. Originally developed at the University of California, Berkeley's AMPLab, the Spark codebase was later donated to the Apache Software Foundation, which has maintained it since. Spark provides an interface for programming entire clusters with implicit data parallelism and fault tolerance.",
 '',
 'Overview',
 'Apache Spark has as its architectural foundation the resilient distributed dataset (RDD), a read-only multiset of data items distributed over a cluster of machines, that is maintained in a fault-tolerant way.[2] The Dataframe API was released as an abstraction on top of the RDD, followed by the Dataset API. In Spark 1.x, the RDD was the primary application programming interface (API), but as of Spark 2.x use of the Dataset API is encouraged[3] even though the RDD API is not deprecated.[4][5] The RDD technology still underlies the Datas

The DAG is executed when an action occurs.

Thus, the "Input path does not exist" error is not raise on the RDD creation but when the action occurs.

In [28]:
nonExistentTextFile = sc.textFile("/tmp/non_existent_file")

In [29]:
nonExistentTextFile.count()

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: file:/tmp/non_existent_file
	at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:287)
	at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:229)
	at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:315)
	at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:204)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:253)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:251)
	at scala.Option.getOrElse(Option.scala:121)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:251)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:253)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:251)
	at scala.Option.getOrElse(Option.scala:121)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:251)
	at org.apache.spark.api.python.PythonRDD.getPartitions(PythonRDD.scala:55)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:253)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:251)
	at scala.Option.getOrElse(Option.scala:121)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:251)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:945)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:944)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:166)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


## By Distributing a collection

In [23]:
numbers = sc.parallelize([1, 2, 3])

In [24]:
numbers.count()

3

In [26]:
numbers.first()

1

In [27]:
numbers.take(5)

[1, 2, 3]

In [31]:
numbers.collect()

[1, 2, 3]

# RDD Operations

## Transformations

### map

In [32]:
numbers = sc.parallelize([1, 2, 3])
squared_numbers = numbers.map(lambda x: x * x)
squared_numbers.collect()

[1, 4, 9]

In [36]:
def square(x):
    return x * x

numbers = sc.parallelize([1, 2, 3])
squared_numbers = numbers.map(square)
squared_numbers.collect()

[1, 4, 9]

### filter

In [35]:
numbers = sc.parallelize([1, 2, 3])
odd_numbers = numbers.filter(lambda x: x % 2 == 1)
odd_numbers.collect()

[1, 3]

In [37]:
def is_odd(x):
    return x % 2 == 1

numbers = sc.parallelize([1, 2, 3])
odd_numbers = numbers.filter(is_odd)
odd_numbers.collect()

[1, 3]

### flatMap

In [40]:
numbers = sc.parallelize([1, 2, 3])
flatten_numbers = numbers.flatMap(lambda x: [x, 2 * x, x * x])
flatten_numbers.collect()

[1, 2, 1, 2, 4, 4, 3, 6, 9]

### union: returns an RDD containing the union of the elements of the given RDDs

In [44]:
numbers = sc.parallelize([1, 2, 3])
other_numbers = sc.parallelize([3, 4, 5])
union = numbers.union(other_numbers)
union.collect()

[1, 2, 3, 3, 4, 5]

### intersection: returns an RDD containing the intersection of the elements of the given RDDs

In [45]:
numbers = sc.parallelize([1, 2, 3])
other_numbers = sc.parallelize([3, 4, 5])
intersection = numbers.intersection(other_numbers)
intersection.collect()

[3]

### distinct: returns an RDD containing the distinct elements of the given RDD

In [46]:
numbers = sc.parallelize([1, 2, 3, 2, 1])
distinct = numbers.distinct()
distinct.collect()

[1, 2, 3]

### reduceByKey

In [75]:
numbers = sc.parallelize([1, 2, 3, 2, 1])
number_pairs = numbers.map(lambda x: (x, 1))
number_count = number_pairs.reduceByKey(lambda a, b: a + b)
number_count.collect()

[(1, 1), (2, 2), (3, 2)]

### sortByKey

In [162]:
numbers = sc.parallelize([1, 2, 3, 2, 1])
number_pairs = numbers.map(lambda x: (x, 1))
number_count = number_pairs.reduceByKey(lambda a, b: a + b)
sorted_number_count = number_count.map(lambda pair: (pair[1], pair[0])).sortByKey(ascending=False)
sorted_number_count.map(lambda pair: (pair[1], pair[0])).collect()

[(1, 2), (2, 2), (3, 1)]

### join

In [78]:
left = sc.parallelize([(1, 2), (2, 2), (2, 3)])
right = sc.parallelize([(2, 4)])
join = left.join(right)
join.collect()

[(2, (2, 4)), (2, (3, 4))]

### leftOuterJoin

In [80]:
left = sc.parallelize([(1, 2), (2, 2), (2, 3)])
right = sc.parallelize([(2, 4)])
join = left.leftOuterJoin(right)
join.collect()

[(1, (2, None)), (2, (2, 4)), (2, (3, 4))]

### rightOuterJoin

In [81]:
left = sc.parallelize([(1, 2), (2, 2), (2, 3)])
right = sc.parallelize([(2, 4)])
join = left.rightOuterJoin(right)
join.collect()

[(2, (2, 4)), (2, (3, 4))]

### keys: returns an RDD containing the keys of the given pair RDD

In [82]:
pairs = sc.parallelize([(1, 2), (2, 2), (2, 3)])
pairs.keys().collect()

[1, 2, 2]

### values: returns an RDD containing the values of the given pair RDD

In [None]:
pairs = sc.parallelize([(1, 2), (2, 2), (2, 3)])
pairs.values().collect()

## Actions

### collect

In [85]:
numbers = sc.parallelize([1, 2, 3])
numbers.collect()

[1, 2, 3]

### count

In [87]:
numbers = sc.parallelize([1, 2, 3])
numbers.count()

3

### first

In [88]:
numbers = sc.parallelize([1, 2, 3])
numbers.first()

1

### take

In [89]:
numbers = sc.parallelize([1, 2, 3])
numbers.take(2)

[1, 2]

### takeSample

In [90]:
numbers = sc.parallelize([1, 2, 3, 4, 5, 6])
numbers.takeSample(withReplacement=True, num=3, seed=42)

[4, 6, 4]

In [93]:
numbers = sc.parallelize([1, 2, 3, 4, 5, 6])
numbers.takeSample(withReplacement=False, num=3, seed=42)

[4, 2, 3]

# Examples using the DBLP dataset

## Explore the dataset

In [142]:
papers = sc.textFile("/tmp/dblp_tsv/papers.tsv")

In [143]:
papers.count()

3150923

In [144]:
papers.first()

'0\tParallel Integer Sorting and Simulation Amongst CRCW Models.\t0\t607-619\thttp://dx.doi.org/10.1007/BF03036466'

In [145]:
papers.take(5)

['0\tParallel Integer Sorting and Simulation Amongst CRCW Models.\t0\t607-619\thttp://dx.doi.org/10.1007/BF03036466',
 '1\tPattern Matching in Trees and Nets.\t1\t227-248\thttp://dx.doi.org/10.1007/BF01257084',
 '2\tNP-complete Problems Simplified on Tree Schemas.\t1\t171-178\thttp://dx.doi.org/10.1007/BF00289414',
 '3\tOn the Power of Chain Rules in Context Free Grammars.\t3\t425-433\thttp://dx.doi.org/10.1007/BF00264161',
 '4\tSchnelle Multiplikation von Polynomenüber Körpern der Charakteristik 2.\t4\t395-398\thttp://dx.doi.org/10.1007/BF00289470']

In [146]:
paperauths = sc.textFile("/tmp/dblp_tsv/paperauths.tsv")

In [147]:
paperauths.count()

8997871

In [148]:
paperauths.first()

'1647633\t14170'

In [149]:
paperauths.take(5)

['1647633\t14170',
 '1125503\t848309',
 '1711987\t1184704',
 '2602130\t863089',
 '2150495\t704198']

In [150]:
venue = sc.textFile("/tmp/dblp_tsv/venue.tsv")

In [151]:
venue.count()

241272

In [152]:
venue.first()

'795226\tCoRR\t2014\t""\tabs/1409.0286\t""\t0'

In [153]:
venue.take(5)

['795226\tCoRR\t2014\t""\tabs/1409.0286\t""\t0',
 '840379\tComputational Biology and Chemistry\t2008\t""\t32\t1\t0',
 '107610\tEarth Science Informatics\t2008\t""\t1\t1\t0',
 '617896\tIJPEDS\t2009\t""\t24\t4\t0',
 '786188\tCoRR\t2014\t""\tabs/1409.0289\t""\t0']

In [154]:
authors = sc.textFile("/tmp/dblp_tsv/authors.tsv")

In [155]:
authors.count()

1691340

In [156]:
authors.first()

'1112044\tDennis Matthews'

In [157]:
authors.take(5)

['1112044\tDennis Matthews',
 '370385\tNeyre Tekbiyik',
 '416706\tRosa Manrique',
 '1308005\tWilliam Pisano',
 '577458\tDragan S. Popovic']