# PySpark basics
Based on [this post](https://medium.com/@MariumFaheem/big-data-with-pyspark-58e7ee2b1299).

PySpark cheat sheet [here](https://www.datacamp.com/blog/pyspark-cheat-sheet-spark-in-python).

In [None]:
!java --version

In [None]:
!pip install pyspark

In [None]:
from pyspark import SparkConf, SparkContext

### Connecting to Spark cluster

In [None]:
# Entry point for working with RDDs.
# getOrCreate reuses the active SparkContext if one already exists, so this
# cell can be re-run safely; calling SparkContext(...) a second time raises
# "Cannot run multiple SparkContexts at once".
# NOTE: the configuration is only applied when a new context is created.
sc = SparkContext.getOrCreate(conf=SparkConf().setAppName("pyspark-basics"))

### Loading different types of data into an RDD

In [None]:
# Distribute a local Python list (the integers 1 to 5) as an RDD
rdd_list = sc.parallelize(list(range(1, 6)))

In [None]:
# Inspect the Python type of the object returned by sc.parallelize
type(rdd_list)

In [None]:
# Number of partitions the RDD's data is split into
rdd_list.getNumPartitions()

In [None]:
# Bring every element of the RDD back to the driver as a local Python list.
# NOTE: fine for this 5-element RDD; avoid on large datasets, because all of
# the data has to fit in the driver's memory.
rdd_list.collect()

In [None]:
# Add up all the elements of the RDD
rdd_list.sum()

In [None]:
# Count how many times each distinct value occurs in the RDD
rdd_list.countByValue()

In [None]:
# Build a pair RDD from a list of (key, value) tuples.
# Keys repeat on purpose so the by-key operations below have something to group.
rdd_pair = sc.parallelize(
    [
        ("jackets", 57),
        ("shirts", 33),
        ("jeans", 23),
        ("shirts", 23),
        ("jeans", 23),
        ("jeans", 13),
        ("jackets", 40),
    ]
)

In [None]:
# Count how many records exist for each key (the first item of every tuple).
# The values are not summed here; only the occurrences of each key are counted.
rdd_pair.countByKey()

In [None]:
# Return the first record of the RDD
rdd_pair.first()

In [None]:
# Load a text file as an RDD with one record per line.
# NOTE(review): the path is relative; this assumes the notebook is run from
# the directory that contains ./data. Confirm for your setup.
rdd_text = sc.textFile("./data/poem.txt")

In [None]:
# Fetch only the first 5 records (lines) instead of collecting the whole RDD
rdd_text.take(5)

### MapReduce over RDD

In [None]:
# map applies the function to every element and yields a new RDD;
# here each number is doubled
rdd_list_x2 = rdd_list.map(lambda number: number * 2)
rdd_list_x2.collect()

In [None]:
# filter keeps only the elements for which the predicate is true;
# applied here to the doubled RDD built in the previous cell
rdd_list_x2.filter(lambda doubled: doubled > 5).collect()

In [None]:
# reduceByKey merges the values of each key with the given function;
# adding them up gives one total per key
rdd_pair_red = rdd_pair.reduceByKey(lambda left, right: left + right)
rdd_pair_red.collect()

In [None]:
# Same per-key totals, returned in ascending key order
rdd_pair_red.sortByKey(ascending=True).collect()

In [None]:
# map produces exactly one output record per input line, so splitting each
# line on spaces yields one list of words per line (a nested structure)
rdd_text.map(lambda line: line.split(" ")).take(3)

In [None]:
# flatMap flattens the per-line word lists into a single RDD of words
rdd_text.flatMap(lambda line: line.split(" ")).take(15)

### Building the PySpark version of word count

In [None]:
# Word count: tokenize every line, pair each word with 1, sum the ones per
# word, then show the 10 most frequent words as (word, count) tuples.
# split() with no argument splits on any run of whitespace and never yields
# empty strings, so blank lines and repeated spaces are not counted as a ""
# word (which split(" ") would do).
(
    rdd_text.flatMap(lambda line: line.split())
    .map(lambda word: (word, 1))
    .reduceByKey(lambda a, b: a + b)
    # Sort directly on the count instead of swapping key and value twice
    .sortBy(lambda pair: pair[1], ascending=False)
    .take(10)
)

### Stopping Spark context and session

In [None]:
# Shut down the SparkContext and release its resources
sc.stop()