# Misc. Activities in PySpark
These exercises follow along with the Real Python tutorial: https://realpython.com/pyspark-intro/

In [1]:
# sorted, filter, map and reduce are Python's core "functional tools".
# Being comfortable with them first makes the PySpark API easier to pick up.

from functools import reduce
x = ['Python', 'programming', 'is', 'awesome!']

case_insensitive = sorted(x, key=str.lower)                 # order ignoring upper/lower case
short_words = list(filter(lambda word: len(word) < 8, x))   # keep words shorter than 8 characters
upper_words = list(map(str.upper, x))                       # upper-case every word
joined = reduce(lambda acc, word: acc + word, x)            # concatenate left to right

print('Sorted:', case_insensitive)
print('Filter:', short_words)
print('Mapped:', upper_words)
print('Reduce:', joined)

# filter and map hand back lazy iterators (hence the list() calls), so they replace
# the pattern of looping with a for loop and appending to a list by hand.
# sorted, filter and map each produce a new iterable; reduce collapses the input to 1 value.

Sorted: ['awesome!', 'is', 'programming', 'Python']
Filter: ['Python', 'is']
Mapped: ['PYTHON', 'PROGRAMMING', 'IS', 'AWESOME!']
Reduce: Pythonprogrammingisawesome!


In [19]:
# Read a csv (from my court cases project) as plain text lines
# and do some basic operations with it.

import findspark
findspark.init()  # Must run before `import pyspark` so the pyspark library can be found

import pyspark

# getOrCreate() accepts a SparkConf (not a master URL string), so the master is set
# on a SparkConf. If a SparkContext is already active, that one is returned instead.
# 'local' tells Spark we're on a single local machine.
# '[*]' tells Spark to create as many worker threads as logical cores on the machine.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setMaster('local[*]'))

txt = sc.textFile('cases.csv')  # one RDD element per line of the file
print(txt.count())              # total number of lines

# Case-insensitive match on 'william'.
# The name `python_lines` is kept unchanged in case other cells refer to it.
python_lines = txt.filter(lambda line: 'william' in line.lower())
print(python_lines.count())     # number of matching lines

7735
93


In [23]:
# sc.parallelize turns a local collection into an RDD; numSlices=2 asks for 2 partitions.
# .filter here is the RDD method, but it is used the same way as Python's built-in filter.
# take(5) returns just the first 5 elements, a bit like pd.head().
big_list = range(10_000)
rdd = sc.parallelize(big_list, numSlices=2)
odds = rdd.filter(lambda n: n % 2 == 1)
odds.take(5)

[1, 3, 5, 7, 9]