# Introduction to PySpark RDD

In [4]:
from pyspark.sql import SparkSession
# Obtain a SparkSession through the builder (getOrCreate hands back an
# existing session when there is one instead of starting a second).
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

# The SparkContext behind the session; the RDD examples below go through it.
sc = spark.sparkContext

### RDDs from Parallelized Collections

In [5]:
# In-memory Python list that will be turned into an RDD
words = ["Spark", "is", "a", "framework", "for", "Big Data processing"]

# Hand the list to the SparkContext to build the RDD
RDD = sc.parallelize(words)

# Show which class the resulting object belongs to
rdd_type = type(RDD)
print("The type of RDD is", rdd_type)

The type of RDD is <class 'pyspark.rdd.RDD'>


### RDDs from External Datasets

In [6]:
# Path of the CSV file to load (relative to the working directory)
file_path = 'planes.csv'
print("The file_path is", file_path)

# Read the file through the SparkContext as a text-based RDD
fileRDD = sc.textFile(file_path)

# Confirm the class of the object that textFile() returned
file_rdd_type = type(fileRDD)
print("The file type of fileRDD is", file_rdd_type)

The file_path is planes.csv
The file type of fileRDD is <class 'pyspark.rdd.RDD'>


In [7]:
# Partition count of the RDD that was loaded without an explicit setting
default_partition_count = fileRDD.getNumPartitions()
print("Number of partitions in fileRDD is", default_partition_count)

# Load the same file again, this time passing minPartitions=5
fileRDD_part = sc.textFile(file_path, minPartitions=5)

# Partition count of the RDD loaded with the explicit minimum
explicit_partition_count = fileRDD_part.getNumPartitions()
print("Number of partitions in fileRDD_part is", explicit_partition_count)

Number of partitions in fileRDD is 2
Number of partitions in fileRDD_part is 5


### Lambda Map & Filter

In [8]:
# The integers 1 through 10
my_list = list(range(1, 11))
print("Input list is", my_list)

# map() applies the lambda to every element; unpack the lazy map object
# into a list so the squares can be printed.
squared_list_lambda = [*map(lambda value: value * value, my_list)]
print("The squared numbers are", squared_list_lambda)

Input list is [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
The squared numbers are [1, 4, 9, 16, 25, 36, 49, 64, 81, 100]


In [10]:
# Sample values; only some of them are multiples of 10
my_list2 = [
    10, 21, 31, 40, 51,
    60, 72, 80, 93, 101,
]
print("Input list is:", my_list2)

# filter() keeps the elements for which the lambda is True, i.e. those
# that leave no remainder when divided by 10.
multiples_of_ten = filter(lambda value: value % 10 == 0, my_list2)
filtered_list = list(multiples_of_ten)
print("Numbers divisible by 10 are:", filtered_list)

Input list is: [10, 21, 31, 40, 51, 60, 72, 80, 93, 101]
Numbers divisible by 10 are: [10, 40, 60, 80]


### Map and Collect

In [17]:
# Turn my_list into an RDD and attach a map() transformation that cubes
# each number.
numbRDD = sc.parallelize(my_list)
cubedRDD = numbRDD.map(lambda value: value ** 3)

# collect() gathers the transformed elements into numbers_all
numbers_all = cubedRDD.collect()

# Print the cubes one per line
for numb in numbers_all:
    print(numb)


1
8
27
64
125
216
343
512
729
1000


### Filter and Count

In [20]:
# Keep only the lines of fileRDD that contain the keyword AIRBUS
fileRDD_filter = fileRDD.filter(lambda line: 'AIRBUS' in line)

# Count the lines that matched the keyword (not the total lines in fileRDD)
print("The total number of lines with the keyword AIRBUS is", fileRDD_filter.count())

# Print the first five matching lines
for line in fileRDD_filter.take(5):
    print(line)

The total number of lines with the keyword AIRBUS is 798
"N102UW",1998,"Fixed wing multi engine","AIRBUS INDUSTRIE","A320-214",2,182,NA,"Turbo-fan"
"N103US",1999,"Fixed wing multi engine","AIRBUS INDUSTRIE","A320-214",2,182,NA,"Turbo-fan"
"N104UW",1999,"Fixed wing multi engine","AIRBUS INDUSTRIE","A320-214",2,182,NA,"Turbo-fan"
"N105UW",1999,"Fixed wing multi engine","AIRBUS INDUSTRIE","A320-214",2,182,NA,"Turbo-fan"
"N107US",1999,"Fixed wing multi engine","AIRBUS INDUSTRIE","A320-214",2,182,NA,"Turbo-fan"
