<a href="https://colab.research.google.com/github/bonioloff/note_pyspark/blob/main/PySpark_Note.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Explanation of how Spark works:
- https://stackoverflow.com/questions/32621990/what-are-workers-executors-cores-in-spark-standalone-cluster

In [None]:
# install the pyspark package
!pip install pyspark

In [None]:
from pyspark import SparkConf, SparkContext

In [None]:
# Initialize the Spark context: local master, application name "Spark App".
# getOrCreate() reuses an already-running context, so re-executing this cell
# does not fail with "Cannot run multiple SparkContexts at once".
conf = SparkConf().setMaster("local").setAppName("Spark App")
sc = SparkContext.getOrCreate(conf=conf)

# Resilient Distributed Dataset (RDD) Interface

An RDD can hold data loaded from many kinds of data sources:
- Manual entry
- textFile from local file, s3, hdfs, etc.
- Hive
- JDBC
- Cassandra
- HBase
- ElasticSearch
- JSON, etc

In [None]:
# Manual way to create an RDD: distribute an in-memory Python list
# through the SparkContext.
rdd = sc.parallelize([1,2,3,4])

In [None]:
# Create an RDD from a text file; each line of the file becomes one string
# element. This rebinds `rdd`, replacing the manually created RDD.
rdd = sc.textFile("sample_data/california_housing_test.csv")

In [None]:
# Show the first 5 elements: the CSV header line followed by four data rows.
# (rdd.collect() would return every element of the RDD instead of the first 5.)
rdd.take(5)

['"longitude","latitude","housing_median_age","total_rooms","total_bedrooms","population","households","median_income","median_house_value"',
 '-122.050000,37.370000,27.000000,3885.000000,661.000000,1537.000000,606.000000,6.608500,344700.000000',
 '-118.300000,34.260000,43.000000,1510.000000,310.000000,809.000000,277.000000,3.599000,176500.000000',
 '-117.810000,33.780000,27.000000,3589.000000,507.000000,1484.000000,495.000000,5.793400,270500.000000',
 '-118.360000,33.820000,28.000000,67.000000,15.000000,49.000000,11.000000,6.135900,330000.000000']

In [None]:
# flatMap(): split every line on commas and flatten all the pieces into one
# RDD of individual fields, then look at the first 10 of them.
(
    rdd
    .flatMap(lambda line: line.split(","))
    .take(10)
)

['"longitude"',
 '"latitude"',
 '"housing_median_age"',
 '"total_rooms"',
 '"total_bedrooms"',
 '"population"',
 '"households"',
 '"median_income"',
 '"median_house_value"',
 '-122.050000']

## Basic Operations with RDD
These are the basic operations on an RDD:
- map
- flatMap
- filter
- distinct
- sample
- union
- intersection
- subtract
- cartesian

_Key-based operations (on RDDs of key-value pairs)_:
- reduceByKey()
- groupByKey()
- sortByKey()
- keys()
- values()

In [None]:
# map(): split each CSV line on commas, then keep only the last two fields
# as a (median_income, median_house_value) pair.
# Both fields remain strings, and the header line is turned into a pair too.
income_vs_homevalue = (
    rdd
    .map(lambda line: line.split(","))
    .map(lambda fields: (fields[-2], fields[-1]))
)
income_vs_homevalue.take(5)

[('"median_income"', '"median_house_value"'),
 ('6.608500', '344700.000000'),
 ('3.599000', '176500.000000'),
 ('5.793400', '270500.000000'),
 ('6.135900', '330000.000000')]

In [None]:
# reduceByKey(): merge all values that share the same key with the given
# two-argument function.
# NOTE(review): the values are still raw CSV strings at this point, so `+`
# concatenates them rather than adding numbers; cast them first
# (e.g. with mapValues(float)) if a numeric sum is intended.
(
    income_vs_homevalue
    .reduceByKey(lambda left, right: left + right)
    .take(5)
)

[('"median_income"', '"median_house_value"'),
 ('6.608500', '344700.000000'),
 ('3.599000', '176500.000000'),
 ('5.793400', '270500.000000'),
 ('6.135900', '330000.000000106300.000000225000.000000')]

In [None]:
# keys(): keep only the first item of every pair (here the median_income
# strings, including the header) and show the first 5.
income_vs_homevalue.keys().take(5)

['"median_income"', '6.608500', '3.599000', '5.793400', '6.135900']

In [None]:
# values(): keep only the second item of every pair (here the
# median_house_value strings, including the header) and show the first 5.
income_vs_homevalue.values().take(5)

['"median_house_value"',
 '344700.000000',
 '176500.000000',
 '270500.000000',
 '330000.000000']

In [None]:
# sortByKey(): order the pairs by key, ascending.
# NOTE(review): the keys are strings, so the order is lexicographic rather
# than numeric, and the quoted header row sorts ahead of the data rows.
(
    income_vs_homevalue
    .sortByKey(ascending=True)
    .take(5)
)

[('"median_income"', '"median_house_value"'),
 ('0.499900', '500001.000000'),
 ('0.536000', '162500.000000'),
 ('0.536000', '275000.000000'),
 ('0.536000', '87500.000000')]

# SparkSQL: DataFrame (DF) / Datasets

Compared with an RDD, a DataFrame has some benefits:
- It contains Row objects
- It can be queried with SQL
- It can have a schema
- It can communicate over JDBC and ODBC, and with tools such as Tableau

Instead of using a SparkContext, a DataFrame lives in a SparkSession.

__Datasets__ are typed DataFrames. They are used in statically typed programming languages like Scala and Java; in Python, which is dynamically typed, we use DataFrames instead.

Typed means we have to explicitly specify the data type of each column.


In [None]:
from pyspark.sql import SparkSession, Row

In [None]:
# Obtain a SparkSession through the builder, requesting the application name
# "SparkSQL"; getOrCreate() returns an existing session if there is one.
spark = (
    SparkSession.builder
    .appName("SparkSQL")
    .getOrCreate()
)

In [None]:
# Load the housing CSV into a DataFrame: the first line supplies the column
# names, and Spark infers each column's type from the data.
inputData = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("sample_data/california_housing_test.csv")
)

In [63]:
# Print the DataFrame's schema as a tree: column names, types and nullability.
inputData.printSchema()

root
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- housing_median_age: double (nullable = true)
 |-- total_rooms: double (nullable = true)
 |-- total_bedrooms: double (nullable = true)
 |-- population: double (nullable = true)
 |-- households: double (nullable = true)
 |-- median_income: double (nullable = true)
 |-- median_house_value: double (nullable = true)



In [None]:
# Project only the "longitude" column and print its first 5 rows.
inputData.select("longitude").show(5)

+---------+
|longitude|
+---------+
|  -122.05|
|   -118.3|
|  -117.81|
|  -118.36|
|  -119.67|
+---------+
only showing top 5 rows



In [None]:
# Keep only the rows with more than 1000 total rooms and print the first 5.
inputData.filter(inputData["total_rooms"] > 1000).show(n=5)

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+
|  -122.05|   37.37|              27.0|     3885.0|         661.0|    1537.0|     606.0|       6.6085|          344700.0|
|   -118.3|   34.26|              43.0|     1510.0|         310.0|     809.0|     277.0|        3.599|          176500.0|
|  -117.81|   33.78|              27.0|     3589.0|         507.0|    1484.0|     495.0|       5.7934|          270500.0|
|  -119.67|   36.33|              19.0|     1241.0|         244.0|     850.0|     237.0|       2.9375|           81700.0|
|  -119.56|   36.51|              37.0|     1018.0|         213.0|     663.0|     204.0|       1.6635|           67000.0|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+

In [None]:
# Count how many rows share each distinct "households" value and print
# 5 of the resulting groups.
(
    inputData
    .groupBy("households")
    .count()
    .show(n=5)
)

+----------+-----+
|households|count|
+----------+-----+
|     305.0|    4|
|     558.0|    2|
|     496.0|    1|
|     596.0|    5|
|     299.0|    4|
+----------+-----+
only showing top 5 rows



In [62]:
# Print households next to median_income scaled by 0.8.
# show() without an argument prints the first 20 rows.
inputData.select(
    inputData["households"],
    inputData["median_income"] * 0.8,
).show()

+----------+---------------------+
|households|(median_income * 0.8)|
+----------+---------------------+
|     606.0|               5.2868|
|     277.0|   2.8792000000000004|
|     495.0|    4.634720000000001|
|      11.0|    4.908720000000001|
|     237.0|                 2.35|
|     204.0|               1.3308|
|     218.0|              1.33128|
|     441.0|                 2.58|
|     599.0|              2.93568|
|     603.0|              1.86664|
|     261.0|              1.76432|
|     138.0|   1.9333600000000002|
|     170.0|   3.7520000000000007|
|     659.0|   3.6500000000000004|
|     331.0|    4.569680000000001|
|      50.0|   1.7600000000000002|
|     107.0|                  1.5|
|     595.0|   2.1739200000000003|
|     199.0|              5.26808|
|    1258.0|              4.93792|
+----------+---------------------+
only showing top 20 rows

