## Connect to Spark standalone cluster

In [2]:
# Execute the companion notebook inside this kernel. Its recorded output shows
# the standalone Master and Worker being stopped and started again.
# NOTE(review): `sc` and `spark`, used by later cells, are never assigned in
# this notebook, so they are presumably created by that notebook — confirm.
%run 0.connect-to-spark-cluster.ipynb

localhost: stopping org.apache.spark.deploy.worker.Worker
stopping org.apache.spark.deploy.master.Master
starting org.apache.spark.deploy.master.Master, logging to /usr/local/spark/logs/spark--org.apache.spark.deploy.master.Master-1-sparkc.out
localhost: starting org.apache.spark.deploy.worker.Worker, logging to /usr/local/spark/logs/spark-root-org.apache.spark.deploy.worker.Worker-1-sparkc.out
162 NameNode
867 NodeManager
5027 Master
740 ResourceManager
502 SecondaryNameNode
311 DataNode
5131 Worker
5212 Jps
3948 SparkSubmit
no sc/spark to stop


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/07/29 03:30:54 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/07/29 03:30:55 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Bare expression: displays the `spark` object as the cell result.
# NOTE(review): presumably the SparkSession from the connection notebook — confirm.
spark

## RDDs

In [4]:
# A plain Python list holding 1..5, followed by its distributed (RDD) counterpart.
data = list(range(1, 6))
data_rdd = sc.parallelize(data)

In [5]:
# Show the Python class of the distributed collection created from `data`.
type(data_rdd)

pyspark.rdd.RDD

In [6]:
# Double every element of the RDD, then fetch the results as a Python list.
dbl_distData = data_rdd.map(lambda value: 2 * value)
dbl_distData.collect()

                                                                                

[2, 4, 6, 8, 10]

In [None]:
from datetime import datetime, date
from pyspark.sql import Row

# Column names shared by every record, and the raw values for each record.
field_names = ("a", "b", "c", "d", "e")
raw_records = [
    (1, 2., 'string1', date(2000, 1, 1), datetime(2000, 1, 1, 12, 0)),
    (2, 3., 'string2', date(2000, 2, 1), datetime(2000, 1, 2, 12, 0)),
    (4, 5., 'string3', date(2000, 3, 1), datetime(2000, 1, 3, 12, 0)),
]

# Build each Row from keyword arguments (name -> value), exactly as if written out by hand.
rows = [Row(**dict(zip(field_names, values))) for values in raw_records]

# Distributed copy of the same Row objects.
rows_rdd = sc.parallelize(rows)


In [None]:
# Show the Python class of the RDD that was built from Row objects.
type(rows_rdd)

In [None]:
# Distribute the integers 0 through 5 (range(6)) as an RDD.
range_rdd = sc.parallelize(range(6))

In [None]:
# Show the Python class of the RDD that was built from a range.
type(range_rdd)

In [None]:
# Cube each element: `**` is exponentiation (`x * 3` would only triple the value,
# which contradicts the variable name). Then fetch the results as a Python list.
cube_range_rdd = range_rdd.map(lambda x: x ** 3)
cube_range_rdd.collect()

In [None]:
# NOTE(review): this repeats an earlier `type(rows_rdd)` cell; it may have been
# meant to inspect `cube_range_rdd`, the RDD created just before — confirm.
type(rows_rdd)

## DataFrame

### Infer schema

In [None]:
# Print the local list of Rows, then its Python type, one per line.
print(rows, type(rows), sep="\n")

In [None]:
# Print the RDD's repr, then its Python type, one per line.
print(rows_rdd, type(rows_rdd), sep="\n")

In [None]:
# Build a DataFrame from the local list of Row objects. No schema argument is
# passed, so the column names and types are derived from the Rows themselves.
rows_df = spark.createDataFrame(rows)

In [None]:
# Same as the list-based case, but starting from the RDD of Row objects;
# again no schema argument is passed, so it is derived from the data.
rows_rdd_df = spark.createDataFrame(rows_rdd)

In [None]:
# Print the DataFrame's repr, then its Python type, one per line.
print(rows_df, type(rows_df), sep="\n")

In [None]:
# Print the RDD-derived DataFrame's repr, then its Python type, one per line.
print(rows_rdd_df, type(rows_rdd_df), sep="\n")

In [None]:
# Display the plain Python list that `data_rdd` was created from.
data

In [None]:
# Schema inference needs row-like records (Row, tuple, dict); an RDD of bare
# ints makes createDataFrame raise "Can not infer schema". Wrap each value in
# a 1-tuple and name the single column so the type can still be inferred.
data_df = spark.createDataFrame(data_rdd.map(lambda value: (value,)), schema=["value"])

### Explicit schema

In [5]:
from datetime import datetime, date

# RDD of plain tuples: the records carry no column names or types of their own.
rdd = spark.sparkContext.parallelize([
    (1, 2., 'string1', date(2000, 1, 1), datetime(2000, 1, 1, 12, 0)),
    (2, 3., 'string2', date(2000, 2, 1), datetime(2000, 1, 2, 12, 0)),
    (3, 4., 'string3', date(2000, 3, 1), datetime(2000, 1, 3, 12, 0))
])

# A DDL string declares both the name AND the type of every column, so nothing
# is inferred from the data. (A bare list of names would fix only the names
# and leave the types to inference.)
df = spark.createDataFrame(rdd, schema='a long, b double, c string, d date, e timestamp')
df

DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]

In [6]:
# Print the rows as a text table, then the schema (column name, type,
# nullability) as an indented tree.
df.show()
df.printSchema()

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|2.0|string1|2000-01-01|2000-01-01 12:00:00|
|  2|3.0|string2|2000-02-01|2000-01-02 12:00:00|
|  3|4.0|string3|2000-03-01|2000-01-03 12:00:00|
+---+---+-------+----------+-------------------+

root
 |-- a: long (nullable = true)
 |-- b: double (nullable = true)
 |-- c: string (nullable = true)
 |-- d: date (nullable = true)
 |-- e: timestamp (nullable = true)



In [7]:
# Print only the first two rows, one field per line ("record" layout).
df.show(n=2, vertical=True)

-RECORD 0------------------
 a   | 1                   
 b   | 2.0                 
 c   | string1             
 d   | 2000-01-01          
 e   | 2000-01-01 12:00:00 
-RECORD 1------------------
 a   | 2                   
 b   | 3.0                 
 c   | string2             
 d   | 2000-02-01          
 e   | 2000-01-02 12:00:00 
only showing top 2 rows



### Examine the DataFrame API

In [8]:
# Column names as a Python list of strings.
df.columns

['a', 'b', 'c', 'd', 'e']

In [9]:
# Print the DataFrame's rows as a text table.
df.show()

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|2.0|string1|2000-01-01|2000-01-01 12:00:00|
|  2|3.0|string2|2000-02-01|2000-01-02 12:00:00|
|  3|4.0|string3|2000-03-01|2000-01-03 12:00:00|
+---+---+-------+----------+-------------------+



In [10]:
# Fetch every row into a Python list of Row objects. This materialises the
# whole DataFrame locally, so it is only appropriate for small data.
df.collect()

[Row(a=1, b=2.0, c='string1', d=datetime.date(2000, 1, 1), e=datetime.datetime(2000, 1, 1, 12, 0)),
 Row(a=2, b=3.0, c='string2', d=datetime.date(2000, 2, 1), e=datetime.datetime(2000, 1, 2, 12, 0)),
 Row(a=3, b=4.0, c='string3', d=datetime.date(2000, 3, 1), e=datetime.datetime(2000, 1, 3, 12, 0))]

In [11]:
# Print the physical plan for this DataFrame.
df.explain()

== Physical Plan ==
*(1) Scan ExistingRDD[a#0L,b#1,c#2,d#3,e#4]




In [12]:
# The schema as a StructType object (a list of StructField entries).
df.schema

StructType([StructField('a', LongType(), True), StructField('b', DoubleType(), True), StructField('c', StringType(), True), StructField('d', DateType(), True), StructField('e', TimestampType(), True)])

In [13]:
# Print the schema as an indented tree: column name, type and nullability.
df.printSchema()

root
 |-- a: long (nullable = true)
 |-- b: double (nullable = true)
 |-- c: string (nullable = true)
 |-- d: date (nullable = true)
 |-- e: timestamp (nullable = true)



In [14]:
# summary() returns another DataFrame of statistics rather than printing them,
# which is why the cell result is only a DataFrame repr; chain .show() to see
# the actual values.
df.summary()

DataFrame[summary: string, a: string, b: string, c: string]

In [15]:
# Fetch the first 2 rows as a Python list of Row objects.
df.take(2)

[Row(a=1, b=2.0, c='string1', d=datetime.date(2000, 1, 1), e=datetime.datetime(2000, 1, 1, 12, 0)),
 Row(a=2, b=3.0, c='string2', d=datetime.date(2000, 2, 1), e=datetime.datetime(2000, 1, 2, 12, 0))]

### Examine the data

In [None]:
# Shell command: list the contents of the top-level /data directory.
!ls /data

In [None]:
# Shell command: list the contents of the flight-data directory.
!ls /data/flight-data

In [None]:
# Shell command: list the CSV files available for the flight data set.
!ls /data/flight-data/csv/

In [None]:
!cat /data/flight-data/csv/2010-summary.csv

In [None]:
# Shell command: preview the first lines of the CSV, including its header row.
!head /data/flight-data/csv/2010-summary.csv

### Create the DataFrame

In [None]:
# Read the 2010 flight summary from the local filesystem, treating the first
# line as column names. No schema is supplied and inference is not enabled.
flight_csv_uri = "file:///data/flight-data/csv/2010-summary.csv"
flightDF = spark.read.csv(flight_csv_uri, header=True)