## Spark Basics

In [2]:
from pyspark.sql import SparkSession
# Create the session named "SparkApp", or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("SparkApp")
    .getOrCreate()
)

24/04/08 20:19:32 WARN Utils: Your hostname, Abhijits-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 192.168.0.105 instead (on interface en0)
24/04/08 20:19:32 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/08 20:19:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


24/04/08 20:19:47 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [3]:
from pyspark.sql.types import Row
from datetime import datetime

In [5]:
# SparkContext of the session created above; later cells use it for sc.parallelize(...).
sc = spark.sparkContext

In [8]:
## sc.parallelize turns a local Python list into an RDD (here with mixed element types)

simple_data = sc.parallelize(
    [1, "Alice", 50]
)
simple_data

ParallelCollectionRDD[1] at readRDDFromFile at PythonRDD.scala:289

In [9]:
simple_data.count()

                                                                                

3

In [10]:
# Gives first element -> action 
simple_data.first() 

1

In [11]:
## Gives first 2 elements
simple_data.take(2) 

[1, 'Alice']

In [12]:
### Gives all the element of the RDD:
simple_data.collect()

[1, 'Alice', 50]

In [13]:
### count(), first(), take() and collect() used above are all actions -> they trigger computation and are costly operations

In [38]:
### Converting an RDD to a DataFrame
records = sc.parallelize([
    [1, "Alice", 50],
    [2, "Bob", 80],
])
# toDF() works here because the values in each position have matching datatypes across rows
df = records.toDF()
df

DataFrame[_1: bigint, _2: string, _3: bigint]

In [17]:
df.show()

# Without explicit names the columns are called _1, _2 and _3.
# To get meaningful column names, build the records as Row objects with named fields.

+---+-----+---+
| _1|   _2| _3|
+---+-----+---+
|  1|Alice| 50|
|  2|  Bob| 80|
+---+-----+---+



In [20]:
# A Row is a single record with named fields; parallelize takes a list of Row objects
data = sc.parallelize(
    [Row(id=1, name="Alice", score=50)]
)
df = data.toDF()

In [21]:
df.show()

+---+-----+-----+
| id| name|score|
+---+-----+-----+
|  1|Alice|   50|
+---+-----+-----+



In [23]:
## DataFrame whose single row mixes float, int, string, bool and list fields

complex_data = sc.parallelize(
    [
        Row(
            col_float=1.44,
            col_integer=10,
            col_string="John",
            col_bool=True,
            col_list=[1, 2, 3],
        )
    ]
)
complex_df = complex_data.toDF()

In [24]:
complex_df.show()

+---------+-----------+----------+--------+---------+
|col_float|col_integer|col_string|col_bool| col_list|
+---------+-----------+----------+--------+---------+
|     1.44|         10|      John|    true|[1, 2, 3]|
+---------+-----------+----------+--------+---------+



In [26]:
## Similarly, a field can be a list, a dictionary, a datetime or even another Row() object; all of these are supported by DataFrames

## SQL Context

In [27]:
from pyspark.sql import SQLContext

# Wrap the existing SparkContext in a SQLContext; later cells call its range() and createDataFrame().
# NOTE(review): SQLContext looks like a legacy entry point on recent Spark versions - confirm whether `spark` should be used instead.
sqlContext = SQLContext(sc)



In [28]:
df = sqlContext.range(5) ### range(n) builds a DataFrame with a single "id" column holding 0 to n-1

In [30]:
df.show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+



In [31]:
### Creating a DataFrame from a list of tuples through the SQLContext

data = [
    ("Alice", 50),
    ("Bob", 80),
    ("Charles", 75),
]

sqlContext.createDataFrame(data, ['Name','Score']).show()

+-------+-----+
|   Name|Score|
+-------+-----+
|  Alice|   50|
|    Bob|   80|
|Charles|   75|
+-------+-----+



## Using functions and accessing Dataframes

In [32]:
### Build the RDD from positional (unnamed) Rows first; column names are attached afterwards

data = sc.parallelize([Row(1, "Alice", 50), Row(2, "Bob", 80)])

In [33]:
# A Row made of field names acts as a template: calling it with values yields a named Row
column_names = Row('id', 'name', 'score')
students = data.map(lambda record: column_names(*record))

In [35]:
students.collect()

[Row(id=1, name='Alice', score=50), Row(id=2, name='Bob', score=80)]

In [36]:
#### map() -> the most common transformation function -> it takes a function and applies it to every element of an RDD
### -> it returns a new RDD (the original RDD is left unchanged)

In [40]:
### We can collect an individual cell's content from a DataFrame like this

# `df` was reassigned by earlier cells (a single-Row frame, then sqlContext.range(5)),
# so rebuild the two-row frame from `records` to keep this cell correct when the
# notebook is run top to bottom.
df = records.toDF()

# collect() returns a list of Rows: [1] picks the second row, [2] its third field
val = df.collect()[1][2]
val

80

In [42]:
df.show()

+---+-----+---+
| _1|   _2| _3|
+---+-----+---+
|  1|Alice| 50|
|  2|  Bob| 80|
+---+-----+---+



In [41]:
df.rdd

### We can access a dataframe's internal RDD like this  

MapPartitionsRDD[78] at javaToPython at DirectMethodHandleAccessor.java:103

In [43]:
### Pull a single column out of a DataFrame by mapping over its underlying RDD

df.rdd.map(lambda row: row._2).collect()

['Alice', 'Bob']

In [44]:
### Pull specific columns out of a DataFrame with select

df.select('_1', '_2').show()

+---+-----+
| _1|   _2|
+---+-----+
|  1|Alice|
|  2|  Bob|
+---+-----+



In [46]:
### Adding a derived column to a DataFrame

(
    df.select('_1', '_3')
    .withColumn('col_sum', df._1 + df._3)
    .show()
)

+---+---+-------+
| _1| _3|col_sum|
+---+---+-------+
|  1| 50|     51|
|  2| 80|     82|
+---+---+-------+



In [47]:
### Renaming columns
# withColumnRenamed returns a new DataFrame; `df` itself still has column _3 afterwards

df.withColumnRenamed("_3","_4").show()

+---+-----+---+
| _1|   _2| _4|
+---+-----+---+
|  1|Alice| 50|
|  2|  Bob| 80|
+---+-----+---+



## Spark and Pandas Dataframes

In [48]:
import pandas as pd

In [49]:
### Convert the Spark DataFrame to a pandas DataFrame
df_pandas = df.toPandas()

In [51]:
df_pandas.head()


Unnamed: 0,_1,_2,_3
0,1,Alice,50
1,2,Bob,80


In [52]:
#### Creating a Spark DataFrame from a pandas DataFrame:

# show() only prints and returns None, so keep the DataFrame in df_spark
# and display it in a separate statement.
df_spark = sqlContext.createDataFrame(df_pandas)
df_spark.show()

+---+-----+---+
| _1|   _2| _3|
+---+-----+---+
|  1|Alice| 50|
|  2|  Bob| 80|
+---+-----+---+



24/04/09 10:30:25 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 975784 ms exceeds timeout 120000 ms
24/04/09 10:30:25 WARN SparkContext: Killing executors is not supported by current scheduler.
24/04/09 10:30:29 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$