* [RDD API](http://nbviewer.ipython.org/github/jkthompson/pyspark-pictures/blob/master/pyspark-pictures.ipynb)
* [GitHub](https://github.com/jkthompson/pyspark-pictures)
* [related blog post](http://data-frack.blogspot.com/2015/01/visual-mnemonics-for-pyspark-api.html)

<img align=left src="files/images/pyspark-pictures-dataframes-page2.svg" width=500 height=500 />

---
> ###  Click on a picture to view pyspark docs

---

In [None]:
# versions
import IPython
print("pyspark version:" + str(sc.version))
print("Ipython version:" + str(IPython.__version__))

In [1]:
sc

<pyspark.context.SparkContext at 0x7fce82310dd0>

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

In [3]:
x = spark.createDataFrame([("Alice","Bob",0.1),("Bob","Carol",0.2),("Carol","Dave",0.3)], ['from','to','amt'])
x.show()

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Alice|  Bob|0.1|
|  Bob|Carol|0.2|
|Carol| Dave|0.3|
+-----+-----+---+



<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.agg">
<img align=left src="files/images/pyspark-pictures-dataframes-page3.svg" width=500 height=500 />
</a>

In [None]:
x.agg?

In [4]:
# agg
y = x.agg({"amt":"avg"})

y.show()

+-------------------+
|           avg(amt)|
+-------------------+
|0.20000000000000004|
+-------------------+



<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.alias">
<img align=left src="files/images/pyspark-pictures-dataframes-page4.svg" width=750 height=750 />
</a>

In [None]:
x.alias?

In [5]:
# alias
from pyspark.sql.functions import col

y = x.alias('transactions')

y.show()

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Alice|  Bob|0.1|
|  Bob|Carol|0.2|
|Carol| Dave|0.3|
+-----+-----+---+



In [6]:
y.select("to").show()

+-----+
|   to|
+-----+
|  Bob|
|Carol|
| Dave|
+-----+



In [7]:
y.select(col("transactions.to")).show()

+-----+
|   to|
+-----+
|  Bob|
|Carol|
| Dave|
+-----+



<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.cache">
<img align=left src="files/images/pyspark-pictures-dataframes-page5.svg" width=750 height=750 />
</a>

In [None]:
# cache
x.cache()

print(x.count()) # first action materializes x in memory
print(x.count()) # later actions avoid IO overhead

<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.coalesce">
<img align=left src="files/images/pyspark-pictures-dataframes-page6.svg" width=750 height=750 />
</a>

In [None]:
x.rdd.take(3)

In [8]:
# coalesce
x_rdd = sc.parallelize([("Alice","Bob",0.1),
                        ("Bob","Carol",0.2),
                        ("Carol","Dave",0.3)], 2)

x = sqlContext.createDataFrame(x_rdd, ['from','to','amt'])

y = x.coalesce(numPartitions=1)

print(x.rdd.getNumPartitions())
print(y.rdd.getNumPartitions())

2
1


<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.collect">
<img align=left src="files/images/pyspark-pictures-dataframes-page7.svg" width=750 height=750 />
</a>

In [None]:
# collect
y = x.collect() # creates list of rows on driver

x.show()

print(y)
print type(y)
print type(x.rdd)

<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.columns">
<img align=left src="files/images/pyspark-pictures-dataframes-page8.svg" width=500 height=500 />
</a>

In [None]:
# columns
y = x.columns 
#creates list of column names on driver

print(y)

<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.corr">
<img align=left src="files/images/pyspark-pictures-dataframes-page9.svg" width=500 height=500 />
</a>

In [9]:
# corr
x = sqlContext.createDataFrame([("Alice","Bob",0.1,0.001),
                                ("Bob","Carol",0.2,0.02),
                                ("Carol","Dave",0.3,0.02)], schema=['from','to','amt','fee'])

y = x.corr(col1="amt",col2="fee")

print(y)
print(type(y))

0.866025403784
<type 'float'>


<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.count">
<img align=left src="files/images/pyspark-pictures-dataframes-page10.svg" width=500 height=500 />
</a>

In [None]:
# count

print(x.count())

<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.cov">
<img align=left src="files/images/pyspark-pictures-dataframes-page11.svg" width=500 height=500 />
</a>

In [None]:
# cov
x = sqlContext.createDataFrame([("Alice","Bob",0.1,0.001),("Bob","Carol",0.2,0.02),("Carol","Dave",0.3,0.02)], ['from','to','amt','fee'])
y = x.cov(col1="amt",col2="fee")
x.show()
print(y)

<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.crosstab">
<img align=left src="files/images/pyspark-pictures-dataframes-page12.svg" width=500 height=500 />
</a>

In [10]:
# crosstab
x = sqlContext.createDataFrame([("Alice","Bob",0.1),
                                ("Bob","Carol",0.2),
                                ("Carol","Dave",0.3)], ['from','to','amt'])
y = x.crosstab(col1='from', col2='to')
x.show()
y.show()

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Alice|  Bob|0.1|
|  Bob|Carol|0.2|
|Carol| Dave|0.3|
+-----+-----+---+

+-------+---+-----+----+
|from_to|Bob|Carol|Dave|
+-------+---+-----+----+
|    Bob|  0|    1|   0|
|  Alice|  1|    0|   0|
|  Carol|  0|    0|   1|
+-------+---+-----+----+



---

> ## TASK

1.
- Create a pandas dataframe with 2 columns called 'Actual', 'Predicted'.
- Fill it with True, False values at random (50 values)
- Convert it to a Spark Dataframe
- Use crosstab to find the confusion matrix

2.
Then, attempt the same, but this time, use RowRDDs to construct the DataFrame before using CrossTab

---

In [11]:
import pandas as pd
pdf_x = pd.DataFrame(zip(pd.Series(list('TF' * 100)).sample(50), pd.Series(list('TF' * 100)).sample(50)),
                     columns=['Actual', 'Predicted'])
pdf_x.head()

Unnamed: 0,Actual,Predicted
0,T,T
1,F,T
2,T,F
3,T,T
4,F,F


In [12]:
sdf_x = spark.createDataFrame(pdf_x)

In [13]:
sdf_x.crosstab(col1='Actual', col2='Predicted').show()

+----------------+---+---+
|Actual_Predicted|  F|  T|
+----------------+---+---+
|               F|  9| 13|
|               T| 15| 13|
+----------------+---+---+



## Using RowRDD

In [14]:
from pyspark.sql import Row

In [15]:
rdd_x = sc.parallelize(zip(pd.Series(list('TF' * 100)).sample(50).tolist(),
                           pd.Series(list('TF' * 100)).sample(50).tolist()))

In [16]:
rdd_x.take(3)

[('F', 'F'), ('T', 'F'), ('F', 'F')]

In [17]:
row_rdd_x = rdd_x.map(lambda x: Row(Actual = x[0], Predicted = x[1]))

In [18]:
row_rdd_x.take(2)

[Row(Actual='F', Predicted='F'), Row(Actual='T', Predicted='F')]

In [19]:
(spark
 .createDataFrame(row_rdd_x)
 .crosstab(col1='Actual', col2='Predicted')
 .show())

+----------------+---+---+
|Actual_Predicted|  F|  T|
+----------------+---+---+
|               F| 16|  9|
|               T| 11| 14|
+----------------+---+---+



<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.cube">
<img align=left src="files/images/pyspark-pictures-dataframes-page13.svg" width=500 height=500 />
</a>

In [None]:
x.show()

In [None]:
# cube
y = x.cube('from','to')

print(y)
# y is a grouped data object, aggregations will be applied to all numerical columns

In [None]:
y.sum().show() 
y.max().show()

<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.describe">
<img align=left src="files/images/pyspark-pictures-dataframes-page14.svg" width=500 height=500 />
</a>

In [None]:
# describe
x.describe().show()

<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.distinct">
<img align=left src="files/images/pyspark-pictures-dataframes-page15.svg" width=500 height=500 />
</a>

In [None]:
# distinct
y = x.distinct()
x.show()
y.show()

<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.drop">
<img align=left src="files/images/pyspark-pictures-dataframes-page16.svg" width=500 height=500 />
</a>

In [20]:
# drop
# x = sqlContext.createDataFrame([("Alice","Bob",0.1),("Bob","Carol",0.2),("Carol","Dave",0.3)], ['from','to','amt'])
y = x.drop('amt')
x.show()
y.show()

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Alice|  Bob|0.1|
|  Bob|Carol|0.2|
|Carol| Dave|0.3|
+-----+-----+---+

+-----+-----+
| from|   to|
+-----+-----+
|Alice|  Bob|
|  Bob|Carol|
|Carol| Dave|
+-----+-----+



<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.dropDuplicates">
<img align=left src="files/images/pyspark-pictures-dataframes-page17.svg" width=500 height=500 />
</a>

In [None]:
# dropDuplicates / drop_duplicates
x = sqlContext.createDataFrame([("Alice","Bob",0.1),("Bob","Carol",0.2),("Bob","Carol",0.3),("Bob","Carol",0.2)], ['from','to','amt'])
y = x.dropDuplicates(subset=['from','to'])
x.show()
y.show()

<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.dropna">
<img align=left src="files/images/pyspark-pictures-dataframes-page18.svg" width=500 height=500 />
<

In [None]:
# dropna
x = sqlContext.createDataFrame([(None,"Bob",0.1),("Bob","Carol",None),("Carol",None,0.3),("Bob","Carol",0.2)], ['from','to','amt'])
y = x.dropna(how='any',subset=['from','to'])
x.show()
y.show()

<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.dtypes">
<img align=left src="files/images/pyspark-pictures-dataframes-page19.svg" width=500 height=500 />
</a>

In [None]:
# dtypes
x = sqlContext.createDataFrame([('Alice',"Bob",0.1),("Bob","Carol",0.2),("Carol","Dave",0.3)], ['from','to','amt'])
y = x.dtypes
x.show()
print(y)

<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.explain">
<img align=left src="files/images/pyspark-pictures-dataframes-page20.svg" width=500 height=500 />
</a>

In [21]:
x.explain?

In [None]:
# explain
x = sqlContext.createDataFrame([('Alice',"Bob",0.1),("Bob","Carol",0.2),("Carol","Dave",0.3)], ['from','to','amt'])
x.show()
x.agg({"amt":"avg"}).explain(extended = True)

<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.fillna">
<img align=left src="files/images/pyspark-pictures-dataframes-page21.svg" width=500 height=500 />
</a>

In [None]:
# fillna
x = sqlContext.createDataFrame([(None,"Bob",0.1),("Bob","Carol",None),("Carol",None,0.3)], ['from','to','amt'])
y = x.fillna(value='Missing', subset=['from'])
x.show()
y.show()

<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.filter">
<img align=left src="files/images/pyspark-pictures-dataframes-page22.svg" width=500 height=500 />
</a>

In [None]:
# filter
x = sqlContext.createDataFrame([('Alice',"Bob",0.1),("Bob","Carol",0.2),("Carol","Dave",0.3)], ['from','to','amt'])
y = x.filter("amt > 0.1")
x.show()
y.show()

<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.first">
<img align=left src="files/images/pyspark-pictures-dataframes-page23.svg" width=500 height=500 />
</a>

In [22]:
# first
x = sqlContext.createDataFrame([('Alice',"Bob",0.1),("Bob","Carol",0.2),("Carol","Dave",0.3)], ['from','to','amt'])
y = x.first()
x.show()
print(y)

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Alice|  Bob|0.1|
|  Bob|Carol|0.2|
|Carol| Dave|0.3|
+-----+-----+---+

Row(from=u'Alice', to=u'Bob', amt=0.1)


<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.flatMap">
<img align=left src="files/images/pyspark-pictures-dataframes-page24.svg" width=500 height=500 />
</a>

In [23]:
x.map(lambda x: (x[0], x[1])).collect()

AttributeError: 'DataFrame' object has no attribute 'map'

In [None]:
x.flatMap(lambda x: (x[0], x[1])).collect()

<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.foreach">
<img align=left src="files/images/pyspark-pictures-dataframes-page25.svg" width=500 height=500 />
</a>

In [None]:
# foreach
from __future__ import print_function

# setup
fn = './foreachExampleDataFrames.txt' 
open(fn, 'w').close()  # clear the file
def fappend(el, f):
    '''appends el to file f'''
    print(el, file=open(f, 'a+') )

# example
x = sqlContext.createDataFrame([('Alice',"Bob",0.1),("Bob","Carol",0.2),("Carol","Dave",0.3)], ['from','to','amt'])
y = x.foreach(lambda x: fappend(x, fn)) # writes into foreachExampleDataFrames.txt
x.show() # original dataframe
print(y) # foreach returns 'None'
# print the contents of the file
with open(fn, "r") as foreachExample:
    print (foreachExample.read())

<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.foreachPartition">
<img align=left src="files/images/pyspark-pictures-dataframes-page26.svg" width=500 height=500 />
</a>

In [None]:
# foreachPartition
from __future__ import print_function

#setup
fn = './foreachPartitionExampleDataFrames.txt'
open(fn, 'w').close()  # clear the file
def fappend(partition,f):
    '''append all elements in partition to file f'''
    print([el for el in partition],file=open(f, 'a+'))

x = sqlContext.createDataFrame([('Alice',"Bob",0.1),("Bob","Carol",0.2),("Carol","Dave",0.3)], ['from','to','amt'])
x = x.repartition(2) # force 2 partitions
y = x.foreachPartition(lambda x: fappend(x,fn)) # writes into foreachPartitionExampleDataFrames.txt

x.show() # original dataframe
print(y) # foreach returns 'None'
# print the contents of the file
with open(fn, "r") as foreachExample:
    print (foreachExample.read())

<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.freqItems">
<img align=left src="files/images/pyspark-pictures-dataframes-page27.svg" width=500 height=500 />
</a>

In [27]:
# freqItems
x = sqlContext.createDataFrame([("Bob","Carol",0.1), \
                                ("Alice","Dave",0.1), \
                                ("Alice","Bob",0.1), \
                                ("Alice","Bob",0.5), \
                                ("Carol","Bob",0.1), \
                               ("Carol","Bob",0.1), \
                               ("Carol","Bob",0.1)], \
                               ['from','to','amt'])
x.show()

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|  Bob|Carol|0.1|
|Alice| Dave|0.1|
|Alice|  Bob|0.1|
|Alice|  Bob|0.5|
|Carol|  Bob|0.1|
|Carol|  Bob|0.1|
|Carol|  Bob|0.1|
+-----+-----+---+



In [29]:
y = x.freqItems(cols=['from','amt'], support=0.4)
x.show()
y.show()

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|  Bob|Carol|0.1|
|Alice| Dave|0.1|
|Alice|  Bob|0.1|
|Alice|  Bob|0.5|
|Carol|  Bob|0.1|
|Carol|  Bob|0.1|
|Carol|  Bob|0.1|
+-----+-----+---+

+--------------+-------------+
|from_freqItems|amt_freqItems|
+--------------+-------------+
|[Alice, Carol]|   [0.1, 0.5]|
+--------------+-------------+



<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.groupBy">
<img align=left src="files/images/pyspark-pictures-dataframes-page28.svg" width=500 height=500 />
</a>

In [None]:
# groupBy
x = sqlContext.createDataFrame([('Alice',"Bob",0.1),("Alice","Carol",0.2),("Carol","Dave",0.3)], ['from','to','amt'])
y = x.groupBy('from')
x.show()
print(y)

<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.groupBy">
<img align=left src="files/images/pyspark-pictures-dataframes-page29.svg" width=500 height=500 />
</a>

In [None]:
# groupBy(col1).avg(col2)
x = sqlContext.createDataFrame([('Alice',"Bob",0.1),("Alice","Carol",0.2),("Carol","Dave",0.3)], ['from','to','amt'])
y = x.groupBy('from').avg('amt')
x.show()
y.show()

<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.head">
<img align=left src="files/images/pyspark-pictures-dataframes-page30.svg" width=500 height=500 />
</a>

In [None]:
# head
x = sqlContext.createDataFrame([('Alice',"Bob",0.1),("Bob","Carol",0.2),("Carol","Dave",0.3)], ['from','to','amt'])
y = x.head()
x.show()
print(y)

<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.intersect">
<img align=left src="files/images/pyspark-pictures-dataframes-page31.svg" width=500 height=500 />
</a>

In [30]:
# intersect
x = sqlContext.createDataFrame([('Alice',"Bob",0.1),("Bob","Carol",0.2),("Carol","Dave",0.3)], ['from','to','amt'])
y = sqlContext.createDataFrame([('Alice',"Bob",0.1),("Bob","Alice",0.2),("Carol","Dave",0.1)], ['from','to','amt'])
z = x.intersect(y)
x.show()
y.show()
z.show()

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Alice|  Bob|0.1|
|  Bob|Carol|0.2|
|Carol| Dave|0.3|
+-----+-----+---+

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Alice|  Bob|0.1|
|  Bob|Alice|0.2|
|Carol| Dave|0.1|
+-----+-----+---+

+-----+---+---+
| from| to|amt|
+-----+---+---+
|Alice|Bob|0.1|
+-----+---+---+



<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.isLocal">
<img align=left src="files/images/pyspark-pictures-dataframes-page32.svg" width=500 height=500 />
</a>

In [37]:
# isLocal
x = sqlContext.createDataFrame([('Alice',"Bob",0.1),
                                ("Bob","Carol",0.2),("Carol","Dave",0.3)], 
                               ['from','to','amt'])
y = x.isLocal()
x.show()
print(y)

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Alice|  Bob|0.1|
|  Bob|Carol|0.2|
|Carol| Dave|0.3|
+-----+-----+---+

False


<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.join">
<img align=left src="files/images/pyspark-pictures-dataframes-page33.svg" width=500 height=500 />
</a>

In [None]:
# join
x = sqlContext.createDataFrame([('Alice',"Bob",0.1),("Bob","Carol",0.2),("Carol","Dave",0.3)], ['from','to','amt'])
y = sqlContext.createDataFrame([('Alice',20),("Bob",40),("Dave",80)], ['name','age'])

x.show()
y.show()

In [None]:
z = x.join(y, x.to == y.name,'inner').select('from','to','amt','age')
z.show()

In [None]:
x.join(y, x.to == y.name,'left').select('from','to','amt','age').show()

<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.limit">
<img align=left src="files/images/pyspark-pictures-dataframes-page34.svg" width=500 height=500 />
</a>

In [38]:
# limit
x = sqlContext.createDataFrame([('Alice',"Bob",0.1),("Bob","Carol",0.2),("Carol","Dave",0.3)], ['from','to','amt'])
y = x.limit(2)
x.show()
y.show()

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Alice|  Bob|0.1|
|  Bob|Carol|0.2|
|Carol| Dave|0.3|
+-----+-----+---+

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Alice|  Bob|0.1|
|  Bob|Carol|0.2|
+-----+-----+---+



<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.na">
<img align=left src="files/images/pyspark-pictures-dataframes-page37.svg" width=500 height=500 />
</a>

In [40]:
# na
x = sqlContext.createDataFrame([(None,"Bob",0.1),("Bob","Carol",None),("Carol",None,0.3),("Bob","Carol",0.2)], ['from','to','amt'])
y = x.na  # returns an object for handling missing values, supports drop, fill, and replace methods

x.show()
print type(y)
print(y)

y.drop().show()
y.fill({'from':'unknown','to':'missing','amt':0}).show()


# Using fill with a numeric parameter will replace numeric missings, with a string will replace string missings

+-----+-----+----+
| from|   to| amt|
+-----+-----+----+
| null|  Bob| 0.1|
|  Bob|Carol|null|
|Carol| null| 0.3|
|  Bob|Carol| 0.2|
+-----+-----+----+

<class 'pyspark.sql.dataframe.DataFrameNaFunctions'>
<pyspark.sql.dataframe.DataFrameNaFunctions object at 0x7fce72019910>
+----+-----+---+
|from|   to|amt|
+----+-----+---+
| Bob|Carol|0.2|
+----+-----+---+

+-------+-------+---+
|   from|     to|amt|
+-------+-------+---+
|unknown|    Bob|0.1|
|    Bob|  Carol|0.0|
|  Carol|missing|0.3|
|    Bob|  Carol|0.2|
+-------+-------+---+



In [41]:
y.fill(0).show()

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
| null|  Bob|0.1|
|  Bob|Carol|0.0|
|Carol| null|0.3|
|  Bob|Carol|0.2|
+-----+-----+---+



In [42]:
y.fill('a').show()

+-----+-----+----+
| from|   to| amt|
+-----+-----+----+
|    a|  Bob| 0.1|
|  Bob|Carol|null|
|Carol|    a| 0.3|
|  Bob|Carol| 0.2|
+-----+-----+----+



<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.orderBy">
<img align=left src="files/images/pyspark-pictures-dataframes-page38.svg" width=500 height=500 />
</a>

In [None]:
# orderBy
x = sqlContext.createDataFrame([('Alice',"Bob",0.1),("Bob","Carol",0.2),("Carol","Dave",0.3)], ['from','to','amt'])
y = x.orderBy(['from'], ascending=[False])
x.show()
y.show()

<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.persist">
<img align=left src="files/images/pyspark-pictures-dataframes-page39.svg" width=500 height=500 />
</a>

In [None]:
x.persist?

In [43]:
# persist
x = sqlContext.createDataFrame([('Alice',"Bob",0.1),("Bob","Carol",0.2),("Carol","Dave",0.3)], ['from','to','amt'])
x.persist(storageLevel=StorageLevel(True,True,False,True,1)) 
# StorageLevel(useDisk, useMemory, useOffHeap, deserialized, replication=1)

x.show()
x.is_cached

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Alice|  Bob|0.1|
|  Bob|Carol|0.2|
|Carol| Dave|0.3|
+-----+-----+---+



True

<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.printSchema">
<img align=left src="files/images/pyspark-pictures-dataframes-page40.svg" width=500 height=500 />
</a>

In [None]:
# printSchema
x = sqlContext.createDataFrame([('Alice',"Bob",0.1),("Bob","Carol",0.2),("Carol","Dave",0.3)], ['from','to','amt'])
x.show()
x.printSchema()

<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.randomSplit">
<img align=left src="files/images/pyspark-pictures-dataframes-page41.svg" width=500 height=500 />
</a>

In [44]:
# randomSplit
x = sqlContext.createDataFrame([('Alice',"Bob",0.1),("Bob","Carol",0.2),("Carol","Dave",0.3)], ['from','to','amt'])
y = x.randomSplit([0.5, 0.5])
type(y)

list

In [45]:
y[0].show()

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Alice|  Bob|0.1|
|  Bob|Carol|0.2|
+-----+-----+---+



In [46]:
y[1].show()

+-----+----+---+
| from|  to|amt|
+-----+----+---+
|Carol|Dave|0.3|
+-----+----+---+



In [47]:
[x.count() for x in sdf_x.randomSplit([0.3, 0.3, 0.4])]

[19, 16, 15]

In [48]:
[i.show() for i in y]

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Alice|  Bob|0.1|
|  Bob|Carol|0.2|
+-----+-----+---+

+-----+----+---+
| from|  to|amt|
+-----+----+---+
|Carol|Dave|0.3|
+-----+----+---+



[None, None]

In [None]:
sdf_x.rdd.take(2)

<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.rdd">
<img align=left src="files/images/pyspark-pictures-dataframes-page42.svg" width=500 height=500 />
</a>

In [49]:
# rdd
x = sqlContext.createDataFrame([('Alice',"Bob",0.1),("Bob","Carol",0.2),("Carol","Dave",0.3)], ['from','to','amt'])
y = x.rdd
x.show()
print(y.collect())

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Alice|  Bob|0.1|
|  Bob|Carol|0.2|
|Carol| Dave|0.3|
+-----+-----+---+

[Row(from=u'Alice', to=u'Bob', amt=0.1), Row(from=u'Bob', to=u'Carol', amt=0.2), Row(from=u'Carol', to=u'Dave', amt=0.3)]


<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.registerTempTable">
<img align=left src="files/images/pyspark-pictures-dataframes-page43.svg" width=500 height=500 />
</a>

In [50]:
# registerTempTable
x = sqlContext.createDataFrame([('Alice',"Bob",0.1),("Bob","Carol",0.2),("Carol","Dave",0.3)], ['from','to','amt'])
x.registerTempTable(name="TRANSACTIONS")
y = sqlContext.sql('SELECT * FROM TRANSACTIONS WHERE amt > 0.1')
x.show()
y.show()

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Alice|  Bob|0.1|
|  Bob|Carol|0.2|
|Carol| Dave|0.3|
+-----+-----+---+

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|  Bob|Carol|0.2|
|Carol| Dave|0.3|
+-----+-----+---+



<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.repartition">
<img align=left src="files/images/pyspark-pictures-dataframes-page44.svg" width=500 height=500 />
</a>

In [51]:
# repartition
x = sdf_x
y = x.repartition(3)
print(x.rdd.getNumPartitions())
print(y.rdd.getNumPartitions())

4
3


<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.replace">
<img align=left src="files/images/pyspark-pictures-dataframes-page45.svg" width=500 height=500 />
</a>

In [52]:
# replace
x = sqlContext.createDataFrame([('Dave',"Bob",0.1),("Bob","Carol",0.2),("Carol","Dave",0.3)], ['from','to','amt'])
y = x.replace('Dave','David',['from','to'])
x.show()
y.show()

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
| Dave|  Bob|0.1|
|  Bob|Carol|0.2|
|Carol| Dave|0.3|
+-----+-----+---+

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|David|  Bob|0.1|
|  Bob|Carol|0.2|
|Carol|David|0.3|
+-----+-----+---+



<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.rollup">
<img align=left src="files/images/pyspark-pictures-dataframes-page46.svg" width=500 height=500 />
</a>

In [53]:
# rollup
x = sqlContext.createDataFrame([('Alice',"Bob",0.1),("Bob","Carol",0.2),("Carol","Dave",0.3)], ['from','to','amt'])
y = x.rollup(['from','to'])
x.show()
print(y) # y is a grouped data object, aggregations will be applied to all numerical columns
y.sum().show()
y.max().show()

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Alice|  Bob|0.1|
|  Bob|Carol|0.2|
|Carol| Dave|0.3|
+-----+-----+---+

<pyspark.sql.group.GroupedData object at 0x7fce71f9e310>
+-----+-----+------------------+
| from|   to|          sum(amt)|
+-----+-----+------------------+
| null| null|0.6000000000000001|
|  Bob|Carol|               0.2|
|Carol| null|               0.3|
|Alice|  Bob|               0.1|
|  Bob| null|               0.2|
|Carol| Dave|               0.3|
|Alice| null|               0.1|
+-----+-----+------------------+

+-----+-----+--------+
| from|   to|max(amt)|
+-----+-----+--------+
| null| null|     0.3|
|  Bob|Carol|     0.2|
|Carol| null|     0.3|
|Alice|  Bob|     0.1|
|  Bob| null|     0.2|
|Carol| Dave|     0.3|
|Alice| null|     0.1|
+-----+-----+--------+



<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.sample">
<img align=left src="files/images/pyspark-pictures-dataframes-page47.svg" width=500 height=500 />
</a>

In [None]:
# sample

y = sdf_x.sample(False, 0.1)
sdf_x.show(5)
y.show()

<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.sampleBy">
<img align=left src="files/images/pyspark-pictures-dataframes-page48.svg" width=500 height=500 />
</a>

In [None]:
# Used for Stratified Sampling

# sampleBy
x = sqlContext.createDataFrame([('Alice',"Bob",0.1),("Alice","Carol",0.2),("Alice","Alice",0.3), \
                               ('Alice',"Dave",0.4),("Bob","Bob",0.5),("Bob","Carol",0.6)], \
                                ['from','to','amt'])
y = x.sampleBy(col='from',fractions={'Alice':0.1,'Bob':0.9})
x.show()
y.show()

<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.schema>
<img align=left src="files/images/pyspark-pictures-dataframes-page49.svg" width=500 height=500 />
</a>

In [None]:
# schema
x = sqlContext.createDataFrame([('Alice',"Bob",0.1),("Bob","Carol",0.2),("Carol","Dave",0.3)], ['from','to','amt'])
y = x.schema
x.show()
print(y)

<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.select">
<img align=left src="files/images/pyspark-pictures-dataframes-page50.svg" width=500 height=500 />
</a>

In [None]:
# select
x = sqlContext.createDataFrame([('Alice',"Bob",0.1),("Bob","Carol",0.2),("Carol","Dave",0.3)], ['from','to','amt'])
y = x.select(['from','amt'])
x.show()
y.show()

<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.selectExpr">
<img align=left src="files/images/pyspark-pictures-dataframes-page51.svg" width=500 height=500 />
</a>

In [54]:
# selectExpr
x = sqlContext.createDataFrame([('Alice',"Bob",0.1),("Bob","Carol",0.2),("Carol","Dave",0.3)], ['from','to','amt'])
y = x.selectExpr(['substr(from,1,1)','amt+10'])
x.show()
y.show()

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Alice|  Bob|0.1|
|  Bob|Carol|0.2|
|Carol| Dave|0.3|
+-----+-----+---+

+---------------------+----------+
|substring(from, 1, 1)|(amt + 10)|
+---------------------+----------+
|                    A|      10.1|
|                    B|      10.2|
|                    C|      10.3|
+---------------------+----------+



<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.show">
<img align=left src="files/images/pyspark-pictures-dataframes-page52.svg" width=500 height=500 />
</a>

In [None]:
# show
x = sqlContext.createDataFrame([('Alice',"Bob",0.1),("Bob","Carol",0.2),("Carol","Dave",0.3)], ['from','to','amt'])
x.show()

<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.sort">
<img align=left src="files/images/pyspark-pictures-dataframes-page53.svg" width=500 height=500 />
</a>

In [None]:
# sort
x = sqlContext.createDataFrame([('Alice',"Bob",0.1),("Bob","Carol",0.2),("Carol","Alice",0.3)], ['from','to','amt'])
y = x.sort(['to'])
x.show()
y.show()

<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.stat">
<img align=left src="files/images/pyspark-pictures-dataframes-page55.svg" width=500 height=500 />
</a>

In [None]:
# stat
x = sqlContext.createDataFrame([("Alice","Bob",0.1,0.001),("Bob","Carol",0.2,0.02),("Carol","Dave",0.3,0.02)], ['from','to','amt','fee'])
y = x.stat
x.show()
print(y)
print(y.corr(col1="amt",col2="fee"))

<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.subtract">
<img align=left src="files/images/pyspark-pictures-dataframes-page56.svg" width=500 height=500 />
</a>

In [None]:
# subtract
x = sqlContext.createDataFrame([('Alice',"Bob",0.1),("Bob","Carol",0.2),("Carol","Dave",0.3)], ['from','to','amt'])
y = sqlContext.createDataFrame([('Alice',"Bob",0.1),("Bob","Carol",0.2),("Carol","Dave",0.1)], ['from','to','amt'])
z = x.subtract(y)
x.show()
y.show()
z.show()

<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.take">
<img align=left src="files/images/pyspark-pictures-dataframes-page57.svg" width=500 height=500 />
</a>

In [None]:
# take
x = sqlContext.createDataFrame([('Alice',"Bob",0.1),("Bob","Carol",0.2),("Carol","Dave",0.3)], ['from','to','amt'])
y = x.take(num=2)
x.show()
print(y)

<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.toDF">
<img align=left src="files/images/pyspark-pictures-dataframes-page58.svg" width=500 height=500 />
</a>

In [55]:
# toDF
x = sc.parallelize([[1, 2],[3, 4],[5, 6]])
y = x.toDF(["A", "B"])
y.show()

+---+---+
|  A|  B|
+---+---+
|  1|  2|
|  3|  4|
|  5|  6|
+---+---+



<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.toJSON">
<img align=left src="files/images/pyspark-pictures-dataframes-page59.svg" width=500 height=500 />
</a>

In [56]:
# toJSON
x = sqlContext.createDataFrame([('Alice',"Bob",0.1),("Bob","Carol",0.2),("Carol","Alice",0.3)], ['from','to','amt'])
y = x.toJSON()
x.show()
print(y.collect())

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Alice|  Bob|0.1|
|  Bob|Carol|0.2|
|Carol|Alice|0.3|
+-----+-----+---+

[u'{"from":"Alice","to":"Bob","amt":0.1}', u'{"from":"Bob","to":"Carol","amt":0.2}', u'{"from":"Carol","to":"Alice","amt":0.3}']


<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.toPandas">
<img align=left src="files/images/pyspark-pictures-dataframes-page60.svg" width=500 height=500 />
</a>

In [None]:
# toPandas
x = sqlContext.createDataFrame([('Alice',"Bob",0.1),("Bob","Carol",0.2),("Carol","Dave",0.3)], ['from','to','amt'])
y = x.toPandas()
x.show()
print(type(y))
y

<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.unionAll">
<img align=left src="files/images/pyspark-pictures-dataframes-page61.svg" width=500 height=500 />
</a>

In [None]:
# unionAll
x = sqlContext.createDataFrame([('Alice',"Bob",0.1),("Bob","Carol",0.2)], ['from','to','amt'])
y = sqlContext.createDataFrame([("Bob","Carol",0.2),("Carol","Dave",0.1)], ['from','to','amt'])
z = x.unionAll(y)
x.show()
y.show()
z.show()

<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.unpersist">
<img align=left src="files/images/pyspark-pictures-dataframes-page62.svg" width=500 height=500 />
</a>

In [None]:
# unpersist
x = sqlContext.createDataFrame([('Alice',"Bob",0.1),("Bob","Carol",0.2),("Carol","Dave",0.3)], ['from','to','amt'])
x.cache()
x.count()
x.show()
print(x.is_cached)
x.unpersist()
print(x.is_cached)

<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.where">
<img align=left src="files/images/pyspark-pictures-dataframes-page63.svg" width=500 height=500 />
</a>

In [None]:
# where (filter)
x = sqlContext.createDataFrame([('Alice',"Bob",0.1),("Bob","Carol",0.2),("Carol","Dave",0.3)], ['from','to','amt'])
y = x.where("amt > 0.1")
x.show()
y.show()

<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.withColumn">
<img align=left src="files/images/pyspark-pictures-dataframes-page64.svg" width=500 height=500 />
</a>

In [None]:
# withColumn
x = sqlContext.createDataFrame([('Alice',"Bob",0.1),("Bob","Carol",None),("Carol","Dave",0.3)], ['from','to','amt'])
y = x.withColumn('conf',x.amt.isNotNull())
x.show()
y.show()

<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.withColumnRenamed">
<img align=left src="files/images/pyspark-pictures-dataframes-page65.svg" width=500 height=500 />
</a>

In [None]:
# withColumnRenamed
x = sqlContext.createDataFrame([('Alice',"Bob",0.1),("Bob","Carol",0.2),("Carol","Dave",0.3)], ['from','to','amt'])
y = x.withColumnRenamed('amt','amount')
x.show()
y.show()

<a href="http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.write">
<img align=left src="files/images/pyspark-pictures-dataframes-page66.svg" width=500 height=500 />
</a>

In [None]:
[x for x in os.listdir(os.getcwd()) if 'json' in x]

In [None]:
# write
import json
# x = sqlContext.createDataFrame([('Alice',"Bob",0.1),("Bob","Carol",0.2),("Carol","Dave",0.3)], ['from','to','amt'])
y2 = y.write.mode('overwrite').json('/Users/lr/Desktop/dataframeWriteExample.json')
# x.show()
# read the dataframe back in from file

In [None]:
y = sdf_x.repartition(8)

In [None]:
!head /Users/lr/Desktop/dataframeWriteExample.json/part-r-00000-75a53aa3-adba-4e61-a6a4-80ede8942b28.json

In [None]:
sqlContext.read.json('./dataframeWriteExample.json').show()