In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))


# Create an RDD of tuples

## setup `SparkContext`

In [4]:
def get_sc():
    """get a SparkContext (don't recreate)"""
    import pyspark

    # don't try to redefine 'sc'
    if not globals().get('sc', False):
        sc = pyspark.SparkContext('local', 'ch3 notebook')
    else:
        print('not redefining sc')
        sc = globals()['sc']
    return sc

In [3]:
sc = get_sc()

In [5]:
data_names_ages = [('Brooke', 20),
                   ('Denny', 31),
                   ('Jules', 30),
                   ('TD', 35),
                   ('Brooke', 25)]

dataRDD = sc.parallelize(data_names_ages)

In [6]:
# Use map and reduceByKey transformations with their 
# lambda expressions to aggregate and then compute average

agesRDD = (dataRDD.map(lambda x, y: (x, (y, 1)))
           .reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))
           .map(lambda x, y, z: (x, y/z))
          )

In [8]:
# Same thing, but with a dataframe

from pyspark.sql.functions import avg
from pyspark.sql import SparkSession

# create a SparkSession
spark = (SparkSession
    .builder
    .appName("example")
    .getOrCreate())

data_df = spark.createDataFrame(data_names_ages, ['name', 'age'])

# group names, aggregate age, get avg
avg_df = data_df.groupBy('name').agg(avg('age'))


avg_df.show()

+------+--------+
|  name|avg(age)|
+------+--------+
|Brooke|    22.5|
| Jules|    30.0|
|    TD|    35.0|
| Denny|    31.0|
+------+--------+



In [9]:
spark

# Create DataFrame with schema

see file `ex3-6-define-schema.py`

run it using either

```
spark-submit ex3-6-define-schema.py
```

or 

```
python ex3-6-define-schema.py
```

In [10]:
# In Python 
from pyspark.sql.types import *
from pyspark.sql import SparkSession

# define schema for our data using DDL 
schema = "`Id` INT,`First` STRING,`Last` STRING,`Url` STRING,`Published` STRING,`Hits` INT,`Campaigns` ARRAY<STRING>"
# create our static data
data = [
    [1, "Jules", "Damji", "https://tinyurl.1", "1/4/2016", 4535, ["twitter", "LinkedIn"]],
    [2, "Brooke","Wenig","https://tinyurl.2", "5/5/2018", 8908, ["twitter", "LinkedIn"]],
    [3, "Denny", "Lee", "https://tinyurl.3","6/7/2019",7659, ["web", "twitter", "FB", "LinkedIn"]],
    [4, "Tathagata", "Das","https://tinyurl.4", "5/12/2018", 10568, ["twitter", "FB"]],
    [5, "Matei","Zaharia", "https://tinyurl.5", "5/14/2014", 40578, ["web", "twitter", "FB", "LinkedIn"]],
    [6, "Reynold", "Xin", "https://tinyurl.6", "3/2/2015", 25568, ["twitter", "LinkedIn"]]
    ]


In [11]:
# create a SparkSession
spark = (SparkSession
    .builder
    .appName("Example-3_6")
    .getOrCreate())

Learning Spark, 2nd ed ch 3

In [15]:
# create a DataFrame using the schema defined above
blogs_df = spark.createDataFrame(data, schema)


In [16]:
blogs_df.schema

StructType(List(StructField(Id,IntegerType,true),StructField(First,StringType,true),StructField(Last,StringType,true),StructField(Url,StringType,true),StructField(Published,StringType,true),StructField(Hits,IntegerType,true),StructField(Campaigns,ArrayType(StringType,true),true)))

In [17]:
# these types are defined in spark.sql.types
scm = StructType(
    [StructField('Id', IntegerType(), True),
         StructField('First', StringType(), True),
         StructField('Last', StringType(), True),
         StructField('Url', StringType(), True),
         StructField('Published', StringType(), True),
         StructField('Hits', IntegerType(), True),
         StructField('Campaigns', ArrayType(StringType(), True), True)])

In [18]:
blogs2_df = spark.createDataFrame(data, scm)
blogs2_df.show()

+---+---------+-------+-----------------+---------+-----+--------------------+
| Id|    First|   Last|              Url|Published| Hits|           Campaigns|
+---+---------+-------+-----------------+---------+-----+--------------------+
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|
+---+---------+-------+-----------------+---------+-----+--------------------+



In [55]:
blogs_df.columns

['Id', 'First', 'Last', 'Url', 'Published', 'Hits', 'Campaigns']

In [54]:
blogs_df.selectExpr('Hits * 2 as double','Hits').show()

+------+-----+
|double| Hits|
+------+-----+
|  9070| 4535|
| 17816| 8908|
| 15318| 7659|
| 21136|10568|
| 81156|40578|
| 51136|25568|
+------+-----+



In [21]:
import pyspark.sql.functions as F

In [22]:
blogs2_df.select(F.col('Hits') * 2, 'Hits', F.col('Hits') - 1).show()

+----------+-----+----------+
|(Hits * 2)| Hits|(Hits - 1)|
+----------+-----+----------+
|      9070| 4535|      4534|
|     17816| 8908|      8907|
|     15318| 7659|      7658|
|     21136|10568|     10567|
|     81156|40578|     40577|
|     51136|25568|     25567|
+----------+-----+----------+



In [23]:
blogs2_df.withColumn('Big Hitters', (F.expr('Hits > 10000'))).show()

+---+---------+-------+-----------------+---------+-----+--------------------+-----------+
| Id|    First|   Last|              Url|Published| Hits|           Campaigns|Big Hitters|
+---+---------+-------+-----------------+---------+-----+--------------------+-----------+
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|      false|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|      false|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|      false|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|       true|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|       true|
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|       true|
+---+---------+-------+-----------------+---------+-----+--------------------+-----------+



In [24]:
(blogs2_df
 .withColumn('AuthorsId', 
             (F.concat(F.expr('First'),
                       F.expr('Last'), 
                       F.expr('Id'))))
 .select('AuthorsId')
 .show(n=4))

+-------------+
|    AuthorsId|
+-------------+
|  JulesDamji1|
| BrookeWenig2|
|    DennyLee3|
|TathagataDas4|
+-------------+
only showing top 4 rows



## Using `expr`

In [56]:
from pyspark.sql.functions import expr

### use `expr` to compute a value

In [60]:
blogs_df.select(expr('Hits * 2')).show(2)

+----------+
|(Hits * 2)|
+----------+
|      9070|
|     17816|
+----------+
only showing top 2 rows



### or use `col` to compute value

In [61]:
blogs_df.select(col('Hits') * 2).show(2)

+----------+
|(Hits * 2)|
+----------+
|      9070|
|     17816|
+----------+
only showing top 2 rows



### Add new columns using `withColumn` and `expr`

In [65]:
(blogs_df
 .withColumn('Big Hitters', expr('Hits > 10000'))
 .toPandas()
#  .show()
)

Unnamed: 0,Id,First,Last,Url,Published,Hits,Campaigns,Big Hitters
0,1,Jules,Damji,https://tinyurl.1,1/4/2016,4535,"[twitter, LinkedIn]",False
1,2,Brooke,Wenig,https://tinyurl.2,5/5/2018,8908,"[twitter, LinkedIn]",False
2,3,Denny,Lee,https://tinyurl.3,6/7/2019,7659,"[web, twitter, FB, LinkedIn]",False
3,4,Tathagata,Das,https://tinyurl.4,5/12/2018,10568,"[twitter, FB]",True
4,5,Matei,Zaharia,https://tinyurl.5,5/14/2014,40578,"[web, twitter, FB, LinkedIn]",True
5,6,Reynold,Xin,https://tinyurl.6,3/2/2015,25568,"[twitter, LinkedIn]",True


### use `expr` to concatenate columns

In [71]:
from pyspark.sql.functions import concat

(blogs_df
 .withColumn('AuthorsId', 
             concat(expr('First'), expr('Last'), expr('Id')))
#  .select(expr('AuthorsId'))
 .select('AuthorsId')
 .show() 
)

+-------------+
|    AuthorsId|
+-------------+
|  JulesDamji1|
| BrookeWenig2|
|    DennyLee3|
|TathagataDas4|
|MateiZaharia5|
|  ReynoldXin6|
+-------------+



## 4 ways to do the same thing

In [52]:
blogs_df.show()

+---+---------+-------+-----------------+---------+-----+--------------------+
| Id|    First|   Last|              Url|Published| Hits|           Campaigns|
+---+---------+-------+-----------------+---------+-----+--------------------+
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|
+---+---------+-------+-----------------+---------+-----+--------------------+



In [53]:
blogs2_df.show()

+---+---------+-------+-----------------+---------+-----+--------------------+
| Id|    First|   Last|              Url|Published| Hits|           Campaigns|
+---+---------+-------+-----------------+---------+-----+--------------------+
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|
+---+---------+-------+-----------------+---------+-----+--------------------+



In [73]:
from pyspark.sql.functions import column

In [74]:
blogs_df.select('Hits').show(2)
blogs_df.select(expr('Hits')).show(2)

print('"col" is short for "column"')
blogs_df.select(col('Hits')).show(2)
blogs_df.select(column('Hits')).show(2)

+----+
|Hits|
+----+
|4535|
|8908|
+----+
only showing top 2 rows

+----+
|Hits|
+----+
|4535|
|8908|
+----+
only showing top 2 rows

"col" is short for "column"
+----+
|Hits|
+----+
|4535|
|8908|
+----+
only showing top 2 rows

+----+
|Hits|
+----+
|4535|
|8908|
+----+
only showing top 2 rows



## Sort by `Id`

In [31]:
from pyspark.sql.functions import col

In [75]:
blogs_df.sort(col('Id').desc()).show()

+---+---------+-------+-----------------+---------+-----+--------------------+
| Id|    First|   Last|              Url|Published| Hits|           Campaigns|
+---+---------+-------+-----------------+---------+-----+--------------------+
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|
+---+---------+-------+-----------------+---------+-----+--------------------+



In [79]:
from pyspark.sql.functions import when

In [87]:
(blogs_df
    .withColumn('new',
               when(blogs_df['Id'].isin([1,3,5]),
                    col('First'))
               .otherwise('*' * 8))
.show())

+---+---------+-------+-----------------+---------+-----+--------------------+--------+
| Id|    First|   Last|              Url|Published| Hits|           Campaigns|     new|
+---+---------+-------+-----------------+---------+-----+--------------------+--------+
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|   Jules|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|********|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|   Denny|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|********|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|   Matei|
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|********|
+---+---------+-------+-----------------+---------+-----+--------------------+--------+



NOTE: Ch 3 says that `$` is a spark function that converts something to a column. However, it doesn't work here

In [42]:
(blogs2_df
 .filter('`Id` < 6')
 .filter('Id > 2')
 .sort(col('Id').desc())
 .show())

+---+---------+-------+-----------------+---------+-----+--------------------+
| Id|    First|   Last|              Url|Published| Hits|           Campaigns|
+---+---------+-------+-----------------+---------+-----+--------------------+
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|
+---+---------+-------+-----------------+---------+-----+--------------------+



In [48]:
# `where` and `filter` are the same

In [44]:
# use plain SQL in `filter`
(blogs2_df
 .filter('`Id` < 6')
 .filter('Id > 2')
 .filter("Last like 'Z%'")
 .sort(col('Id').desc())
 .show())

+---+-----+-------+-----------------+---------+-----+--------------------+
| Id|First|   Last|              Url|Published| Hits|           Campaigns|
+---+-----+-------+-----------------+---------+-----+--------------------+
|  5|Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|
+---+-----+-------+-----------------+---------+-----+--------------------+



In [47]:
# use plain SQL in `filter`
(blogs2_df
 .filter('`Id` < 6')
 .filter('Id > 2')
 .where("Last like 'Z%'")
 .sort(col('Id').desc())
 .show())

+---+-----+-------+-----------------+---------+-----+--------------------+
| Id|First|   Last|              Url|Published| Hits|           Campaigns|
+---+-----+-------+-----------------+---------+-----+--------------------+
|  5|Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|
+---+-----+-------+-----------------+---------+-----+--------------------+



# Rows

## Instantiate a row

In [91]:
# In Python
from pyspark.sql import Row

blog_row = Row(6, "Reynold", "Xin", "https://tinyurl.6", 255568, 
               "3/2/2015", ["twitter", "LinkedIn"])

In [98]:
# access using index for individual items
print(f"index 0: {blog_row[0]}")
print(f"index 1: {blog_row[1]}")

print()

for item in blog_row:
    print(item)

index 0: 6
index 1: Reynold

6
Reynold
Xin
https://tinyurl.6
255568
3/2/2015
['twitter', 'LinkedIn']


In [95]:
len(blog_row)

7

## Row objects can be used to create DataFrames if you need them for quick interactivity and exploration. 

In [106]:
# In Python 
from pyspark.sql import Row
from pyspark.sql.types import *

In [107]:
# using DDL String to define a schema
schema = "`Author` STRING, `State` STRING"
rows = [Row("Matei Zaharia", "CA"), Row("Reynold Xin", "CA")]

In [108]:
authors_df = spark.createDataFrame(rows, schema)
authors_df.show()

+-------------+-----+
|       Author|State|
+-------------+-----+
|Matei Zaharia|   CA|
|  Reynold Xin|   CA|
+-------------+-----+



In [115]:
schema2 = """
    author string, 
    state string
    """
spark.createDataFrame(rows, schema2).show()

+-------------+-----+
|       author|state|
+-------------+-----+
|Matei Zaharia|   CA|
|  Reynold Xin|   CA|
+-------------+-----+



In [111]:
spark.createDataFrame(rows, ['author', 'state']).show()

+-------------+-----+
|       author|state|
+-------------+-----+
|Matei Zaharia|   CA|
|  Reynold Xin|   CA|
+-------------+-----+



## drop a column

In [None]:
authors_df.drop('State').show()

# Common DataFrame Operations

In [117]:
import os

In [123]:
spark_dir = '/Users/bartev/dev/spark-3.0.0-preview2-bin-hadoop2.7'
people_file_relative = 'examples/src/main/resources/people.csv'
people_file = os.path.join(spark_dir, people_file_relative)

In [124]:
people_file

'/Users/bartev/dev/spark-3.0.0-preview2-bin-hadoop2.7/examples/src/main/resources/people.csv'

In [125]:
from pyspark.sql.types import *

## Programmatic way to define a schema

In [126]:
people_schema = StructType(
    [StructField('name', StringType(), True),
     StructField('age', IntegerType(), True),
     StructField('job', StringType(), True)])

## infer schema from a smaller sample

In [133]:
(spark
 .read
 .option('samplingRatio', 0.5)
 .option('header', 'true')
 .csv(people_file)
 .show())

+------------------+
|      name;age;job|
+------------------+
|Jorge;30;Developer|
|  Bob;32;Developer|
+------------------+



## read the file using DataFrameReader using format csv

In [128]:
people_df = spark.read.csv(people_file, 
                           header=True, 
                           schema=people_schema, 
                           sep=';')

### Can separate out options

In [143]:
(spark
 .read
 .option('header', 'true')
 .option('schema', people_schema)
 .option('sep', ';')
 .csv(people_file)
 .show())

+-----+---+---------+
| name|age|      job|
+-----+---+---------+
|Jorge| 30|Developer|
|  Bob| 32|Developer|
+-----+---+---------+



### option `True` or `"true"` - both ok

In [146]:
(spark
 .read
 .option('header', True)
 .option('schema', people_schema)
 .option('sep', ';')
 .csv(people_file)
 .show())

+-----+---+---------+
| name|age|      job|
+-----+---+---------+
|Jorge| 30|Developer|
|  Bob| 32|Developer|
+-----+---+---------+



In [129]:
people_df.show()

+-----+---+---------+
| name|age|      job|
+-----+---+---------+
|Jorge| 30|Developer|
|  Bob| 32|Developer|
+-----+---+---------+



## `DataFrameWriter`

Parquet

* default format
* uses compression
* preserves schema as part of metadata

In [149]:
people_tbl = (people_df
              .write
              .format('parquet')
              .mode('overwrite')
              .save('people.parquet'))

In [151]:
spark.read.parquet('people.parquet').show()

+-----+---+---------+
| name|age|      job|
+-----+---+---------+
|Jorge| 30|Developer|
|  Bob| 32|Developer|
+-----+---+---------+



In [155]:
parquet_table = 'people_tbl'
(people_df.write
    .format('parquet')
    .mode('overwrite')
    .saveAsTable(parquet_table))

## Projections and filters

* projection
    * `select()`
* filter
    * `filter()`
    * `where()`

In [156]:
people_df = spark.read.csv(people_file, header=True, 
                           schema=people_schema, sep=';')

In [157]:
people_df.show()

+-----+---+---------+
| name|age|      job|
+-----+---+---------+
|Jorge| 30|Developer|
|  Bob| 32|Developer|
+-----+---+---------+



### sql like

In [162]:
(people_df
 .select('age', 'name')
 .where('age != 30')
 .show())

+---+----+
|age|name|
+---+----+
| 32| Bob|
+---+----+



### use `col`

In [161]:
(people_df
 .select('age', 'name')
 .where(col('age') > 30)
 .show())

+---+----+
|age|name|
+---+----+
| 32| Bob|
+---+----+



### use an `expr`

In [160]:
(people_df
 .select('age', 'name')
 .where(expr('age > 30'))
 .show())

+---+----+
|age|name|
+---+----+
| 32| Bob|
+---+----+



## Load movies

In [165]:
movie_fname = '/Users/bartev/dev/github-bv/san-tan/lrn-spark/Data-ML-100k--master/ml-100k/u.item'

In [166]:
movies_df = spark.read.csv(movie_fname, header=False, sep='|')

In [167]:
movies_df.show(3)

+---+-----------------+-----------+----+--------------------+---+---+---+---+---+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
|_c0|              _c1|        _c2| _c3|                 _c4|_c5|_c6|_c7|_c8|_c9|_c10|_c11|_c12|_c13|_c14|_c15|_c16|_c17|_c18|_c19|_c20|_c21|_c22|_c23|
+---+-----------------+-----------+----+--------------------+---+---+---+---+---+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
|  1| Toy Story (1995)|01-Jan-1995|null|http://us.imdb.co...|  0|  0|  0|  1|  1|   1|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|
|  2| GoldenEye (1995)|01-Jan-1995|null|http://us.imdb.co...|  0|  1|  1|  0|  0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   1|   0|   0|
|  3|Four Rooms (1995)|01-Jan-1995|null|http://us.imdb.co...|  0|  0|  0|  0|  0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   1|   0|   0|
+---+-----------------+-----------+----+--------------------+---+---+---+---+---+----+--

## Rename columns

In [169]:
movies = (movies_df
 .select('_c0', '_c1', '_c2', '_c4')
 .withColumnRenamed('_c0', 'id')
 .withColumnRenamed('_c1', 'title')
 .withColumnRenamed('_c2', 'date')
 .withColumnRenamed('_c4', 'url')
)
movies.show(5)

+---+-----------------+-----------+--------------------+
| id|            title|       date|                 url|
+---+-----------------+-----------+--------------------+
|  1| Toy Story (1995)|01-Jan-1995|http://us.imdb.co...|
|  2| GoldenEye (1995)|01-Jan-1995|http://us.imdb.co...|
|  3|Four Rooms (1995)|01-Jan-1995|http://us.imdb.co...|
|  4|Get Shorty (1995)|01-Jan-1995|http://us.imdb.co...|
|  5|   Copycat (1995)|01-Jan-1995|http://us.imdb.co...|
+---+-----------------+-----------+--------------------+
only showing top 5 rows



In [182]:
movies.schema

StructType(List(StructField(id,StringType,true),StructField(title,StringType,true),StructField(date,StringType,true),StructField(url,StringType,true)))

In [181]:
# Rename columns

(movies
 .where('date > "1996-01-01"')
 .where('id > 30')
 .show(5)
)

+---+--------------------+-----------+--------------------+
| id|               title|       date|                 url|
+---+--------------------+-----------+--------------------+
| 93|Welcome to the Do...|24-May-1996|http://us.imdb.co...|
|103|All Dogs Go to He...|29-Mar-1996|http://us.imdb.co...|
|104| Theodore Rex (1995)|29-Mar-1996|http://us.imdb.co...|
|105|   Sgt. Bilko (1996)|29-Mar-1996|http://us.imdb.co...|
|111|Truth About Cats ...|26-Apr-1996|http://us.imdb.co...|
+---+--------------------+-----------+--------------------+
only showing top 5 rows



In [174]:
(movies
 .where('date > "1996-01-01"')
 .where('id > 30')
#  .schema
 .select('date')
 .distinct()
 .count()
)

71

In [None]:
# how do I convert column 'id' to an int?

(movies_df
 .select('_c0', '_c1', '_c2', '_c4')
 .withColumnRenamed('_c0', 'id')
 .withColumnRenamed('_c1', 'title')
 .withColumnRenamed('_c2', 'date')
 .withColumnRenamed('_c4', 'url')
#  .where('date > "1996-01-01"')
 .where('id > 30')
 .where(F.col('id') < "38")
#  .schema
 .select('id', 'date')
#  .distinct()
 .show(10, False)
)

## Change data types

In [183]:
movies_df2 = (movies
 .where('id > 30')
 .select('id', 'date')
)

In [184]:
movies_df2.count()

1652

In [185]:
movies_df2.schema

StructType(List(StructField(id,StringType,true),StructField(date,StringType,true)))

In [186]:
movies_df2.show(10)

+---+-----------+
| id|       date|
+---+-----------+
| 31|01-Jan-1995|
| 32|01-Jan-1994|
| 33|01-Jan-1995|
| 34|01-Jan-1995|
| 35|01-Jan-1995|
| 36|01-Jan-1995|
| 37|01-Jan-1994|
| 38|01-Jan-1995|
| 39|01-Jan-1995|
| 40|01-Jan-1995|
+---+-----------+
only showing top 10 rows



In [187]:
movies_df2.describe().show()

+-------+-----------------+-----------+
|summary|               id|       date|
+-------+-----------------+-----------+
|  count|             1652|       1651|
|   mean|            856.5|       null|
| stddev|477.0356380816846|       null|
|    min|              100|01-Aug-1997|
|    max|              999| 4-Feb-1971|
+-------+-----------------+-----------+



### Date functions

aside: simple example from help docs

In [211]:
df = spark.createDataFrame([('2020-06-13 12:52', )], ['t'])

print(df.collect())

print(df.select('t', to_date(df.t).alias('date')).collect())

print(df.select('t', to_date(df.t, 'yyyy-MM-dd').alias('date')).collect())

[Row(t='2020-06-13 12:52')]
[Row(t='2020-06-13 12:52', date=datetime.date(2020, 6, 13))]
[Row(t='2020-06-13 12:52', date=datetime.date(2020, 6, 13))]


### convert to date, and rename

In [222]:
df.select(to_date(col('t')).alias('other')).collect()

[Row(other=datetime.date(2020, 6, 13))]

In [226]:
(df.select('t', 
          col('t').alias('first'),
          to_date(col('t')).alias('other'))
#  .collect()
#  .show()
 .schema
)

StructType(List(StructField(t,StringType,true),StructField(first,StringType,true),StructField(other,DateType,true)))

In [230]:
F.current_timestamp

Column<b'current_date()'>

In [251]:
from pyspark.sql.functions import to_date, to_timestamp, col, date_format
from pyspark.sql.functions import current_date, current_timestamp

(movies_df2
 .withColumn('new_date', to_date(col('date'), 'dd-MMM-yyyy'))
 .withColumn('new_ts', to_timestamp(col('date'), 'dd-MMM-yyyy'))
 .withColumn('new_ts', date_format(col('new_ts'), 'yyyy.MMM.dd E'))

 .withColumn('cur_date', current_date())
 .withColumn('cur_ts', date_format(current_timestamp(), 'MM-yyyy'))
 .where(col('new_date') < '1990-01-01')
#  .show()
 .printSchema()
#  .schema
)


root
 |-- id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- new_date: date (nullable = true)
 |-- new_ts: string (nullable = true)
 |-- cur_date: date (nullable = false)
 |-- cur_ts: string (nullable = false)



### order by year

In [None]:
import pyspark.sql.functions as F

In [253]:
(movies
 .withColumn('new_date', F.to_date(F.col('date'), 'dd-MMM-yyyy'))
 .withColumn('new_ts', F.to_timestamp(F.col('date'), 'dd-MMM-yyyy'))
 .where(F.col('new_date') < '1990-01-01')
 .orderBy(F.year('new_date'))
 .withColumn('year', F.year('new_date'))
 .withColumn('month', F.month('new_date'))
 .where(F.col('month') != 1)
 .filter('month > 4')
 .show())


+----+--------------------+-----------+--------------------+----------+-------------------+----+-----+
|  id|               title|       date|                 url|  new_date|             new_ts|year|month|
+----+--------------------+-----------+--------------------+----------+-------------------+----+-----+
|1198|  Purple Noon (1960)|28-Jun-1960|http://us.imdb.co...|1960-06-28|1960-06-28 00:00:00|1960|    6|
|1149|    Walkabout (1971)|20-Dec-1971|http://us.imdb.co...|1971-12-20|1971-12-20 00:00:00|1971|   12|
|1187|Switchblade Siste...|17-May-1975|http://us.imdb.co...|1975-05-17|1975-05-17 00:00:00|1975|    5|
+----+--------------------+-----------+--------------------+----------+-------------------+----+-----+



## write to csv

### With `repartition`

In [256]:
(movies
 .withColumn('year', F.year(F.to_date(F.col('date'), 'dd-MMM-yyyy')))
 .select('year')
 .distinct()
 .orderBy('year')
 .where('date != "null"')
 .repartition(1)
 .write
 .format('csv')
 .option('header', True)
 .mode('overwrite')
 .save('movie_dates.csv')
)

### with `coalesce`

In [264]:
(movies
 .withColumn('year', F.year(F.to_date(F.col('date'), 'dd-MMM-yyyy')))
 .select('year')
 .distinct()
 .orderBy('year')
 .where('date != "null"')
 .coalesce(1)
 .write
 .format('csv')
 .option('header', 'true')
 .mode('overwrite')
 .save('movie_dates_coalesce.csv')
)

### with `pandas`

In [269]:
(movies
 .withColumn('year', F.year(F.to_date(F.col('date'), 'dd-MMM-yyyy')))
 .select('year', 'date')
 .distinct()
 .orderBy('year')
 .where('date != "null"')
 .toPandas()
 .to_csv('movie_dates_pandas.csv', header=True, index=False)
)

## Aggregates

In [270]:
movies.show()

+---+--------------------+-----------+--------------------+
| id|               title|       date|                 url|
+---+--------------------+-----------+--------------------+
|  1|    Toy Story (1995)|01-Jan-1995|http://us.imdb.co...|
|  2|    GoldenEye (1995)|01-Jan-1995|http://us.imdb.co...|
|  3|   Four Rooms (1995)|01-Jan-1995|http://us.imdb.co...|
|  4|   Get Shorty (1995)|01-Jan-1995|http://us.imdb.co...|
|  5|      Copycat (1995)|01-Jan-1995|http://us.imdb.co...|
|  6|Shanghai Triad (Y...|01-Jan-1995|http://us.imdb.co...|
|  7|Twelve Monkeys (1...|01-Jan-1995|http://us.imdb.co...|
|  8|         Babe (1995)|01-Jan-1995|http://us.imdb.co...|
|  9|Dead Man Walking ...|01-Jan-1995|http://us.imdb.co...|
| 10|  Richard III (1995)|22-Jan-1996|http://us.imdb.co...|
| 11|Seven (Se7en) (1995)|01-Jan-1995|http://us.imdb.co...|
| 12|Usual Suspects, T...|14-Aug-1995|http://us.imdb.co...|
| 13|Mighty Aphrodite ...|30-Oct-1995|http://us.imdb.co...|
| 14|  Postino, Il (1994)|01-Jan-1994|ht

In [274]:
movies_df = (spark.read
             .csv(movie_fname, header=False, sep='|')
             .select('_c0', '_c1', '_c2', '_c4')
             .withColumnRenamed('_c0', 'id')
             .withColumnRenamed('_c1', 'title')
             .withColumnRenamed('_c2', 'date')
             .withColumnRenamed('_c4', 'url')
             .where('id > 30')
             .select('id', 'date', 'title', 'url')
)

print(f'{movies_df.count()} rows')

movies_df.show()

1652 rows
+---+-----------+--------------------+--------------------+
| id|       date|               title|                 url|
+---+-----------+--------------------+--------------------+
| 31|01-Jan-1995| Crimson Tide (1995)|http://us.imdb.co...|
| 32|01-Jan-1994|        Crumb (1994)|http://us.imdb.co...|
| 33|01-Jan-1995|    Desperado (1995)|http://us.imdb.co...|
| 34|01-Jan-1995|Doom Generation, ...|http://us.imdb.co...|
| 35|01-Jan-1995|Free Willy 2: The...|http://us.imdb.co...|
| 36|01-Jan-1995|     Mad Love (1995)|http://us.imdb.co...|
| 37|01-Jan-1994|        Nadja (1994)|http://us.imdb.co...|
| 38|01-Jan-1995|     Net, The (1995)|http://us.imdb.co...|
| 39|01-Jan-1995| Strange Days (1995)|http://us.imdb.co...|
| 40|01-Jan-1995|To Wong Foo, Than...|http://us.imdb.co...|
| 41|01-Jan-1995|Billy Madison (1995)|http://us.imdb.co...|
| 42|01-Jan-1994|       Clerks (1994)|http://us.imdb.co...|
| 43|01-Jan-1994|   Disclosure (1994)|http://us.imdb.co...|
| 44|01-Jan-1994|Dolores Claib

In [275]:
movies_df.toPandas().head(3)

Unnamed: 0,id,date,title,url
0,31,01-Jan-1995,Crimson Tide (1995),http://us.imdb.com/M/title-exact?Crimson%20Tid...
1,32,01-Jan-1994,Crumb (1994),http://us.imdb.com/M/title-exact?Crumb%20(1994)
2,33,01-Jan-1995,Desperado (1995),http://us.imdb.com/M/title-exact?Desperado%20(...


### String replacement using `regexp_repl`

In [289]:
from pyspark.sql.functions import split, regexp_replace

(movies_df
 .withColumn('tit2', split('url', 'title-exact\?')[1])
 .withColumn('tit2', regexp_replace('tit2', '%20', ' '))
 .show())

+---+-----------+--------------------+--------------------+--------------------+
| id|       date|               title|                 url|                tit2|
+---+-----------+--------------------+--------------------+--------------------+
| 31|01-Jan-1995| Crimson Tide (1995)|http://us.imdb.co...| Crimson Tide (1995)|
| 32|01-Jan-1994|        Crumb (1994)|http://us.imdb.co...|        Crumb (1994)|
| 33|01-Jan-1995|    Desperado (1995)|http://us.imdb.co...|    Desperado (1995)|
| 34|01-Jan-1995|Doom Generation, ...|http://us.imdb.co...|Doom Generation, ...|
| 35|01-Jan-1995|Free Willy 2: The...|http://us.imdb.co...|Free Willy 2: The...|
| 36|01-Jan-1995|     Mad Love (1995)|http://us.imdb.co...|     Mad Love (1995)|
| 37|01-Jan-1994|        Nadja (1994)|http://us.imdb.co...|        Nadja (1994)|
| 38|01-Jan-1995|     Net, The (1995)|http://us.imdb.co...|     Net, The (1995)|
| 39|01-Jan-1995| Strange Days (1995)|http://us.imdb.co...| Strange Days (1995)|
| 40|01-Jan-1995|To Wong Foo

In [293]:
from pyspark.sql.functions import year

In [304]:
(movies_df
 .where(col('date').isNotNull())
 .select('date')
 .withColumn('dt', to_date('date', 'dd-MMM-yyyy'))
 .withColumn('yr', year(to_date(col('date'), 'dd-MMM-yyyy')))
 .groupBy('yr')
 .count()
 .orderBy('count', ascending=False)
 .show(truncate=False))

+----+-----+
|yr  |count|
+----+-----+
|1996|347  |
|1997|286  |
|1994|213  |
|1995|199  |
|1993|126  |
|1998|65   |
|1992|37   |
|1990|24   |
|1991|22   |
|1989|15   |
|1986|15   |
|1987|13   |
|1982|13   |
|1981|12   |
|1988|11   |
|1979|9    |
|1958|9    |
|1984|8    |
|1940|8    |
|1957|8    |
+----+-----+
only showing top 20 rows



In [321]:
tmp = (movies_df
 .where(col('date').isNotNull())
 .select('date')
 .withColumn('dt', to_date('date', 'dd-MMM-yyyy'))
 .withColumn('yr', year(to_date(col('date'), 'dd-MMM-yyyy')))
 .groupBy('yr')
 .count()
 .orderBy('count', ascending=False)
#  .select(F.sum('count'), F.avg('count'), F.stddev('count'), F.min('count'), F.max('count'))
#  .show(truncate=False)
)
tmp.show()

+----+-----+
|  yr|count|
+----+-----+
|1996|  347|
|1997|  286|
|1994|  213|
|1995|  199|
|1993|  126|
|1998|   65|
|1992|   37|
|1990|   24|
|1991|   22|
|1989|   15|
|1986|   15|
|1987|   13|
|1982|   13|
|1981|   12|
|1988|   11|
|1979|    9|
|1958|    9|
|1984|    8|
|1940|    8|
|1957|    8|
+----+-----+
only showing top 20 rows



In [290]:
(movies_df
 .where(F.col('date').isNotNull())
 .withColumn('new_date', F.to_date('date', 'dd-MMM-yyyy'))
 .withColumn('year', F.year(F.to_date(F.col('date'), 'dd-MMM-yyyy')))
 .groupBy('year')
 .count()
 .orderBy('count', ascending=False)
 .select(F.sum('count'), F.avg('count'), F.stddev('count'), F.min('count'), F.max('count'))
 .show())

+----------+------------------+------------------+----------+----------+
|sum(count)|        avg(count)|stddev_samp(count)|min(count)|max(count)|
+----------+------------------+------------------+----------+----------+
|      1651|23.253521126760564| 62.53769567454248|         1|       347|
+----------+------------------+------------------+----------+----------+



In [None]:
(movies_df
 .where(F.col('date').isNotNull())
 .withColumn('new_date', F.to_date('date', 'dd-MMM-yyyy'))
 .withColumn('year', F.year(F.to_date(F.col('date'), 'dd-MMM-yyyy')))
 .groupBy('year')
 .count()
 .orderBy('count', ascending=False)
 .select(F.sum('count'), F.avg('count'), F.stddev('count'), F.min('count'), F.max('count'))
 .printSchema())

# Datasets API

In [None]:
from pyspark.sql import Row
row = Row(350, True, 'Learning Spark 2E', None)

In [None]:
row

In [None]:
from pyspark.sql import Row
row = Row(350, True, "Learning Spark 2E", None)

In [None]:
row[0]

In [None]:
row[1]

In [None]:
[r for r in row]

In [None]:
type(row)