# Create DataFrame with schema

In [1]:
# In Python 
from pyspark.sql.types import *
from pyspark.sql import SparkSession
# define schema for our data using DDL 
schema = "`Id` INT,`First` STRING,`Last` STRING,`Url` STRING,`Published` STRING,`Hits` INT,`Campaigns` ARRAY<STRING>"
# create our static data
data = [
    [1, "Jules", "Damji", "https://tinyurl.1", "1/4/2016", 4535, ["twitter", "LinkedIn"]],
    [2, "Brooke","Wenig","https://tinyurl.2", "5/5/2018", 8908, ["twitter", "LinkedIn"]],
    [3, "Denny", "Lee", "https://tinyurl.3","6/7/2019",7659, ["web", "twitter", "FB", "LinkedIn"]],
    [4, "Tathagata", "Das","https://tinyurl.4", "5/12/2018", 10568, ["twitter", "FB"]],
    [5, "Matei","Zaharia", "https://tinyurl.5", "5/14/2014", 40578, ["web", "twitter", "FB", "LinkedIn"]],
    [6, "Reynold", "Xin", "https://tinyurl.6", "3/2/2015", 25568, ["twitter", "LinkedIn"]]
    ]


In [2]:
# create a SparkSession
spark = (SparkSession
    .builder
    .appName("Example-3_6")
    .getOrCreate())

Learning Spark, 2nd ed ch 3

In [3]:
# create a DataFrame using the schema defined above
blogs_df = spark.createDataFrame(data, schema)


In [4]:
blogs_df.schema

StructType(List(StructField(Id,IntegerType,true),StructField(First,StringType,true),StructField(Last,StringType,true),StructField(Url,StringType,true),StructField(Published,StringType,true),StructField(Hits,IntegerType,true),StructField(Campaigns,ArrayType(StringType,true),true)))

In [10]:
import pyspark.sql.types as pst

In [23]:
scm = StructType(
    [StructField('Id', IntegerType(), True),
         StructField('First', StringType(), True),
         StructField('Last', StringType(), True),
         StructField('Url', StringType(), True),
         StructField('Published', StringType(), True),
         StructField('Hits', IntegerType(), True),
         StructField('Campaigns', ArrayType(StringType(), True), True)])

In [25]:
blogs2_df = spark.createDataFrame(data, scm)
blogs2_df.show()

+---+---------+-------+-----------------+---------+-----+--------------------+
| Id|    First|   Last|              Url|Published| Hits|           Campaigns|
+---+---------+-------+-----------------+---------+-----+--------------------+
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|
+---+---------+-------+-----------------+---------+-----+--------------------+



In [28]:
blogs2_df.columns

['Id', 'First', 'Last', 'Url', 'Published', 'Hits', 'Campaigns']

In [31]:
blogs2_df.selectExpr('Hits * 2 as double','Hits').show()

+------+-----+
|double| Hits|
+------+-----+
|  9070| 4535|
| 17816| 8908|
| 15318| 7659|
| 21136|10568|
| 81156|40578|
| 51136|25568|
+------+-----+



In [33]:
import pyspark.sql.functions as F

In [34]:
blogs2_df.select(F.col('Hits') * 2).show()

+----------+
|(Hits * 2)|
+----------+
|      9070|
|     17816|
|     15318|
|     21136|
|     81156|
|     51136|
+----------+



In [36]:
blogs2_df.withColumn('Big Hitters', (F.expr('Hits > 10000'))).show()

+---+---------+-------+-----------------+---------+-----+--------------------+-----------+
| Id|    First|   Last|              Url|Published| Hits|           Campaigns|Big Hitters|
+---+---------+-------+-----------------+---------+-----+--------------------+-----------+
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|      false|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|      false|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|      false|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|       true|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|       true|
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|       true|
+---+---------+-------+-----------------+---------+-----+--------------------+-----------+



In [44]:
(blogs2_df
 .withColumn('AuthorsId', 
             (F.concat(F.expr('First'),
                       F.expr('Last'), 
                       F.expr('Id'))))
 .select('AuthorsId')
 .show(n=4))

+-------------+
|    AuthorsId|
+-------------+
|  JulesDamji1|
| BrookeWenig2|
|    DennyLee3|
|TathagataDas4|
+-------------+
only showing top 4 rows



## 4 ways to do the same thing

In [49]:
blogs2_df.select('Hits').show(2)
blogs2_df.select(F.expr('Hits')).show(2)
print('"col" is short for "column"')
blogs2_df.select(F.col('Hits')).show(2)
blogs2_df.select(F.column('Hits')).show(2)

+----+
|Hits|
+----+
|4535|
|8908|
+----+
only showing top 2 rows

+----+
|Hits|
+----+
|4535|
|8908|
+----+
only showing top 2 rows

"col" is short for "column"
+----+
|Hits|
+----+
|4535|
|8908|
+----+
only showing top 2 rows

+----+
|Hits|
+----+
|4535|
|8908|
+----+
only showing top 2 rows



## Sort by `Id`

In [56]:
blogs2_df.sort(F.col('Id').desc()).show()

+---+---------+-------+-----------------+---------+-----+--------------------+
| Id|    First|   Last|              Url|Published| Hits|           Campaigns|
+---+---------+-------+-----------------+---------+-----+--------------------+
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|
+---+---------+-------+-----------------+---------+-----+--------------------+



Hmm. `$` doesn't work to convert to a column

In [63]:
blogs_df.sort($'Id').show()

SyntaxError: invalid syntax (<ipython-input-63-63099fbf8b4a>, line 1)

# Rows

## Instantiate a row

In [64]:
# In Python
from pyspark.sql import Row

blog_row = Row(6, "Reynold", "Xin", "https://tinyurl.6", 255568, 
               "3/2/2015", ["twitter", "LinkedIn"])

In [67]:
# access using index for individual items

blog_row[1]

'Reynold'

## Row objects can be used to create DataFrames if you need them for quick interactivity and exploration. 

In [70]:
# In Python 
from pyspark.sql import Row
from pyspark.sql.types import *

In [68]:
# using DDL String to define a schema
schema = "`Author` STRING, `State` STRING"
rows = [Row("Matei Zaharia", "CA"), Row("Reynold Xin", "CA")]

In [71]:
authors_df = spark.createDataFrame(rows, schema)
authors_df.show()

+-------------+-----+
|       Author|State|
+-------------+-----+
|Matei Zaharia|   CA|
|  Reynold Xin|   CA|
+-------------+-----+



## drop a column

In [215]:
authors_df.drop('State').show()

+-------------+
|       Author|
+-------------+
|Matei Zaharia|
|  Reynold Xin|
+-------------+



# Common DataFrame Operations

In [76]:
people_file = '/Users/bartev/dev/spark-3.0.0-preview2-bin-hadoop2.7/examples/src/main/resources/people.csv'

In [74]:
from pyspark.sql.types import *

## Programmatic way to define a schema

In [75]:
people_schema = StructType([StructField('name', StringType(), True),
                           StructField('age', IntegerType(), True),
                           StructField('job', StringType(), True)])

## read the file using DataFrameReader using format csv

In [79]:
people_df = spark.read.csv(people_file, header=True, schema=people_schema, sep=';')

In [80]:
people_df.show()

+-----+---+---------+
| name|age|      job|
+-----+---+---------+
|Jorge| 30|Developer|
|  Bob| 32|Developer|
+-----+---+---------+



In [92]:
people_tbl = people_df.write.format('parquet').save('people.parquet')

In [85]:
spark.read.parquet('people.parquet')

StructType(List(StructField(name,StringType,true),StructField(age,IntegerType,true),StructField(job,StringType,true)))

In [93]:
parquet_table = 'people_tbl'
(people_df.write
    .format('parquet')
    .saveAsTable(parquet_table))

## Projections and filters

In [94]:
people_df = spark.read.csv(people_file, header=True, schema=people_schema, sep=';')

In [97]:
(people_df.select('age')
     .where('age > 30')
    .show())

+---+
|age|
+---+
| 32|
+---+



In [98]:
movie_fname = '/Users/bartev/dev/github-bv/san-tan/lrn-spark/Data-ML-100k--master/ml-100k/u.item'

In [105]:
movies_df = spark.read.csv(movie_fname, header=False, sep='|')

In [132]:
# Rename columns

(movies_df
 .select('_c0', '_c1', '_c2', '_c4')
 .withColumnRenamed('_c0', 'id')
 .withColumnRenamed('_c1', 'title')
 .withColumnRenamed('_c2', 'date')
 .withColumnRenamed('_c4', 'url')
 .where('date > "1996-01-01"')
 .where('id > 30')
#  .schema
 .show(5)
)

+---+--------------------+-----------+--------------------+
| id|               title|       date|                 url|
+---+--------------------+-----------+--------------------+
| 93|Welcome to the Do...|24-May-1996|http://us.imdb.co...|
|103|All Dogs Go to He...|29-Mar-1996|http://us.imdb.co...|
|104| Theodore Rex (1995)|29-Mar-1996|http://us.imdb.co...|
|105|   Sgt. Bilko (1996)|29-Mar-1996|http://us.imdb.co...|
|111|Truth About Cats ...|26-Apr-1996|http://us.imdb.co...|
+---+--------------------+-----------+--------------------+
only showing top 5 rows



In [127]:
(movies_df
 .select('_c0', '_c1', '_c2', '_c4')
 .withColumnRenamed('_c0', 'id')
 .withColumnRenamed('_c1', 'title')
 .withColumnRenamed('_c2', 'date')
 .withColumnRenamed('_c4', 'url')
 .where('date > "1996-01-01"')
 .where('id > 30')
#  .schema
 .select('date')
 .distinct()
 .count()
)

71

In [143]:
# how do I convert column 'id' to an int?

(movies_df
 .select('_c0', '_c1', '_c2', '_c4')
 .withColumnRenamed('_c0', 'id')
 .withColumnRenamed('_c1', 'title')
 .withColumnRenamed('_c2', 'date')
 .withColumnRenamed('_c4', 'url')
#  .where('date > "1996-01-01"')
 .where('id > 30')
 .where(F.col('id') < "38")
#  .schema
 .select('id', 'date')
#  .distinct()
 .show(10, False)
)

+---+-----------+
|id |date       |
+---+-----------+
|31 |01-Jan-1995|
|32 |01-Jan-1994|
|33 |01-Jan-1995|
|34 |01-Jan-1995|
|35 |01-Jan-1995|
|36 |01-Jan-1995|
|37 |01-Jan-1994|
|100|14-Feb-1997|
|101|08-Mar-1981|
|102|01-Jan-1970|
+---+-----------+
only showing top 10 rows



## Change data types

In [153]:
movies_df2 = (movies_df
 .select('_c0', '_c1', '_c2', '_c4')
 .withColumnRenamed('_c0', 'id')
 .withColumnRenamed('_c1', 'title')
 .withColumnRenamed('_c2', 'date')
 .withColumnRenamed('_c4', 'url')
 .where('id > 30')
 .select('id', 'date')
)

In [145]:
movies_df2.count()

1652

In [146]:
movies_df2.schema

StructType(List(StructField(id,StringType,true),StructField(date,StringType,true)))

In [147]:
movies_df2.show(10)

+---+-----------+
| id|       date|
+---+-----------+
| 31|01-Jan-1995|
| 32|01-Jan-1994|
| 33|01-Jan-1995|
| 34|01-Jan-1995|
| 35|01-Jan-1995|
| 36|01-Jan-1995|
| 37|01-Jan-1994|
| 38|01-Jan-1995|
| 39|01-Jan-1995|
| 40|01-Jan-1995|
+---+-----------+
only showing top 10 rows



In [149]:
movies_df2.describe().show()

+-------+-----------------+-----------+
|summary|               id|       date|
+-------+-----------------+-----------+
|  count|             1652|       1651|
|   mean|            856.5|       null|
| stddev|477.0356380816846|       null|
|    min|              100|01-Aug-1997|
|    max|              999| 4-Feb-1971|
+-------+-----------------+-----------+



### Date functions

In [169]:
(movies_df2
 .withColumn('new_date', F.to_date(F.col('date'), 'dd-MMM-yyyy'))
 .withColumn('new_ts', F.to_timestamp(F.col('date'), 'dd-MMM-yyyy'))
 .where(F.col('new_date') < '1990-01-01')
 .show())


+---+-----------+----------+-------------------+
| id|       date|  new_date|             new_ts|
+---+-----------+----------+-------------------+
| 50|01-Jan-1977|1977-01-01|1977-01-01 00:00:00|
| 74|01-Jan-1965|1965-01-01|1965-01-01 00:00:00|
| 89|01-Jan-1982|1982-01-01|1982-01-01 00:00:00|
| 99|01-Jan-1937|1937-01-01|1937-01-01 00:00:00|
|101|08-Mar-1981|1981-03-08|1981-03-08 00:00:00|
|102|01-Jan-1970|1970-01-01|1970-01-01 00:00:00|
|127|01-Jan-1972|1972-01-01|1972-01-01 00:00:00|
|131|01-Jan-1961|1961-01-01|1961-01-01 00:00:00|
|132|01-Jan-1939|1939-01-01|1939-01-01 00:00:00|
|133|01-Jan-1939|1939-01-01|1939-01-01 00:00:00|
|134|01-Jan-1941|1941-01-01|1941-01-01 00:00:00|
|135|01-Jan-1968|1968-01-01|1968-01-01 00:00:00|
|136|01-Jan-1939|1939-01-01|1939-01-01 00:00:00|
|139|01-Jan-1969|1969-01-01|1969-01-01 00:00:00|
|141|01-Jan-1954|1954-01-01|1954-01-01 00:00:00|
|142|01-Jan-1971|1971-01-01|1971-01-01 00:00:00|
|143|01-Jan-1965|1965-01-01|1965-01-01 00:00:00|
|144|01-Jan-1988|198

### order by year

In [33]:
import pyspark.sql.functions as F

In [186]:
(movies_df2
 .withColumn('new_date', F.to_date(F.col('date'), 'dd-MMM-yyyy'))
 .withColumn('new_ts', F.to_timestamp(F.col('date'), 'dd-MMM-yyyy'))
 .where(F.col('new_date') < '1990-01-01')
 .orderBy(F.year('new_date'))
 .withColumn('year', F.year('new_date'))
 .withColumn('month', F.month('new_date'))
 .where(F.col('month') != 1)
 .show())


+----+-----------+----------+-------------------+----+-----+
|  id|       date|  new_date|             new_ts|year|month|
+----+-----------+----------+-------------------+----+-----+
|1198|28-Jun-1960|1960-06-28|1960-06-28 00:00:00|1960|    6|
|1149|20-Dec-1971|1971-12-20|1971-12-20 00:00:00|1971|   12|
|1373| 4-Feb-1971|1971-02-04|1971-02-04 00:00:00|1971|    2|
|1187|17-May-1975|1975-05-17|1975-05-17 00:00:00|1975|    5|
|1214|08-Mar-1976|1976-03-08|1976-03-08 00:00:00|1976|    3|
| 101|08-Mar-1981|1981-03-08|1981-03-08 00:00:00|1981|    3|
|1635|26-Apr-1986|1986-04-26|1986-04-26 00:00:00|1986|    4|
|1078|29-Mar-1988|1988-03-29|1988-03-29 00:00:00|1988|    3|
+----+-----------+----------+-------------------+----+-----+



## write to csv

### With `repartition`

In [210]:
(movies_df2
    .withColumn('year', F.year(F.to_date(F.col('date'), 'dd-MMM-yyyy')))
     .select('year')
     .distinct()
     .orderBy('year')
     .where('date != "null"')
     .repartition(1)
    .write
 .format('csv')
 .option('header', 'true')
 .save('movie_dates.csv')
)

### with `coalesce`

In [211]:
(movies_df2
    .withColumn('year', F.year(F.to_date(F.col('date'), 'dd-MMM-yyyy')))
     .select('year')
     .distinct()
     .orderBy('year')
     .where('date != "null"')
     .coalesce(1)
    .write
 .format('csv')
 .option('header', 'true')
 .save('movie_dates_coalesce.csv')
)

### with `pandas`

In [213]:
(movies_df2
    .withColumn('year', F.year(F.to_date(F.col('date'), 'dd-MMM-yyyy')))
     .select('year')
     .distinct()
     .orderBy('year')
     .where('date != "null"')
    .toPandas()
 .to_csv('movie_dates_pandas.csv', header=True, index=False)
)

## Aggregates

In [217]:
movies_df = (spark.read
             .csv(movie_fname, header=False, sep='|')
             .select('_c0', '_c1', '_c2', '_c4')
             .withColumnRenamed('_c0', 'id')
             .withColumnRenamed('_c1', 'title')
             .withColumnRenamed('_c2', 'date')
             .withColumnRenamed('_c4', 'url')
             .where('id > 30')
             .select('id', 'date', 'title', 'url')
)
movies_df.show()

+---+-----------+--------------------+--------------------+
| id|       date|               title|                 url|
+---+-----------+--------------------+--------------------+
| 31|01-Jan-1995| Crimson Tide (1995)|http://us.imdb.co...|
| 32|01-Jan-1994|        Crumb (1994)|http://us.imdb.co...|
| 33|01-Jan-1995|    Desperado (1995)|http://us.imdb.co...|
| 34|01-Jan-1995|Doom Generation, ...|http://us.imdb.co...|
| 35|01-Jan-1995|Free Willy 2: The...|http://us.imdb.co...|
| 36|01-Jan-1995|     Mad Love (1995)|http://us.imdb.co...|
| 37|01-Jan-1994|        Nadja (1994)|http://us.imdb.co...|
| 38|01-Jan-1995|     Net, The (1995)|http://us.imdb.co...|
| 39|01-Jan-1995| Strange Days (1995)|http://us.imdb.co...|
| 40|01-Jan-1995|To Wong Foo, Than...|http://us.imdb.co...|
| 41|01-Jan-1995|Billy Madison (1995)|http://us.imdb.co...|
| 42|01-Jan-1994|       Clerks (1994)|http://us.imdb.co...|
| 43|01-Jan-1994|   Disclosure (1994)|http://us.imdb.co...|
| 44|01-Jan-1994|Dolores Claiborne...|ht

In [240]:
(movies_df
 .where(F.col('date').isNotNull())
 .withColumn('new_date', F.to_date('date', 'dd-MMM-yyyy'))
 .withColumn('year', F.year(F.to_date(F.col('date'), 'dd-MMM-yyyy')))
 .groupBy('year')
 .count()
 .orderBy('count', ascending=False)
 .select(F.sum('count'), F.avg('count'), F.stddev('count'), F.min('count'), F.max('count'))
 .show())

+----------+------------------+------------------+----------+----------+
|sum(count)|        avg(count)|stddev_samp(count)|min(count)|max(count)|
+----------+------------------+------------------+----------+----------+
|      1651|23.253521126760564| 62.53769567454248|         1|       347|
+----------+------------------+------------------+----------+----------+



In [241]:
(movies_df
 .where(F.col('date').isNotNull())
 .withColumn('new_date', F.to_date('date', 'dd-MMM-yyyy'))
 .withColumn('year', F.year(F.to_date(F.col('date'), 'dd-MMM-yyyy')))
 .groupBy('year')
 .count()
 .orderBy('count', ascending=False)
 .select(F.sum('count'), F.avg('count'), F.stddev('count'), F.min('count'), F.max('count'))
 .printSchema())

root
 |-- sum(count): long (nullable = true)
 |-- avg(count): double (nullable = true)
 |-- stddev_samp(count): double (nullable = true)
 |-- min(count): long (nullable = true)
 |-- max(count): long (nullable = true)

