In [9]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
import pyspark
from pyspark.sql.types import *
from pyspark.sql.functions import avg
from pyspark.sql import Row
from pyspark.sql.functions import col, lit

In [2]:
# Start (or reuse) a Spark session that runs in local mode with the
# "local[*]" master setting, registered under the application name "Chapter3".
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Chapter3")
    .getOrCreate()
)

# Handle to the session's SparkContext, needed for the low-level RDD API.
sc = spark.sparkContext

In [None]:
# Create an RDD of (name, age) tuples
dataRDD = sc.parallelize([('Brooke', 20), ('Denny', 31), ('Jules', 30),
                          ('TD', 35), ('Brooke', 25)])

# Compute the average age per name with low-level RDD transformations:
#   1. map         -> (name, (age, 1))           pair each age with a count of 1
#   2. reduceByKey -> (name, (sum_ages, count))  add up ages and counts per name
#   3. map         -> (name, average_age)        keep the name next to its average
# The name must be carried through the last step; without it the result is a
# list of bare numbers that cannot be matched back to a person.
agesRDD = (dataRDD
           .map(lambda x: (x[0], (x[1], 1)))
           .reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))
           .map(lambda x: (x[0], x[1][0] / x[1][1])))

# Backward-compatible alias for the original (misspelled) variable name
agesRRDD = agesRDD

In [None]:
from pyspark.sql.functions import avg

# Build a two-column DataFrame (name, age) from in-memory tuples
data_df = spark.createDataFrame(
    [("Brooke", 20), ("Denny", 31), ("Jules", 30), ("TD", 35), ("Brooke", 25)],
    ["name", "age"],
)

# Group rows that share a name and take the mean of each group's ages
avg_df = data_df.groupBy("name").agg(avg("age"))

# Print the aggregated result
avg_df.show()

In [None]:
from pyspark.sql.types import *

# Programmatic schema for book records; every field is declared non-nullable.
# `pages` is typed as an integer so that this definition agrees with the DDL
# form of the same schema ('author STRING, title STRING, pages INT').
schema = StructType([StructField('author', StringType(), False),
                     StructField('title', StringType(), False),
                     StructField('pages', IntegerType(), False)])

In [None]:
# Schema written as a DDL string: comma-separated `name TYPE` pairs
schema = "author STRING, title STRING, pages INT"

In [None]:
# Define schema for our data using DDL
# NOTE(review): `Campaings` looks like a misspelling of "Campaigns"; it is left
# unchanged because it is the column name that other code would reference.
schema = "`Id` INT,`First` STRING, `Last` STRING, `Url` STRING, `Published` STRING, `Hits` INT, `Campaings` ARRAY<STRING>"

# Create our static data: one inner list per author, values in schema column order
data = [[1, "Jules", "Damji", "https://tinyurl.1", "1/4/2016", 4535, ["twitter","LinkedIn"]],
        [2, "Brooke","Wenig", "https://tinyurl.2", "5/5/2018", 8908, ["twitter","LinkedIn"]],
        [3, "Denny", "Lee", "https://tinyurl.3", "6/7/2019", 7659, ["web", "twitter", "FB", "LinkedIn"]],
        [4, "Tathagata", "Das", "https://tinyurl.4", "5/12/2018", 10568,["twitter", "FB"]],
        [5, "Matei","Zaharia", "https://tinyurl.5", "5/14/2014", 40578, ["web", "twitter", "FB", "LinkedIn"]],
        [6, "Reynold", "Xin", "https://tinyurl.6", "3/2/2015", 25568,["twitter", "LinkedIn"]]
       ]

# Main program
if __name__ == '__main__':
    # Create a SparkSession (getOrCreate hands back an existing session if there is one)
    spark =   (SparkSession
              .builder
              .appName('Example-3.6')
              .getOrCreate())
    # Create a DataFrame using the schema defined above
    blogs_df = spark.createDataFrame(data, schema)
    # Show the DataFrame: it should reflect our table above
    blogs_df.show()
    # Print the schema used by Spark to process the DF.
    # printSchema() writes the schema tree itself and returns None, so it is
    # called directly; wrapping it in print() would emit an extra "None" line.
    blogs_df.printSchema()
    

In [None]:
# Inspect the schema object attached to blogs_df; as the cell's last
# expression, its value is what the notebook displays.
blogs_df.schema

In [None]:
from pyspark.sql import Row

# Build a Row from positional values (no field names are given).
# NOTE(review): the hit count precedes the publication date here, unlike the
# `Published`, `Hits` order of the blogs schema; confirm that is intended.
blog_row = Row(
    6,
    "Reynold",
    "Xin",
    "https://tinyurl.6",
    255568,
    "3/2/2015",
    ["twitter", "Linkedin"],
)

# Fields are read by zero-based index; index 1 is the second value
blog_row[1]

In [None]:
# Build a small DataFrame from positional Row objects; the column names are
# passed separately as the second argument.
# The first author's name is spelled "Matei Zaharia", consistent with the
# blog data defined earlier in this notebook.
rows = [Row('Matei Zaharia', 'CA'), Row('Reynold Xin', 'CA')]
authors_df = spark.createDataFrame(rows, ['Authors', 'State'])

In [None]:
# Print the contents of authors_df as a text table
authors_df.show()

In [4]:
# In Python, define a schema
from pyspark.sql.types import *

# Programmatic schema for the fire-calls CSV, built one field at a time.
# StructType.add(name, type, nullable) appends a field and returns the schema,
# which is what allows the calls to be chained. Every column is nullable.
fire_schema = (StructType()
               .add('CallNumber', IntegerType(), True)
               .add('UnitID', StringType(), True)
               .add('IncidentNumber', IntegerType(), True)
               .add('CallType', StringType(), True)
               .add('CallDate', StringType(), True)
               .add('WatchDate', StringType(), True)
               .add('CallFinalDisposition', StringType(), True)
               .add('AvailableDtTm', StringType(), True)
               .add('Address', StringType(), True)
               .add('City', StringType(), True)
               .add('Zipcode', IntegerType(), True)
               .add('Battalion', StringType(), True)
               .add('StationArea', StringType(), True)
               .add('Box', StringType(), True)
               .add('OriginalPriority', StringType(), True)
               .add('Priority', StringType(), True)
               .add('FinalPriority', IntegerType(), True)
               .add('ALSUnit', BooleanType(), True)
               .add('CallTypeGroup', StringType(), True)
               .add('NumAlarms', IntegerType(), True)
               .add('UnitType', StringType(), True)
               .add('UnitSequenceInCallDispatch', IntegerType(), True)
               .add('FirePreventionDistrict', StringType(), True)
               .add('SupervisorDistrict', StringType(), True)
               .add('Neighborhood', StringType(), True)
               .add('Location', StringType(), True)
               .add('RowID', StringType(), True)
               .add('Delay', FloatType(), True))

# Read the CSV through the DataFrameReader interface, applying the explicit
# schema above and treating the first line of the file as a header
sf_fire_file = "data/sf-fire-calls.csv"
fire_df = spark.read.csv(sf_fire_file, schema=fire_schema, header=True)
# To save as a Parquet file (DataFrame.write is a writer object, not a function)
#parquet_path = 'data/parquet/firep'
#fire_df.write.format('parquet').save(parquet_path)


In [None]:
# To save a table 
# parquet_table = 'Fire_table'
# fire_df.write.format('parquet').saveAsTable(parquet_table)

In [10]:
# Keep three columns and only the rows whose call type is not 'Medical Incident'
few_fire_df = (
    fire_df
    .select("IncidentNumber", "AvailableDtTm", "CallType")
    .filter(col("CallType") != "Medical Incident")
)
# Print the first five rows without truncating long cell values
few_fire_df.show(n=5, truncate=False)

+--------------+----------------------+--------------+
|IncidentNumber|AvailableDtTm         |CallType      |
+--------------+----------------------+--------------+
|2003235       |01/11/2002 01:51:44 AM|Structure Fire|
|2003250       |01/11/2002 04:16:46 AM|Vehicle Fire  |
|2003259       |01/11/2002 06:01:58 AM|Alarms        |
|2003279       |01/11/2002 08:03:26 AM|Structure Fire|
|2003301       |01/11/2002 09:46:44 AM|Alarms        |
+--------------+----------------------+--------------+
only showing top 5 rows



In [12]:
# NOTE(review): this wildcard import also binds names such as sum/min/max,
# which shadow the Python builtins for the rest of the notebook.
from pyspark.sql.functions import *

# Count how many different call types occur, ignoring rows without a call type
call_fire = (
    fire_df
    .select("CallType")
    .filter(col("CallType").isNotNull())
    .agg(countDistinct("CallType").alias("DistinctCallTypes"))
)
call_fire.show()

+-----------------+
|DistinctCallTypes|
+-----------------+
|               30|
+-----------------+



In [13]:
# Collect the distinct, non-null call types
fire_call = (
    fire_df
    .select("CallType")
    .filter(col("CallType").isNotNull())
    .distinct()
)
# Print ten of them without truncating long names
fire_call.show(n=10, truncate=False)

+-----------------------------+
|CallType                     |
+-----------------------------+
|Elevator / Escalator Rescue  |
|Marine Fire                  |
|Aircraft Emergency           |
|Administrative               |
|Alarms                       |
|Odor (Strange / Unknown)     |
|Citizen Assist / Service Call|
|HazMat                       |
|Watercraft in Distress       |
|Explosion                    |
+-----------------------------+
only showing top 10 rows



In [16]:
# Same data as fire_df, with the 'Delay' column renamed to 'ResponseDelayedinMins'
new_fire_df = fire_df.withColumnRenamed("Delay", "ResponseDelayedinMins")

# Print five values of the renamed column that are greater than 5
(
    new_fire_df
    .select("ResponseDelayedinMins")
    .filter(col("ResponseDelayedinMins") > 5)
    .show(n=5, truncate=False)
)

+---------------------+
|ResponseDelayedinMins|
+---------------------+
|5.35                 |
|6.25                 |
|5.2                  |
|5.6                  |
|7.25                 |
+---------------------+
only showing top 5 rows



In [18]:
# Parse the three string date columns into timestamp columns under new names,
# then remove the original string columns in a single drop() call.
fire_ts_df = (
    new_fire_df
    .withColumn("IncidentDate", to_timestamp(col("CallDate"), "MM/dd/yyyy"))
    .withColumn("OnWatchDate", to_timestamp(col("WatchDate"), "MM/dd/yyyy"))
    .withColumn("AvailableDtTS", to_timestamp(col("AvailableDtTm"), "MM/dd/yyyy hh:mm:ss a"))
    .drop("CallDate", "WatchDate", "AvailableDtTm")
)

# Print the first five rows of the converted columns, untruncated
(
    fire_ts_df
    .select("IncidentDate", "OnWatchDate", "AvailableDtTS")
    .show(n=5, truncate=False)
)

+-------------------+-------------------+-------------------+
|IncidentDate       |OnWatchDate        |AvailableDtTS      |
+-------------------+-------------------+-------------------+
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 01:51:44|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 03:01:18|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 02:39:50|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 04:16:46|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 06:01:58|
+-------------------+-------------------+-------------------+
only showing top 5 rows



In [19]:
# List the distinct years found in IncidentDate, sorted by year
(
    fire_ts_df
    .select(year(col("IncidentDate")))
    .distinct()
    .orderBy(year(col("IncidentDate")))
    .show()
)

+------------------+
|year(IncidentDate)|
+------------------+
|              2000|
|              2001|
|              2002|
|              2003|
|              2004|
|              2005|
|              2006|
|              2007|
|              2008|
|              2009|
|              2010|
|              2011|
|              2012|
|              2013|
|              2014|
|              2015|
|              2016|
|              2017|
|              2018|
+------------------+



In [20]:
# Count rows per non-null call type and print the ten largest counts
(
    fire_ts_df
    .select("CallType")
    .filter(col("CallType").isNotNull())
    .groupBy("CallType")
    .count()
    .orderBy(col("count").desc())
    .show(10, False)
)

+-------------------------------+------+
|CallType                       |count |
+-------------------------------+------+
|Medical Incident               |113794|
|Structure Fire                 |23319 |
|Alarms                         |19406 |
|Traffic Collision              |7013  |
|Citizen Assist / Service Call  |2524  |
|Other                          |2166  |
|Outside Fire                   |2094  |
|Vehicle Fire                   |854   |
|Gas Leak (Natural and LP Gases)|764   |
|Water Rescue                   |755   |
+-------------------------------+------+
only showing top 10 rows



In [21]:
import pyspark.sql.functions as F

# Whole-table aggregates: total number of alarms, plus the mean, minimum and
# maximum of the response delay. The functions are reached through the F
# namespace rather than by bare name.
(
    fire_ts_df
    .select(
        F.sum("NumAlarms"),
        F.avg("ResponseDelayedinMins"),
        F.min("ResponseDelayedinMins"),
        F.max("ResponseDelayedinMins"),
    )
    .show()
)

+--------------+--------------------------+--------------------------+--------------------------+
|sum(NumAlarms)|avg(ResponseDelayedinMins)|min(ResponseDelayedinMins)|max(ResponseDelayedinMins)|
+--------------+--------------------------+--------------------------+--------------------------+
|        176170|         3.892364154521585|               0.016666668|                   1844.55|
+--------------+--------------------------+--------------------------+--------------------------+



In [22]:
from pyspark.sql import Row

# A positional Row holding an int, a bool, a string and a None value
row = Row(350, True, "Learning Spark 2E", None)


In [23]:
# First positional field of `row` (zero-based index)
row[0]

350

In [24]:
# Second positional field of `row`
row[1]

True

In [25]:
# Third positional field of `row`
row[2]

'Learning Spark 2E'