# SparkR Sample - USA Daily Temperatures

In [1]:
# Show the current SPARK_HOME value (an empty string means it is not set)
Sys.getenv("SPARK_HOME")

In [2]:
# Fall back to a local Spark installation when SPARK_HOME is empty or unset;
# change the path below to match your own machine
if (!nzchar(Sys.getenv("SPARK_HOME"))) {
  Sys.setenv(SPARK_HOME = "/Users/skalathur/MyApps/spark")
}

In [3]:
# Set SPARK_LOCAL_IP to "localhost" for this R session
Sys.setenv(SPARK_LOCAL_IP = "localhost")

In [4]:
# Attach SparkR from the R library directory under SPARK_HOME
# (wait until loading finishes before running the next cell)
library(SparkR, lib.loc = file.path(Sys.getenv("SPARK_HOME"), "R", "lib"))


Attaching package: ‘SparkR’

The following objects are masked from ‘package:stats’:

    cov, filter, lag, na.omit, predict, sd, var, window

The following objects are masked from ‘package:base’:

    as.data.frame, colnames, colnames<-, drop, endsWith, intersect,
    rank, rbind, sample, startsWith, subset, summary, transform, union



In [5]:
# Start the Spark Session, wait until it starts.
# The session uses the "local[*]" master and sets spark.driver.memory to 2g.
sparkR.session(master = "local[*]", sparkConfig = list(spark.driver.memory = "2g"))

Spark package found in SPARK_HOME: /Users/skalathur/MyApps/spark


Launching java with spark-submit command /Users/skalathur/MyApps/spark/bin/spark-submit   --driver-memory "2g" sparkr-shell /var/folders/s3/hy6_p79n3w1fw802t6ps40qr0000gp/T//Rtmpi1F4En/backend_port143353f517b08 


Java ref type org.apache.spark.sql.SparkSession id 1 

In [6]:
# Path of the CSV file to analyse (adjust to where the dataset lives locally)
inputFile <- "/temp/datasets/usa_daily_avg_temps.csv"

In [7]:
# Load the CSV into a SparkDataFrame, using the header row for column names
# and letting Spark infer the column types
usaDailyTemps <- read.df(
  path = inputFile,
  source = "csv",
  header = "true",
  inferSchema = "true"
)

# Printing the object lists its column names and types
usaDailyTemps

SparkDataFrame[state:string, city:string, month:int, day:int, year:int, avgtemp:double]

In [8]:
# Print the schema as a tree: column name, type and nullability
printSchema(usaDailyTemps)

root
 |-- state: string (nullable = true)
 |-- city: string (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- avgtemp: double (nullable = true)


In [9]:
# Total number of rows in the dataset
count(usaDailyTemps)

In [10]:
# Peek at the first few rows
head(usaDailyTemps)

state,city,month,day,year,avgtemp
Alabama,Birmingham,1,1,1995,50.7
Alabama,Birmingham,1,1,1996,56.8
Alabama,Birmingham,1,1,1997,60.9
Alabama,Birmingham,1,1,1998,35.6
Alabama,Birmingham,1,1,1999,41.0
Alabama,Birmingham,1,1,2000,59.0


### Aggregate to find the maximum of avgtemp

In [11]:
# Overall maximum of avgtemp; with no explicit name the result column is
# named after the aggregate expression
maxAvgTemp <- agg(usaDailyTemps, max(usaDailyTemps$avgtemp))
maxAvgTemp

SparkDataFrame[max(avgtemp):double]

In [12]:
# Number of rows in the aggregated result (a single row is expected)
count(maxAvgTemp)

In [13]:
# Collect the aggregated SparkDataFrame into a local R data frame
collect(maxAvgTemp)

max(avgtemp)
107.5


In [14]:
# Same aggregate as before, but with an explicit result column name (MaxValue)
maxAvgTemp <- agg(usaDailyTemps, MaxValue = max(usaDailyTemps$avgtemp))
maxAvgTemp

SparkDataFrame[MaxValue:double]

In [15]:
# Keep the collected maximum in a local data frame so it can be reused below
localDf <- collect(maxAvgTemp)
localDf

MaxValue
107.5


In [16]:
# Keep only the rows whose avgtemp equals the maximum held in localDf
maxData <- filter(
  usaDailyTemps,
  usaDailyTemps$avgtemp == localDf[["MaxValue"]][1]
)
maxData

SparkDataFrame[state:string, city:string, month:int, day:int, year:int, avgtemp:double]

In [17]:
# Collect the matching rows into a local R data frame
collect(maxData)

state,city,month,day,year,avgtemp
Arizona,Yuma,7,22,2006,107.5


### Aggregate to find the maximum of avgtemp, grouped by year

In [18]:
# Per-year maximum of avgtemp, exposed as the MaxValue column.
# NOTE(review): the schema lists this column as `year`; the capitalised
# `Year` reference presumably relies on case-insensitive column
# resolution — confirm.
maxTempByYear <- agg(
  group_by(usaDailyTemps, usaDailyTemps$Year),
  MaxValue = max(usaDailyTemps$avgtemp)
)
maxTempByYear

SparkDataFrame[Year:int, MaxValue:double]

In [19]:
# One row per group, so this is the number of distinct years
count(maxTempByYear)

In [20]:
# Collect the per-year maxima; without an explicit sort the rows come back
# in no particular order
collect(maxTempByYear)

Year,MaxValue
2003,105.8
2007,104.4
2015,105.1
2006,107.5
2013,104.9
1997,100.6
2014,103.8
2004,101.0
1996,104.3
1998,103.0


In [21]:
# arrange() returns another SparkDataFrame, sorted by Year
arrange(maxTempByYear, maxTempByYear$Year)

SparkDataFrame[Year:int, MaxValue:double]

In [22]:
# Sort by Year, then collect the sorted rows locally
collect(arrange(maxTempByYear, maxTempByYear$Year))

Year,MaxValue
1995,104.3
1996,104.3
1997,100.6
1998,103.0
1999,100.1
2000,101.6
2001,104.4
2002,102.6
2003,105.8
2004,101.0


### Aggregate to find the maximum of avgtemp, grouped by state

In [23]:
# Per-state maximum of avgtemp, exposed as the MaxValue column.
# NOTE(review): the schema lists this column as `state`; the capitalised
# `State` reference presumably relies on case-insensitive column
# resolution — confirm.
maxTempByState <- agg(
  group_by(usaDailyTemps, usaDailyTemps$State),
  MaxValue = max(usaDailyTemps$avgtemp)
)
maxTempByState

SparkDataFrame[State:string, MaxValue:double]

In [24]:
# One row per group, so this is the number of distinct states
count(maxTempByState)

In [25]:
# Collect the per-state maxima; without an explicit sort the rows come back
# in no particular order
collect(maxTempByState)

State,MaxValue
Utah,92.2
Hawaii,87.2
Minnesota,92.0
Ohio,91.2
Arkansas,100.7
Oregon,97.3
Texas,98.5
North Dakota,91.7
Pennsylvania,92.9
Connecticut,89.8


In [26]:
# arrange() returns another SparkDataFrame, sorted by State
arrange(maxTempByState, maxTempByState$State)

SparkDataFrame[State:string, MaxValue:double]

In [27]:
# Sort by State, then collect the sorted rows locally
collect(arrange(maxTempByState, maxTempByState$State))

State,MaxValue
Alabama,91.5
Alaska,79.5
Arizona,107.5
Arkansas,100.7
California,102.6
Colorado,94.7
Connecticut,89.8
Delaware,89.7
Florida,92.8
Georgia,97.7


### Aggregate to find the number of entries, grouped by state

In [28]:
# Number of entries per state, stored in a column named count
stateCounts <- agg(
  group_by(usaDailyTemps, usaDailyTemps$state),
  count = n(usaDailyTemps$state)
)
stateCounts

SparkDataFrame[state:string, count:bigint]

In [29]:
# Sort by count in descending order so the states with the most entries
# come first, then collect locally
collect(arrange(stateCounts, desc(stateCounts$count)))

state,count
Texas,106736
Ohio,53368
Florida,51495
Pennsylvania,43871
Michigan,38120
California,38120
New York,38120
Oregon,30496
Illinois,30496
Georgia,30496


In [None]:
### Aggregate to find the number of entries, grouped by state and city

In [30]:
# Number of entries for every (state, city) pair, stored in a column
# named count
stateCityCounts <- agg(
  group_by(usaDailyTemps, usaDailyTemps$state, usaDailyTemps$city),
  count = n(usaDailyTemps$state)
)
stateCityCounts

SparkDataFrame[state:string, city:string, count:bigint]

In [31]:
# Sort by state only; the order of cities within a state is left unspecified
collect(arrange(stateCityCounts, asc(stateCityCounts$state)))

state,city,count
Alabama,Huntsville,7624
Alabama,Birmingham,7624
Alabama,Montgomery,7624
Alabama,Mobile,7624
Alaska,Fairbanks,7624
Alaska,Anchorage,7624
Alaska,Juneau,7624
Arizona,Tucson,7624
Arizona,Phoenix,7624
Arizona,Yuma,4380


In [32]:
# Sort by state and then by city so the row order is fully determined
collect(arrange(stateCityCounts, asc(stateCityCounts$state),
                asc(stateCityCounts$city)))

state,city,count
Alabama,Birmingham,7624
Alabama,Huntsville,7624
Alabama,Mobile,7624
Alabama,Montgomery,7624
Alaska,Anchorage,7624
Alaska,Fairbanks,7624
Alaska,Juneau,7624
Arizona,Flagstaff,3574
Arizona,Phoenix,7624
Arizona,Tucson,7624


### Number of cities for each state in the dataset

In [33]:
# stateCityCounts holds one row per (state, city) pair, so counting its rows
# per state gives the number of cities recorded for each state
collect(
  agg(
    group_by(stateCityCounts, stateCityCounts$state),
    count = n(stateCityCounts$state)
  )
)

state,count
Utah,1
Hawaii,1
Minnesota,2
Ohio,7
Oregon,4
Arkansas,2
Texas,14
North Dakota,2
Pennsylvania,6
Connecticut,2


### Create a subset SparkDataFrame for Boston

In [34]:
# Restrict the dataset to the rows whose city is Boston
bostonDailyTemps <- subset(
  usaDailyTemps,
  subset = usaDailyTemps$city == "Boston"
)
bostonDailyTemps

SparkDataFrame[state:string, city:string, month:int, day:int, year:int, avgtemp:double]

In [35]:
# Number of rows in the Boston subset
count(bostonDailyTemps)

In [36]:
# Mean of avgtemp for Boston per year, exposed as the Average column
bostonAvgTempsByYear <- agg(
  group_by(bostonDailyTemps, bostonDailyTemps$Year),
  Average = avg(bostonDailyTemps$avgtemp)
)
bostonAvgTempsByYear

SparkDataFrame[Year:int, Average:double]

In [37]:
# Sort the yearly Boston averages by Year, then collect them locally
collect(
  arrange(bostonAvgTempsByYear, bostonAvgTempsByYear$Year)
  )

Year,Average
1995,51.32027
1996,47.71749
1997,50.83863
1998,51.51562
1999,52.33945
2000,50.36148
2001,52.42822
2002,50.41205
2003,49.73014
2004,50.52514


In [38]:
# Mean of avgtemp for Boston per calendar month (across all years),
# exposed as the Average column
bostonAvgTempsByMonth <- agg(
  group_by(bostonDailyTemps, bostonDailyTemps$Month),
  Average = avg(bostonDailyTemps$avgtemp)
)
bostonAvgTempsByMonth

SparkDataFrame[Month:int, Average:double]

In [39]:
# Sort the monthly Boston averages by Month, then collect them locally
collect(
  arrange(bostonAvgTempsByMonth, bostonAvgTempsByMonth$Month)
)

Month,Average
1,29.76667
2,31.47032
3,37.57604
4,47.08413
5,57.57803
6,66.10714
7,73.55038
8,71.68909
9,65.05762
10,54.73456


In [40]:
# Mean of avgtemp for Boston for every (year, month) pair, exposed as the
# Average column
bostonAvgTempsByYearAndMonth <- agg(
  group_by(
    bostonDailyTemps,
    bostonDailyTemps$Year,
    bostonDailyTemps$Month
  ),
  Average = avg(bostonDailyTemps$avgtemp)
)
bostonAvgTempsByYearAndMonth

SparkDataFrame[Year:int, Month:int, Average:double]

In [41]:
# Sort chronologically (Year first, then Month) and collect locally
collect(
  arrange(
    bostonAvgTempsByYearAndMonth,
    bostonAvgTempsByYearAndMonth$Year,
    bostonAvgTempsByYearAndMonth$Month
  )
)

Year,Month,Average
1995,1,34.51935
1995,2,28.57500
1995,3,38.03871
1995,4,45.42000
1995,5,56.69677
1995,6,68.47667
1995,7,75.57419
1995,8,72.52581
1995,9,62.93667
1995,10,58.07742


In [42]:
# Keep only the year column of the Boston subset
bostonYears <- select(bostonDailyTemps, "year")
bostonYears

SparkDataFrame[year:int]

In [43]:
# Drop duplicate rows so each year appears once
distinctBostonYears <- distinct(bostonYears)
distinctBostonYears

SparkDataFrame[year:int]

In [44]:
# Collect the distinct years into a local data frame; the rows come back
# unsorted, so they are ordered locally in the next cell
yearsDF <- collect(distinct(bostonYears))
yearsDF

year
2003
2007
2015
2006
2013
1997
2014
2004
1996
1998


In [45]:
# Sorted vector of the distinct years (ascending)
yearsDF$year[order(yearsDF$year)]

In [46]:
# Stop the SparkSession now that the analysis is finished
sparkR.session.stop()