# SparkR Sample - USA Daily Temperatures

In [None]:
# Display the current value of the SPARK_HOME environment variable
# (an empty string means it is not set).
Sys.getenv("SPARK_HOME")

In [None]:
# When SPARK_HOME is unset or empty, fall back to a machine-specific Spark
# install location. Edit the path below to match your own environment.
if (!nzchar(Sys.getenv("SPARK_HOME"))) {
  Sys.setenv(SPARK_HOME = "/Users/skalathur/MyApps/spark")
}

In [None]:
# Set SPARK_LOCAL_IP to localhost for this R process before Spark is started.
Sys.setenv(SPARK_LOCAL_IP = "localhost")

In [None]:
# Attach SparkR from the R library directory under SPARK_HOME (R/lib).
# Loading may take a moment; wait for it to finish before continuing.
sparkRLibDir <- file.path(Sys.getenv("SPARK_HOME"), "R", "lib")
library(SparkR, lib.loc = sparkRLibDir)

In [None]:
# Start a Spark session with master "local[*]" and 1g of driver memory.
# Wait until the session is up before running the cells below.
sparkR.session(
  master = "local[*]",
  sparkConfig = list(spark.driver.memory = "1g")
)

In [None]:
# Path of the CSV file that is read into a SparkDataFrame below.
# Adjust it to wherever the dataset lives on your machine.
inputFile <- "/temp/datasets/usa_daily_avg_temps.csv"

In [None]:
# Load the CSV into a SparkDataFrame. The first line is treated as the header
# and column types are inferred from the data.
usaDailyTemps <- read.df(
  path = inputFile,
  source = "csv",
  header = "true",
  inferSchema = "true"
)

# Echo the SparkDataFrame so the notebook shows it.
usaDailyTemps

In [None]:
# Print the column names and types of the loaded SparkDataFrame.
printSchema(usaDailyTemps)

In [None]:
# Number of rows in the full dataset.
count(usaDailyTemps)

In [None]:
# Peek at the first few rows of the dataset.
head(usaDailyTemps)

### Aggregate to find the maximum of avgtemp

In [None]:
# Largest avgtemp across the whole dataset. The aggregate column is not given
# an explicit name here.
maxAvgTemp <- summarize(usaDailyTemps, max(usaDailyTemps[["avgtemp"]]))
maxAvgTemp

In [None]:
# Number of rows in the aggregate result.
count(maxAvgTemp)

In [None]:
# Collect the aggregate into a local R data frame so its value is displayed.
collect(maxAvgTemp)

In [None]:
# Recompute the overall maximum, this time naming the result column MaxValue
# so it can be referenced by name after collecting.
maxAvgTemp <- summarize(
  usaDailyTemps,
  MaxValue = max(usaDailyTemps[["avgtemp"]])
)
maxAvgTemp

In [None]:
# Collect the named aggregate into a local R data frame; its MaxValue column
# is used by the filter in the next cell.
localDf <- collect(maxAvgTemp)
localDf

In [None]:
# Keep only the rows whose avgtemp equals the overall maximum that was
# collected into localDf (first row of its MaxValue column).
maxData <- filter(
  usaDailyTemps,
  usaDailyTemps[["avgtemp"]] == localDf[["MaxValue"]][1]
)
maxData

In [None]:
# Collect the rows holding the maximum avgtemp into a local R data frame.
collect(maxData)

### Aggregate to find the maximum of avgtemp grouping by Year

In [None]:
# Maximum avgtemp per Year, stored in a column named MaxValue.
maxTempByYear <- summarize(
  groupBy(usaDailyTemps, usaDailyTemps[["Year"]]),
  MaxValue = max(usaDailyTemps[["avgtemp"]])
)
maxTempByYear

In [None]:
# Number of result rows, i.e. one per distinct Year group.
count(maxTempByYear)

In [None]:
# Collect the per-year maxima into a local R data frame (no ordering applied).
collect(maxTempByYear)

In [None]:
# Sort the per-year maxima by Year. Nothing is collected here; the next cell
# collects the sorted result.
arrange(maxTempByYear, maxTempByYear$Year)

In [None]:
# Sort the per-year maxima by Year and collect them into a local data frame.
collect(arrange(maxTempByYear, maxTempByYear$Year))

### Aggregate to find the maximum of avgtemp grouping by State

In [None]:
# Maximum avgtemp per State, stored in a column named MaxValue.
maxTempByState <- summarize(
  groupBy(usaDailyTemps, usaDailyTemps[["State"]]),
  MaxValue = max(usaDailyTemps[["avgtemp"]])
)
maxTempByState

In [None]:
# Number of result rows, i.e. one per distinct State group.
count(maxTempByState)

In [None]:
# Collect the per-state maxima into a local R data frame (no ordering applied).
collect(maxTempByState)

In [None]:
# Sort the per-state maxima by State. Nothing is collected here; the next cell
# collects the sorted result.
arrange(maxTempByState, maxTempByState$State)

In [None]:
# Sort the per-state maxima by State and collect them into a local data frame.
collect(arrange(maxTempByState, maxTempByState$State))

### Aggregate to find the number of entries grouping by State

In [None]:
# Number of rows per state, stored in a column named count.
# NOTE(review): the column is spelled `state` here but `State` in the
# max-by-state cell; this presumably relies on Spark resolving column names
# case-insensitively -- confirm against the actual schema.
stateCounts <- summarize(
  groupBy(usaDailyTemps, usaDailyTemps[["state"]]),
  count = n(usaDailyTemps[["state"]])
)
stateCounts

In [None]:
# Collect the per-state row counts, largest count first.
collect(arrange(stateCounts, desc(stateCounts$count)))

In [None]:
### Aggregate to find the number of entries grouping by State and City

In [None]:
# Number of rows for every (state, city) combination, stored in a column
# named count.
stateCityCounts <- summarize(
  groupBy(
    usaDailyTemps,
    usaDailyTemps[["state"]],
    usaDailyTemps[["city"]]
  ),
  count = n(usaDailyTemps[["state"]])
)
stateCityCounts

In [None]:
# Collect the (state, city) counts sorted by state in ascending order.
# Cities within a state are not explicitly ordered here.
collect(arrange(stateCityCounts, asc(stateCityCounts$state)))

In [None]:
# Collect the (state, city) counts sorted by state, then by city, both in
# ascending order.
collect(
  arrange(
    stateCityCounts,
    asc(stateCityCounts[["state"]]),
    asc(stateCityCounts[["city"]])
  )
)

### Number of cities for each state in the dataset

In [None]:
# stateCityCounts has one row per (state, city) group, so counting its rows
# per state gives the number of cities recorded for each state.
collect(
  summarize(
    groupBy(stateCityCounts, stateCityCounts[["state"]]),
    count = n(stateCityCounts[["state"]])
  )
)

### Create a subset SparkDataFrame for Boston

In [None]:
# Restrict the dataset to rows whose city column is exactly "Boston".
bostonDailyTemps <- subset(
  usaDailyTemps,
  usaDailyTemps[["city"]] == "Boston"
)
bostonDailyTemps

In [None]:
# Number of rows in the Boston subset.
count(bostonDailyTemps)

In [None]:
# Mean avgtemp for Boston per Year, stored in a column named Average.
bostonAvgTempsByYear <- summarize(
  groupBy(bostonDailyTemps, bostonDailyTemps[["Year"]]),
  Average = avg(bostonDailyTemps[["avgtemp"]])
)
bostonAvgTempsByYear

In [None]:
# Collect Boston's yearly averages into a local data frame, ordered by Year.
collect(arrange(bostonAvgTempsByYear, bostonAvgTempsByYear[["Year"]]))

In [None]:
# Mean avgtemp for Boston per Month (pooled across all years), stored in a
# column named Average.
bostonAvgTempsByMonth <- summarize(
  groupBy(bostonDailyTemps, bostonDailyTemps[["Month"]]),
  Average = avg(bostonDailyTemps[["avgtemp"]])
)
bostonAvgTempsByMonth

In [None]:
# Collect Boston's monthly averages into a local data frame, ordered by Month.
collect(arrange(bostonAvgTempsByMonth, bostonAvgTempsByMonth[["Month"]]))

In [None]:
# Mean avgtemp for Boston for every (Year, Month) combination, stored in a
# column named Average.
bostonAvgTempsByYearAndMonth <- summarize(
  groupBy(
    bostonDailyTemps,
    bostonDailyTemps[["Year"]],
    bostonDailyTemps[["Month"]]
  ),
  Average = avg(bostonDailyTemps[["avgtemp"]])
)
bostonAvgTempsByYearAndMonth

In [None]:
# Collect Boston's year/month averages into a local data frame, ordered by
# Year and then by Month.
collect(
  arrange(
    bostonAvgTempsByYearAndMonth,
    bostonAvgTempsByYearAndMonth[["Year"]],
    bostonAvgTempsByYearAndMonth[["Month"]]
  )
)

In [None]:
# Project the Boston subset down to its year column only.
# NOTE(review): the column is spelled `year` here but `Year` in the grouping
# cells; presumably Spark resolves it case-insensitively -- confirm.
bostonYears <- select(bostonDailyTemps, "year")
bostonYears

In [None]:
# Drop duplicate rows so each year present in the Boston data appears once.
distinctBostonYears <- distinct(bostonYears)
distinctBostonYears

In [None]:
# Collect the distinct Boston years into a local R data frame.
# The distinct() call is recomputed here rather than reusing
# distinctBostonYears.
yearsDF <- collect(distinct(bostonYears))
yearsDF

In [None]:
# Show the collected years in ascending order, sorted locally in R.
# NOTE(review): yearsDF comes from a one-column select, so this row subset
# presumably drops to a plain vector rather than a data frame; add
# `drop = FALSE` if a data frame is wanted -- confirm intent.
yearsDF[order(yearsDF$year), ]

In [None]:
# Stop the SparkSession now that the analysis is finished. SparkDataFrames
# created above should not be used after this point.
sparkR.session.stop()