In [2]:
// Starting Spark Job
val sqlContext = new org.apache.spark.sql.SQLContext(sc)

sqlContext: org.apache.spark.sql.SQLContext = org.apache.spark.sql.SQLContext@667da921


In [3]:
// Load files to DataFrame
val weather = spark.read
                .option("header", true)
                .option("inferSchema", true)
                .csv("data/2019")
                .cache()

val stationList = spark.read
                .option("header", true)
                .option("inferSchema", true)
                .csv("stationlist.csv")

val countryList = spark.read
                .option("header", true)
                .option("inferSchema", true)
                .csv("countrylist.csv")

weather: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [STN---: int, WBAN: int ... 14 more fields]
stationList: org.apache.spark.sql.DataFrame = [STN_NO: string, COUNTRY_ABBR: string]
countryList: org.apache.spark.sql.DataFrame = [COUNTRY_ABBR: string, COUNTRY_FULL: string]


In [4]:
// Show schemas and sample data
weather.printSchema()
stationList.printSchema()
countryList.printSchema()
weather.show(2, false)
stationList.show(2, false)
countryList.show(2, false)


root
 |-- STN---: integer (nullable = true)
 |-- WBAN: integer (nullable = true)
 |-- YEARMODA: integer (nullable = true)
 |-- TEMP: double (nullable = true)
 |-- DEWP: double (nullable = true)
 |-- SLP: double (nullable = true)
 |-- STP: double (nullable = true)
 |-- VISIB: double (nullable = true)
 |-- WDSP: double (nullable = true)
 |-- MXSPD: double (nullable = true)
 |-- GUST: double (nullable = true)
 |-- MAX: string (nullable = true)
 |-- MIN: string (nullable = true)
 |-- PRCP: string (nullable = true)
 |-- SNDP: double (nullable = true)
 |-- FRSHTT: integer (nullable = true)

root
 |-- STN_NO: string (nullable = true)
 |-- COUNTRY_ABBR: string (nullable = true)

root
 |-- COUNTRY_ABBR: string (nullable = true)
 |-- COUNTRY_FULL: string (nullable = true)

+------+-----+--------+----+----+------+------+-----+----+-----+----+-----+-----+-----+----+------+
|STN---|WBAN |YEARMODA|TEMP|DEWP|SLP   |STP   |VISIB|WDSP|MXSPD|GUST|MAX  |MIN  |PRCP |SNDP|FRSHTT|
+------+-----+--------+---

In [5]:
// Prepare station - country DF
val stationCountry = stationList.join(countryList, stationList("COUNTRY_ABBR") === countryList("COUNTRY_ABBR"))
            .select("STN_NO", "COUNTRY_FULL")
            .cache()

stationCountry: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [STN_NO: string, COUNTRY_FULL: string]


In [6]:
// Get Min and Max Value of Date
// Make sure there is no out-of-range date
val dateDF = weather.agg(min("YEARMODA"), max("YEARMODA")).show


        

+-------------+-------------+
|min(YEARMODA)|max(YEARMODA)|
+-------------+-------------+
|     20190101|     20200101|
+-------------+-------------+



dateDF: Unit = ()


In [7]:
/* 

Q1: Which country had the hottest average mean temperature over the year? 

*/


val maxMeanTemp = weather.select("STN---", "TEMP")
        .filter($"TEMP" < 9999.9)
        .groupBy("STN---")
        .agg(avg("TEMP").as("avg_temp"))
        .orderBy(desc("avg_temp"))
        .limit(1)
        .cache()
        
val maxMeanTempCountry = maxMeanTemp.join(stationCountry, maxMeanTemp("STN---") === stationCountry("STN_NO"))
                            .select("COUNTRY_FULL")
                            .show  


+------------+
|COUNTRY_FULL|
+------------+
|SAUDI ARABIA|
+------------+



maxMeanTemp: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [STN---: int, avg_temp: double]
maxMeanTempCountry: Unit = ()


In [8]:
/* 

NOT COMPLETE

Q2: Which country had the most consecutive days of tornadoes/funnel cloud formations?

*/


// val funnelCloud = weather.select("STN---", "YEARMODA", "FRSHTT")

// val country = funnelCloud.join(stationCountry, secondHighestWDSP("STN---") === stationCountry("STN_NO"))
//                             .filter(($"FRSHTT" / 10) % 10 === 1)
//                             .repartition($"COUNTRY_FULL")
//                             .orderBy("YEARMODA")
//                             .show   


In [9]:
/* 

Q3: Which country had the second highest average mean wind speed over the year?

*/

val secondHighestWDSP = weather.select("STN---", "WDSP")
        .filter($"WDSP" < 999.9)
        .groupBy("STN---")
        .agg(avg("WDSP").as("avg_wdsp"))
        .orderBy(desc("avg_wdsp"))
        .limit(2)
        .orderBy(asc("avg_wdsp"))
        .limit(1)
        .cache()
        
val secondHighestWDSPCountry = secondHighestWDSP.join(stationCountry, secondHighestWDSP("STN---") === stationCountry("STN_NO"))
                            .select("COUNTRY_FULL")
                            .show   

+------------+
|COUNTRY_FULL|
+------------+
|  ANTARCTICA|
+------------+



secondHighestWDSP: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [STN---: int, avg_wdsp: double]
secondHighestWDSPCountry: Unit = ()
