In [5]:
from pprint import pprint
from pyspark.sql import SparkSession

# New API: the SparkSession is the entry point for DataFrames; it is built
# against the standalone cluster master (or reused if one already exists).
spark_session = (
    SparkSession.builder
    .master("spark://192.168.1.19:7077")
    .appName("read_weather_data")
    .getOrCreate()
)

# Old API (RDD): the SparkContext that belongs to the session.
spark_context = spark_session.sparkContext

# Read the weather CSV from HDFS, taking column names from the header row,
# and mark the resulting DataFrame for caching.
data_frame = (
    spark_session.read
    .option("header", "true")
    .csv('hdfs://192.168.1.19:9000/weather.csv')
    .cache()
)

In [6]:
# Display the URL of the Spark web UI for this application.
spark_context.uiWebUrl

'http://ben-uppmax-haste-spark-master:4040'

In [7]:
# For comparison, counts taken with shell commands (not run by this notebook).
# Count the files:
#   $ find -type f | wc -l
#   2546

# Count the lines in one file:
#   $ cat 10224099999.csv | wc -l
#   8756

# Count the CSV rows in the DataFrame.
# NOTE(review): data_frame was created with .cache(); presumably this first
# full pass over the data is what populates the cache -- confirm.
data_frame.count()

8673817

In [8]:
# Print the first rows as a text table (long cell values are truncated with '...').
data_frame.show()

+-----------+-------------------+------+--------+----------+---------+---------------+-----------+---------+---------------+--------------+-----------+------------+-------+-------+-------+-----------+--------+--------------------+-------------+----+----+
|    STATION|               DATE|SOURCE|LATITUDE| LONGITUDE|ELEVATION|           NAME|REPORT_TYPE|CALL_SIGN|QUALITY_CONTROL|           WND|        CIG|         VIS|    TMP|    DEW|    SLP|        AA1|     AY1|                 GF1|          KA1| MW1| EQD|
+-----------+-------------------+------+--------+----------+---------+---------------+-----------+---------+---------------+--------------+-----------+------------+-------+-------+-------+-----------+--------+--------------------+-------------+----+----+
|01199099999|1950-01-01T07:00:00|     4|   68.75|23.5333333|    382.0|SIHCCAJAVRI, NO|      FM-12|    99999|           V020|160,1,N,0026,1|99999,9,9,N|999999,9,N,9|-0161,1|+9999,9|99999,9|99,0020,9,1|7,1,99,9|08,99,1,99,9,99,9...|999,N

In [9]:
# Print the column names and types. The CSV was read with only the "header"
# option set, and every column comes back as a nullable string.
data_frame.printSchema()

root
 |-- STATION: string (nullable = true)
 |-- DATE: string (nullable = true)
 |-- SOURCE: string (nullable = true)
 |-- LATITUDE: string (nullable = true)
 |-- LONGITUDE: string (nullable = true)
 |-- ELEVATION: string (nullable = true)
 |-- NAME: string (nullable = true)
 |-- REPORT_TYPE: string (nullable = true)
 |-- CALL_SIGN: string (nullable = true)
 |-- QUALITY_CONTROL: string (nullable = true)
 |-- WND: string (nullable = true)
 |-- CIG: string (nullable = true)
 |-- VIS: string (nullable = true)
 |-- TMP: string (nullable = true)
 |-- DEW: string (nullable = true)
 |-- SLP: string (nullable = true)
 |-- AA1: string (nullable = true)
 |-- AY1: string (nullable = true)
 |-- GF1: string (nullable = true)
 |-- KA1: string (nullable = true)
 |-- MW1: string (nullable = true)
 |-- EQD: string (nullable = true)



In [10]:
# The 'old' RDD API is still available underneath the DataFrame: .rdd exposes
# the rows as an RDD, and take(1) returns the first Row in a list.
data_frame.rdd.take(1)

[Row(STATION='01199099999', DATE='1950-01-01T07:00:00', SOURCE='4', LATITUDE='68.75', LONGITUDE='23.5333333', ELEVATION='382.0', NAME='SIHCCAJAVRI, NO', REPORT_TYPE='FM-12', CALL_SIGN='99999', QUALITY_CONTROL='V020', WND='160,1,N,0026,1', CIG='99999,9,9,N', VIS='999999,9,N,9', TMP='-0161,1', DEW='+9999,9', SLP='99999,9', AA1='99,0020,9,1', AY1='7,1,99,9', GF1='08,99,1,99,9,99,9,99999,9,99,9,99,9', KA1='999,N,-0228,1', MW1='73,1', EQD=None)]

In [11]:
# Number of partitions the DataFrame's underlying RDD is split into.
data_frame.rdd.getNumPartitions()

21

In [12]:
# .distinct(): de-duplicate the NAME column, then fetch 10 of the distinct
# values as a list of Row objects.
data_frame.select('NAME').distinct().take(10)

[Row(NAME='BOLGRAD, UP'),
 Row(NAME='ZHANGIZTOBE, KZ'),
 Row(NAME='KSARA LEBABON, LE'),
 Row(NAME='GREAT FALLS AIRPORT, MT US'),
 Row(NAME='WINSTON SALEM REYNOLDS AIRPORT, NC US'),
 Row(NAME='PINE BLUFF GRIDER FIELD, AR US'),
 Row(NAME='PAVELETS, RS'),
 Row(NAME='PORT SWEETENHAM, MY'),
 Row(NAME='BLIDA, AG'),
 Row(NAME='ES SENIA, AG')]

In [13]:
# .count(): how many distinct NAME values the data set contains.
data_frame.select('NAME').distinct().count()

2525

In [14]:
# .take() (like .show(), but it returns the rows as a list of Row objects
# instead of printing a table). Here: the first 10 raw WND values for the
# rows whose NAME is "PRESTWICK, UK".
data_frame.filter('NAME == "PRESTWICK, UK"').select('WND').take(10)

[Row(WND='130,1,N,0026,1'),
 Row(WND='135,1,N,0026,1'),
 Row(WND='180,1,N,0036,1'),
 Row(WND='180,1,N,0036,1'),
 Row(WND='140,1,N,0021,1'),
 Row(WND='135,1,N,0021,1'),
 Row(WND='170,1,N,0026,1'),
 Row(WND='180,1,N,0026,1'),
 Row(WND='170,1,N,0026,1'),
 Row(WND='180,1,N,0026,1')]

In [16]:
# Convert a column

import pyspark
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def WND_to_WND_SPEED_MS(wnd):
    """Extract the wind speed from a raw WND field.

    The WND field is a comma-separated string such as '160,1,N,0026,1'.
    Its fourth item is the speed multiplied by 10, so '0026' becomes 2.6
    (m/s, going by the function name).

    Args:
        wnd: Raw WND string, or None when the cell is empty.

    Returns:
        float: The speed, or the sentinel 999.0 when the field is None, has
        fewer than four items, or its speed item is not numeric. Callers
        drop the sentinel with a ``< 999`` filter.
    """
    missing = 999.0
    if wnd is None:
        return missing
    parts = wnd.split(',')
    # The speed is item 3, so at least four items are required. The previous
    # `< 2` check let 2- and 3-item fields through to an IndexError.
    if len(parts) < 4:
        return missing
    try:
        return float(parts[3]) / 10
    except ValueError:
        # Treat a malformed speed like a missing one instead of failing the job.
        return missing

# User-defined function. Input type is a string (the raw WND field); the
# return type is declared as 'double' (a DDL type string, so no extra import
# is needed) because the parser returns a float. Declaring it as StringType
# made WND_SPEED_MS a string column: the filter and the mean then relied on
# implicit casts, and min/max in .summary() compared the values as strings.
udf_WND_to_WND_SPEED_MS = udf(WND_to_WND_SPEED_MS, 'double')


data_frame_with_wnd_speed = data_frame.withColumn("WND_SPEED_MS", udf_WND_to_WND_SPEED_MS("WND"))



# 9999: missing (with scale factor of 10), i.e. 999.9 after conversion.
# Keeping only speeds below 999 drops those rows, and also the rows for which
# the parser signalled an unusable WND field.
data_frame_with_wnd_speed = data_frame_with_wnd_speed.filter(data_frame_with_wnd_speed['WND_SPEED_MS'] < 999)

# Alternative sketch using built-in column functions instead of a Python UDF
# (no scaling or type conversion applied yet):
#wnd_split = pyspark.sql.functions.split(data_frame['WND'], ',')
#data_frame_with_wnd_speed = data_frame.withColumn('WND_SPEED_MS', wnd_split.getItem(3))

# First rows of the raw field next to the converted speed.
data_frame_with_wnd_speed.select('WND', 'WND_SPEED_MS').show()

# Summary statistics (count, mean, stddev, min, quartiles, max) of the speed.
data_frame_with_wnd_speed.select('WND_SPEED_MS').summary().show()

+--------------+------------+
|           WND|WND_SPEED_MS|
+--------------+------------+
|160,1,N,0026,1|         2.6|
|140,1,N,0026,1|         2.6|
|090,1,N,0026,1|         2.6|
|100,1,N,0010,1|         1.0|
|100,1,N,0010,1|         1.0|
|180,1,N,0010,1|         1.0|
|250,1,N,0026,1|         2.6|
|240,1,N,0026,1|         2.6|
|090,1,N,0010,1|         1.0|
|200,1,N,0010,1|         1.0|
|240,1,N,0026,1|         2.6|
|270,1,N,0026,1|         2.6|
|040,1,N,0046,1|         4.6|
|040,1,N,0026,1|         2.6|
|040,1,N,0010,1|         1.0|
|290,1,N,0010,1|         1.0|
|090,1,N,0026,1|         2.6|
|210,1,N,0010,1|         1.0|
|300,1,N,0010,1|         1.0|
|210,1,N,0010,1|         1.0|
+--------------+------------+
only showing top 20 rows

+-------+------------------+
|summary|      WND_SPEED_MS|
+-------+------------------+
|  count|           8601428|
|   mean|4.0470310511293395|
| stddev|3.2041228109591433|
|    min|               0.0|
|    25%|               1.5|
|    50%|             

In [17]:
# .agg(), .orderBy() -- which place is the windiest on average?
most_windy = (
    data_frame_with_wnd_speed
    .groupby('NAME')
    .agg({'WND_SPEED_MS': 'mean'})
    # The dict form of agg() names its result column 'avg(WND_SPEED_MS)'.
    .orderBy('avg(WND_SPEED_MS)', ascending=False)
    # Rename a column
    .withColumnRenamed('avg(WND_SPEED_MS)', 'AVG_WND_SPEED_MS')
)

most_windy.show()

+--------------------+------------------+
|                NAME|  AVG_WND_SPEED_MS|
+--------------------+------------------+
|  VESTMANNAEYJAR, IC|            11.555|
| CAMPBELL ISLAND, NZ|10.924466571835005|
|    GRAN CANARIA, SP|10.658823529411762|
|       LANZAROTE, SP|10.207692307692309|
|AMCHITKA ISLAND, ...| 10.05585070611949|
|MOUNT LAGUNA CAA ...|  9.96358447488591|
|FELDBERG SCHWARZW...|   9.5155817174515|
|LUDERITZ DIAZ POI...| 9.252941176470593|
|PINE SPRINGS GUAD...| 8.779119561493236|
|   ILE AMSTERDAM, FS| 8.772727272727295|
|        WALES, AK US| 8.759103385178399|
|   MARION ISLAND, SF|  8.67556008146638|
|           MATUA, RS| 8.636529680365339|
|    ARGENTIA AUT, CA| 8.558078552616378|
|       CASPER, WY US|  8.35883822610875|
|           ASSAB, ER| 8.353956834532378|
|  BUHTA GAVRIILA, RS| 8.194366197183165|
|   CAPE CAMPBELL, NZ| 8.194150417827288|
|       MOGADISHU, SO| 8.189841269841274|
|       KANIN NOS, RS| 8.145466155810983|
+--------------------+------------

In [18]:
# Save the 20 windiest places to HDFS in CSV format. mode('overwrite')
# replaces any output left by a previous run; with the default save mode the
# write raises an error when the path already exists, so this cell could not
# be re-run.
most_windy.limit(20).write.format('csv').mode('overwrite').save('hdfs://192.168.1.19:9000/weather-output.csv')

In [None]:
# Write the full result (no limit) as CSV; Spark writes it as ~200 separate part files.
# most_windy.write.format('csv').save('/mnt/nfs/ben-spark-master/teaching/noaa/output-all.csv')