In [10]:
from pyspark import SparkContext, SparkConf

conf = SparkConf().setAppName("Lab_1").setMaster("local[*]")
sc = SparkContext.getOrCreate()
# sc.setLogLevel("Error")

# This path is to the file on hdfs
temperature_file = sc.textFile("Data/temperature-readings-small.csv")
# (station, year-month-day, time, temperature, quality)
lines = temperature_file.map(lambda line: line.split(";"))

print(lines.take(10))

[['102170', '2013-11-01', '06:00:00', '6.8', 'G'], ['102170', '2013-11-01', '18:00:00', '3.8', 'G'], ['102170', '2013-11-02', '06:00:00', '5.8', 'G'], ['102170', '2013-11-02', '18:00:00', '-1.1', 'G'], ['102170', '2013-11-03', '06:00:00', '-0.2', 'G'], ['102170', '2013-11-03', '18:00:00', '5.6', 'G'], ['102170', '2013-11-04', '06:00:00', '6.5', 'G'], ['102170', '2013-11-04', '18:00:00', '5.1', 'G'], ['102170', '2013-11-05', '06:00:00', '4.2', 'G'], ['102170', '2013-11-05', '18:00:00', '3.2', 'G']]


In [2]:
## Q1
# lowest and highest temperatures measured each year for the period 1950-2014, descending.

# (key, value) = (year,temperature)
year_temp = lines.map(lambda x: (x[1][0:4], float(x[3])))
# Readings during 1950-2014
filtered_temp = year_temp.filter(lambda x: int(x[0])>=1950 and int(x[0])<=2014)

# take min => take max => join
min_temp = filtered_temp.reduceByKey(min)
max_temp = filtered_temp.reduceByKey(max)
min_max  = min_temp.join(max_temp)

# sort => combine in 1 file
min_max = min_max.sortBy(ascending = False, keyfunc=lambda k: k[1][1])
min_max = min_max.repartition(1)

# (year, (min, max)) 
print(min_max.take(10))
# min_max.saveAsTextFile("A1")

[('2014', (-24.3, 29.1)), ('2013', (-13.3, 10.2))]


In [3]:
## Q2_1
# Monthly readings higher than 10 degrees

# (year-month, (station, temperature))
monthly_temp = lines.map(lambda x: (x[1][0:7], (x[0], float(x[3]))))
# Readings during 1950-2014 & > 10
filtered_temp = monthly_temp.filter(lambda x: int(x[0][0:4]) >= 1950 and int(x[0][0:4]) <=2014 and float(x[1][1]) > 10)

# ((year-month), 1) => ((year-month), count) => combine => sort
counter = filtered_temp.map(lambda x: (x[0], 1))
monthly_count = counter.reduceByKey(lambda a,b: a+b)
RDD_combined  = monthly_count.repartition(1) 
sorted_count  = RDD_combined.sortBy(ascending = False, keyfunc=lambda k: k[1])

print(sorted_count.take(10))
# sorted_count.saveAsTextFile("A2_1")

[('2014-07', 62), ('2014-08', 56), ('2014-06', 54), ('2014-05', 31), ('2014-09', 25), ('2014-10', 17), ('2014-04', 9), ('2013-11', 1), ('2014-11', 1)]


In [4]:
## Q2_2
# Distinct readings by station per month higher than 10 degrees

# ((year-month),(station,1)) and only taking one reading per month per station
filtered_temp = monthly_temp.map(lambda x: (x[0], (x[1][0],1))).distinct() 

# ((year-month), (station, count))
dist_count = filtered_temp.reduceByKey(lambda a,b: (a[0], (a[1]+b[1])))

# map to ((year-month), (count)) => combine => sort
arranged_count = dist_count.map(lambda x: (x[0], x[1][1]))
RDD_combined = arranged_count.repartition(1) 
sorted_count = RDD_combined.sortBy(ascending = False, keyfunc=lambda k: k[1])

print(sorted_count.take(10))
# sorted_count.saveAsTextFile("A2_2")

[('2014-01', 1), ('2014-02', 1), ('2014-03', 1), ('2014-05', 1), ('2014-06', 1), ('2014-08', 1), ('2014-09', 1), ('2014-12', 1), ('2013-11', 1), ('2013-12', 1)]


In [5]:
## Q3
# Average monthly temperature

# ((year-month-day, station), (temperature))
station_temp = lines.map(lambda x: ((x[1][0:10], x[0]), (float(x[3]))))
# Readings during 1960-2014
filtered_temp = station_temp.filter(lambda x: int(x[0][0][0:4]) >= 1960 and int(x[0][0][0:4]) <=2014)

min_temp = filtered_temp.reduceByKey(min)
max_temp = filtered_temp.reduceByKey(max)
# ((year-month-day, station), (min, max))
min_max  = min_temp.join(max_temp)

# ((year-month, station), 1)
counter = min_max.map(lambda x: ((x[0][0][0:7], x[0][1]), 1))
# ((year-month, station), count)
count   = counter.reduceByKey(lambda a,b: (a+b))

# ((year-month, station), (min, max))
daily_min_max = min_max.map(lambda x: ((x[0][0][0:7], x[0][1]), (x[1])))
# ((year-month, station), (min_sum, max_sum))
min_max_sum   = daily_min_max.reduceByKey(lambda a,b: ((a[0]+b[0]), (a[1]+b[1])))

# ((year-month, station), ((min_sum, max_sum), count))
joint_RDD = min_max_sum.join(count)

# ((year-month, station), average) where average taken as (min_sum + max_sum / count * 2)
avg_temp = joint_RDD.map(lambda x: (x[0], ((x[1][0][0]+x[1][0][1])/(x[1][1]*2))))
avg_temp = avg_temp.sortBy(ascending = False, keyfunc=lambda k: k[1])

print(avg_temp.take(10))
# avg_temp.saveAsTextFile("A3")

[(('2014-07', '102170'), 19.65967741935484), (('2014-06', '102170'), 14.443333333333332), (('2014-08', '102170'), 13.869354838709679), (('2014-05', '102170'), 10.756451612903227), (('2014-09', '102170'), 8.584999999999999), (('2014-10', '102170'), 7.106451612903226), (('2014-04', '102170'), 4.776666666666667), (('2014-11', '102170'), 2.525), (('2014-03', '102170'), 1.8967741935483873), (('2013-12', '102170'), 0.7096774193548387)]


In [32]:
## Q4
# Stations with 25-30 degrees maximum temperature and 100-200mm maximum percipitation

# This path is to the file on hdfs
temperature_file = sc.textFile("Data/temperature-readings-small.csv")
# (station, year-month-day, time, temperature, quality)
lines = temperature_file.map(lambda line: line.split(";"))

# (station, temperature)
station_temp = lines.map(lambda x: (x[0], float(x[3])))
max_temp = station_temp.reduceByKey(max)
# Maximum temperature between 25 and 30 degrees
filtered_max_temp = max_temp.filter(lambda x: float(x[1])>=25 and float(x[1])<=30)

percipitation_file = sc.textFile("Data/precipitation-readings.csv")
# (station, year-month-day, time, percipitation, quality)
perc_lines = percipitation_file.map(lambda line: line.split(";"))

# (station, percipitation)
station_perc = perc_lines.map(lambda x: ((x[0], x[1]), (float(x[3]))))
perc_sum = station_perc.reduceByKey(lambda a,b: (a+b))
# Maximum percipitation between 100 and 200mm
filtered_max_perc = perc_sum.filter(lambda x: x[1]>=100 and x[1]<=200).map(lambda x: (x[0][0], x[1]))

result = filtered_max_temp.join(filtered_max_perc)
result = result.repartition(1)

print(perc_sum.take(10))
# max_temp.saveAsTextFile("A3")

[(('103100', '1995-08-30'), 0.0), (('103100', '1995-08-31'), 0.0), (('103100', '1995-09-03'), 3.2), (('103100', '1995-09-04'), 12.3), (('103100', '1995-09-11'), 3.0000000000000004), (('103100', '1995-09-12'), 0.4), (('103100', '1995-09-14'), 0.0), (('103100', '1995-09-19'), 0.0), (('103100', '1995-09-20'), 0.3), (('103100', '1995-09-22'), 0.0)]


In [51]:


# This path is to the file on hdfs
stations_file = sc.textFile("Data/stations-Ostergotland.csv")
station_lines = stations_file.map(lambda line: line.split(";"))
# (stations)
stations = station_lines.map(lambda x: x[0]).collect()

percipitation_file = sc.textFile("Data/precipitation-readings.csv")
# (station, year-month-day, time, percipitation, quality)
perc_lines = percipitation_file.map(lambda line: line.split(";"))

# ((station, year, month), (percipitation, 1))
prec_rdd = perc_lines.map(lambda x: ((x[0], x[1][0:4], x[1][5:7]), (float(x[3]), 1)))
precByStation = prec_rdd.filter(lambda x: x[0][0] in stations and int(x[0][1])>=1993 and int(x[0][1])<=2016)
#precByStation = precByStation.map(lambda x: ((x[0][1], x[0][2]), x[1]))
#precByStation = precByStation.reduceByKey(lambda x,y: (x[0]+y[0], x[1]+y[1])).map(lambda x: (x[0], x[1][0]/x[1][1])).sortByKey(False)
#sortByKey(False)

print(precByStation.take(10))
#stations.saveAsTextFile("BDA/output/Lab_1/A5")

[]
