# Calculating the per-key "min" of the data

Required methods are:

- ``filter`` to only keep the entries that satisfy a condition
- ``map`` to keep a slice of the data set
- ``reduceByKey`` to reduce all values that share a key to their "min".

In [3]:
import findspark
findspark.init()

from pyspark import SparkConf, SparkContext

# Configure a local Spark application named "MinTemperatures".
conf = SparkConf().setMaster("local").setAppName("MinTemperatures")
# getOrCreate returns the already-active SparkContext if one exists, so
# re-running this cell in the same kernel does not fail with
# "Cannot run multiple SparkContexts at once". Note that `conf` is only
# applied when a new context actually has to be created.
sc = SparkContext.getOrCreate(conf=conf)

def parseLine(line):
    """Parse one comma-separated record into a 3-tuple.

    Columns 0, 2 and 3 of the record are used; the other columns are ignored.

    Args:
        line: A single text line with at least four comma-separated columns.

    Returns:
        A tuple ``(stationID, entryType, temperature)`` where the first two
        items are the raw strings from columns 0 and 2, and ``temperature``
        is the numeric value of column 3 converted to Fahrenheit.

    Raises:
        IndexError: If the line has fewer than four columns.
        ValueError: If column 3 cannot be converted to a float.
    """
    columns = line.split(',')
    station = columns[0]
    kind = columns[2]
    reading = float(columns[3])
    # The reading is treated as tenths of a degree Celsius: scale it by 0.1,
    # then apply the Celsius -> Fahrenheit formula (C * 9/5 + 32).
    fahrenheit = reading * 0.1 * (9.0 / 5.0) + 32.0
    return (station, kind, fahrenheit)

# Load the raw CSV as an RDD of text lines.
lines = sc.textFile("file:///SparkCourse/filtering/1800.csv")
# Turn every line into a (stationID, entryType, temperature) tuple.
parsedLines = lines.map(parseLine)
# Keep only the entries whose type field contains "TMIN" (substring test).
tminEntries = parsedLines.filter(lambda entry: "TMIN" in entry[1])
# The entry type is the same for every remaining row, so drop it and keep
# (stationID, temperature) pairs keyed by station.
stationTemps = tminEntries.map(lambda entry: (entry[0], entry[2]))
# Reduce all temperatures that share a station ID to the smallest one.
minTemps = stationTemps.reduceByKey(min)
# Bring the per-station minima back to the driver as a list of pairs.
results = minTemps.collect()

for station, minTemp in results:
    print(station + "\t{:.2f}F".format(minTemp))

ITE00100554	5.36F
EZE00100082	7.70F
