In [1]:
# Exercise 46

Consider all the windows containing 3 consecutive temperature readings. Select the windows characterized by an increasing trend. A window is characterized by an increasing trend if, for every temperature reading in it that has a previous reading in the same window:

**temperature(t) > temperature (t - 60s)**

In [28]:
# Bare expression: displays the SparkContext object.
# NOTE(review): `sc` is not created in this notebook; presumably it is
# provided by the PySpark kernel - confirm.
sc
# Input path with the readings, one 'timestamp,temperature' line per reading
inWindows = "/data/students/bigdata-01QYD/ex_data/Ex46/data/"
# Output path intended for the selected windows
outputPath = "/res_out_ex46/"

In [5]:
# Read the input as an RDD of strings, one element per input line
logRDD = sc.textFile(inWindows)
# Peek at the data. Note that top(2) returns the 2 largest elements in
# descending order (plain string comparison here), not the first 2 lines
logRDD.top(2)

['1451606760,15.0', '1451606700,15.5']

A smart approach is to use a **flatMap** to create an RDD of pairs whose keys are the window start timestamps. At the end we use a **groupByKey** to put together all the readings of each window. The problem could also be solved with a join operation, but that is suitable only for windows of two elements, no more. Always prefer this approach: it is the safe choice.

In [10]:
# Generate elements of each window
# Each reading with start time t belongs to 3 windows with a window size equivalent to 120s
# - The one starting at time t-120s
# - The one starting at time t-60s
# - The one starting at time t

def windowElementsFunc(reading):
    """Assign one reading to each of the three windows that contain it.

    Parameters
    ----------
    reading : str
        A line formatted as 'timestamp,temperature'.

    Returns
    -------
    list of (int, str)
        Three (window start time stamp, reading) pairs, one for each of the
        windows starting at t, t-60 and t-120 (in this order), where t is
        the time stamp of the reading.
    """
    parts = reading.split(',')

    # Time stamp of this reading
    start = int(parts[0])

    # The temperature is parsed but not used afterwards; the call is kept so
    # that a malformed temperature still raises at this point
    float(parts[1])

    # The reading belongs to the windows starting 0s, 60s and 120s before it
    return [(start - offset, reading) for offset in (0, 60, 120)]

In [11]:
# For every reading, emit one (window start time stamp, reading) pair for
# each window returned by windowElementsFunc
windowElementsRDD = logRDD.flatMap(windowElementsFunc)

In [12]:
# JUST FOR DEBUG: collect() returns all the pairs to the driver,
# which is acceptable only because the input is tiny
windowElementsRDD.collect()

[(1451606400, '1451606400,12.1'),
 (1451606340, '1451606400,12.1'),
 (1451606280, '1451606400,12.1'),
 (1451606460, '1451606460,12.2'),
 (1451606400, '1451606460,12.2'),
 (1451606340, '1451606460,12.2'),
 (1451606520, '1451606520,13.5'),
 (1451606460, '1451606520,13.5'),
 (1451606400, '1451606520,13.5'),
 (1451606580, '1451606580,14.0'),
 (1451606520, '1451606580,14.0'),
 (1451606460, '1451606580,14.0'),
 (1451606640, '1451606640,14.0'),
 (1451606580, '1451606640,14.0'),
 (1451606520, '1451606640,14.0'),
 (1451606700, '1451606700,15.5'),
 (1451606640, '1451606700,15.5'),
 (1451606580, '1451606700,15.5'),
 (1451606760, '1451606760,15.0'),
 (1451606700, '1451606760,15.0'),
 (1451606640, '1451606760,15.0')]

If we only wanted to check whether each window is complete, at this step we could assign 1 instead of **'timestamp,temperature'** to each pair and use **reduceByKey()** to count the readings of each window, which would solve the problem.

In [15]:
# Group the pairs by window start time stamp:
# one sequence of readings for each window
timestampsWindowsRDD = windowElementsRDD.groupByKey()

# JUST FOR DEBUG: the grouped values are iterables, so they are
# converted to lists to make the output human readable
timestampsWindowsRDD.mapValues(list).collect()

[(1451606400, ['1451606400,12.1', '1451606460,12.2', '1451606520,13.5']),
 (1451606340, ['1451606400,12.1', '1451606460,12.2']),
 (1451606280, ['1451606400,12.1']),
 (1451606460, ['1451606460,12.2', '1451606520,13.5', '1451606580,14.0']),
 (1451606520, ['1451606520,13.5', '1451606580,14.0', '1451606640,14.0']),
 (1451606580, ['1451606580,14.0', '1451606640,14.0', '1451606700,15.5']),
 (1451606640, ['1451606640,14.0', '1451606700,15.5', '1451606760,15.0']),
 (1451606700, ['1451606700,15.5', '1451606760,15.0']),
 (1451606760, ['1451606760,15.0'])]

In [16]:
# Only the grouped readings are needed from here on:
# keep the values and discard the keys (the window start time stamps)
windowsRDD = timestampsWindowsRDD.values()

In [26]:
import sys
# Pay attention that some lists are not complete, we need to discard them

# This function is used to select the windows that are increasing
def increasingTrendFunc(window):
    """Tell whether a window is complete and has an increasing trend.

    Parameters
    ----------
    window : iterable of str
        The (at most 3) readings of one window, each formatted as
        'timestamp,temperature'. The readings may come in any order.

    Returns
    -------
    bool
        True only if the window holds three readings with distinct time
        stamps t, t+60 and t+120 and
        temperature(t) < temperature(t+60) < temperature(t+120).
        False for incomplete windows, for windows whose readings are not
        spaced by 60s, and for windows without a strictly increasing trend.
    """
    # Store the readings of the window in a dictionary
    # containing entries time stamp -> temperature
    timestampTemp = {}
    for timestampTemperature in window:
        fields = timestampTemperature.split(',')
        timestampTemp[int(fields[0])] = float(fields[1])

    # If the number of elements is not equal to 3 the window is incomplete
    if len(timestampTemp) != 3:
        return False

    minTimestamp = min(timestampTemp)
    first = timestampTemp[minTimestamp]
    # .get() instead of indexing: readings that are not exactly 60s apart
    # make the window invalid rather than raising KeyError
    second = timestampTemp.get(minTimestamp + 60)
    third = timestampTemp.get(minTimestamp + 120)
    if second is None or third is None:
        return False

    # Strictly increasing trend: temperature(t) > temperature(t - 60s)
    return first < second < third

In [27]:
# Keep only the windows for which increasingTrendFunc returns True
selectedWindowsRDD = windowsRDD.filter(increasingTrendFunc)

# JUST FOR DEBUG: turn each window into a list so that the output is readable
selectedWindowsRDD.map(list).collect()

[['1451606400,12.1', '1451606460,12.2', '1451606520,13.5'],
 ['1451606460,12.2', '1451606520,13.5', '1451606580,14.0']]

In [31]:
# Prepare the result to be stored: map the iterable associated with each
# window to a list
savefile = selectedWindowsRDD.map(lambda window: list(window))
# The save step is left disabled, so nothing is written to outputPath.
# NOTE(review): the RDD method that writes text output is saveAsTextFile
# (there is no saveAsText); the disabled call below uses the correct name.
#savefile.saveAsTextFile(outputPath)