### **Exercise 46**
Consider all the windows containing 3 consecutive temperature readings (one reading every 60 seconds). Select the windows characterized by an increasing trend. A window is characterized by an increasing trend if, for every temperature reading in it except the first one:

**temperature(t) > temperature (t - 60s)**

In [4]:
from pyspark.sql import SparkSession

# Folder holding the input CSV data
inWindows = "/data/students/bigdata-01QYD/ex_data/Ex46/data/"

spark = SparkSession.builder.getOrCreate()

# The CSV has no header line, so Spark names the columns _c0 and _c1;
# give them meaningful names right after loading.
readingsDF = (
    spark.read
    .load(inWindows, format='csv', header=False, inferSchema=True)
    .withColumnRenamed('_c0', 'timestamp')
    .withColumnRenamed('_c1', 'temperature')
)

In [5]:
# Sanity check: print the inferred column types and preview the readings
readingsDF.printSchema()
readingsDF.show()

root
 |-- timestamp: integer (nullable = true)
 |-- temperature: double (nullable = true)

+----------+-----------+
| timestamp|temperature|
+----------+-----------+
|1451606400|       12.1|
|1451606460|       12.2|
|1451606520|       13.5|
|1451606580|       14.0|
|1451606640|       14.0|
|1451606700|       15.5|
|1451606760|       15.0|
+----------+-----------+



In [6]:
# Transform the DataFrame into an RDD of Row objects
readingsRDD = readingsDF.rdd
# Display the rows (note: collected from the DataFrame, not from readingsRDD)
readingsDF.collect()

[Row(timestamp=1451606400, temperature=12.1),
 Row(timestamp=1451606460, temperature=12.2),
 Row(timestamp=1451606520, temperature=13.5),
 Row(timestamp=1451606580, temperature=14.0),
 Row(timestamp=1451606640, temperature=14.0),
 Row(timestamp=1451606700, temperature=15.5),
 Row(timestamp=1451606760, temperature=15.0)]

In [7]:
# Generate the elements of each window
# Each reading with start time t belongs to 3 windows
# - the one starting at time t-120s
# - the one starting at time t-60s
# - the one starting at time t

def windowElementsFunc(reading):
    """Map one reading to the three sliding windows that contain it.

    A window covers 3 consecutive readings taken 60 seconds apart and is
    identified by its start timestamp. A reading taken at time t therefore
    belongs to the windows starting at t, t-60 and t-120.

    Parameters
    ----------
    reading : mapping-like object (e.g. a Row) with a 'timestamp' entry,
        expressed in seconds.

    Returns
    -------
    list of (window_start_timestamp, reading) pairs, in the order
    t, t-60, t-120. The reading itself is passed through unchanged.
    """
    # time stamp of this reading
    t = reading['timestamp']

    # One pair per window containing this reading: the offset is how far
    # back (in seconds) the window starts with respect to the reading.
    return [(t - offset, reading) for offset in (0, 60, 120)]

In [8]:
# Emit, for every reading, one (window start, reading) pair per window it belongs to
windowsElementsRDD = readingsRDD.flatMap(windowElementsFunc)

In [9]:
# Inspect the (window start timestamp, reading) pairs
windowsElementsRDD.collect()

[(1451606400, Row(timestamp=1451606400, temperature=12.1)),
 (1451606340, Row(timestamp=1451606400, temperature=12.1)),
 (1451606280, Row(timestamp=1451606400, temperature=12.1)),
 (1451606460, Row(timestamp=1451606460, temperature=12.2)),
 (1451606400, Row(timestamp=1451606460, temperature=12.2)),
 (1451606340, Row(timestamp=1451606460, temperature=12.2)),
 (1451606520, Row(timestamp=1451606520, temperature=13.5)),
 (1451606460, Row(timestamp=1451606520, temperature=13.5)),
 (1451606400, Row(timestamp=1451606520, temperature=13.5)),
 (1451606580, Row(timestamp=1451606580, temperature=14.0)),
 (1451606520, Row(timestamp=1451606580, temperature=14.0)),
 (1451606460, Row(timestamp=1451606580, temperature=14.0)),
 (1451606640, Row(timestamp=1451606640, temperature=14.0)),
 (1451606580, Row(timestamp=1451606640, temperature=14.0)),
 (1451606520, Row(timestamp=1451606640, temperature=14.0)),
 (1451606700, Row(timestamp=1451606700, temperature=15.5)),
 (1451606640, Row(timestamp=1451606700, 

In [12]:
# Gather the readings of each window under its start timestamp
timestampsWindowsRDD = windowsElementsRDD.groupByKey()

# Turn each group of readings into a plain list so that it can be displayed
timestampsWindowsRDD.mapValues(list).collect()

[(1451606400,
  [Row(timestamp=1451606400, temperature=12.1),
   Row(timestamp=1451606460, temperature=12.2),
   Row(timestamp=1451606520, temperature=13.5)]),
 (1451606340,
  [Row(timestamp=1451606400, temperature=12.1),
   Row(timestamp=1451606460, temperature=12.2)]),
 (1451606280, [Row(timestamp=1451606400, temperature=12.1)]),
 (1451606460,
  [Row(timestamp=1451606460, temperature=12.2),
   Row(timestamp=1451606520, temperature=13.5),
   Row(timestamp=1451606580, temperature=14.0)]),
 (1451606520,
  [Row(timestamp=1451606520, temperature=13.5),
   Row(timestamp=1451606580, temperature=14.0),
   Row(timestamp=1451606640, temperature=14.0)]),
 (1451606580,
  [Row(timestamp=1451606580, temperature=14.0),
   Row(timestamp=1451606640, temperature=14.0),
   Row(timestamp=1451606700, temperature=15.5)]),
 (1451606640,
  [Row(timestamp=1451606640, temperature=14.0),
   Row(timestamp=1451606700, temperature=15.5),
   Row(timestamp=1451606760, temperature=15.0)]),
 (1451606700,
  [Row(times

In [13]:
# Keep only the readings of each window, dropping the start-timestamp key
windowsRDD = timestampsWindowsRDD.values()

In [14]:
import sys
# Pay attention that some lists are not complete, we need to discard them

# This function is used to select the windows that are increasing
def increasingTrendFunc(window): # window is the 3 or less elements list
    """Tell whether a window of readings has a strictly increasing trend.

    Parameters
    ----------
    window : iterable of mapping-like readings (e.g. Rows), each with a
        'timestamp' (seconds) and a 'temperature' entry. The readings may
        come in any order; the iterable is traversed only once.

    Returns
    -------
    bool
        True only if the window holds exactly three readings with distinct
        timestamps t, t+60, t+120 and
        temperature(t) < temperature(t+60) < temperature(t+120).
        Incomplete windows (fewer than 3 distinct timestamps) and windows
        whose readings are not spaced by exactly 60 seconds yield False.
    """
    # time stamp -> temperature for the readings of this window
    timestampTemp = {}
    for reading in window:
        timestampTemp[reading['timestamp']] = reading['temperature']

    # Windows at the borders of the series contain fewer than 3 readings:
    # they are incomplete and must be discarded.
    if len(timestampTemp) != 3:
        return False

    minTimestamp = min(timestampTemp)
    secondTimestamp = minTimestamp + 60
    thirdTimestamp = minTimestamp + 120

    # Three readings that are not exactly 60 seconds apart do not form a
    # valid window; reject them instead of failing on a missing key.
    if secondTimestamp not in timestampTemp or thirdTimestamp not in timestampTemp:
        return False

    # Strictly increasing: every reading is greater than the previous one
    return (timestampTemp[minTimestamp] < timestampTemp[secondTimestamp]
            and timestampTemp[secondTimestamp] < timestampTemp[thirdTimestamp])

In [15]:
# Keep only the windows for which increasingTrendFunc returns True
selectedWindowRDD = windowsRDD.filter(increasingTrendFunc)

In [17]:
# Show the selected windows, each one as a plain list of readings
selectedWindowRDD.map(list).collect()

[[Row(timestamp=1451606400, temperature=12.1),
  Row(timestamp=1451606460, temperature=12.2),
  Row(timestamp=1451606520, temperature=13.5)],
 [Row(timestamp=1451606460, temperature=12.2),
  Row(timestamp=1451606520, temperature=13.5),
  Row(timestamp=1451606580, temperature=14.0)]]