In [1]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 310.8/310.8 MB 4.6 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... done
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317130 sha256=1dd84ad257c5c8cf7bcbc0df3c42a59f0020321531ba735da7af626e95330169
  Stored in directory: /root/.cache/pip/wheels/7b/1b/4b/3363a1d04368e7ff0d408e57ff57966fcdf00583774e761327
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0


In [12]:
from pyspark import SparkConf, SparkContext 
from google.colab import files
from datetime import datetime
import sys
import os

def edit_value(v):
  """Return ``(hour, free)`` extracted from one tab-separated line.

  ``hour`` is the text before the first ':' in the part of the second
  column that follows its first space; ``free`` is the fourth column.
  Both values are returned as strings, unchanged from the input.

  Raises IndexError when the line has fewer than four columns or the
  second column contains no space.
  """
  columns = v.split('\t')
  # Isolate the time portion of the second column, then its leading "hh".
  time_part = columns[1].split(' ')[1]
  hh = time_part.split(':')[0]
  return (hh, columns[3])

def checkFull(line):
    r"""Convert one reading line into a keyed pair of counters.

    Input layout (tab-separated): station, timestamp, used, free
    Example: ``1\t2008-05-15 12:01:00\t0\t18``

    Returns ``((station, weekday_name, hour), (1, full))`` where ``full``
    is 1 when the free-slot column equals 0 and 0 otherwise. The station
    stays a string, the weekday is the ``%A`` name of the timestamp and
    the hour is an int.

    Raises IndexError / ValueError on lines that do not match the layout.
    """
    columns = line.split("\t")
    station = columns[0]
    free = int(columns[3])
    moment = datetime.strptime(columns[1], "%Y-%m-%d %H:%M:%S")

    # (number of readings, number of readings with no free slot)
    tally = (1, 1 if free == 0 else 0)

    return ((station, moment.strftime("%A"), moment.hour), tally)

def compare(a, b):
  """Choose between two ``(weekday, hour, criticality)`` tuples.

  Preference, in order: the higher criticality, then the lower hour,
  then the weekday name that sorts first. If all three fields are equal
  the second argument ``b`` is returned.
  """
  # Numeric tie-breaks in priority order: (field index, whether larger wins).
  for idx, larger_wins in ((2, True), (1, False)):
    left, right = a[idx], b[idx]
    if left > right:
      return a if larger_wins else b
    if right > left:
      return b if larger_wins else a

  # Last resort: lexicographical order of the weekday names.
  return a if a[0] < b[0] else b

def extractStationLongLat(line):
    """Map a tab-separated line to ``(column0, (column1, column2))``.

    All three values are returned as strings; any further columns are
    ignored. Judging by the function name the pair is presumably
    (longitude, latitude) -- confirm against the stations file.

    Raises IndexError when the line has fewer than three columns.
    """
    columns = line.split("\t")
    station_id = columns[0]
    position = (columns[1], columns[2])
    return (station_id, position)

def formatKMLMarker(pair):
    """Render one joined record as a single-line KML ``<Placemark>`` string.

    ``pair`` has the shape
    ``(stationId, ((weekday, hour, criticality), (long, lat)))``.
    ``stationId``, ``weekday`` and both coordinates must already be
    strings; ``hour`` and ``criticality`` are converted with ``str``.
    """
    station = pair[0]
    slot = pair[1][0]
    location = pair[1][1]

    # KML expects the two coordinates separated by a comma.
    coords = ",".join((location[0], location[1]))

    pieces = [
        '<Placemark><name>', station, '</name>', '<ExtendedData>',
        '<Data name="DayWeek"><value>', slot[0], '</value></Data>',
        '<Data name="Hour"><value>', str(slot[1]), '</value></Data>',
        '<Data name="Criticality"><value>', str(slot[2]), '</value></Data>',
        '</ExtendedData>', '<Point>', '<coordinates>', coords, '</coordinates>',
        '</Point>', '</Placemark>',
    ]
    return "".join(pieces)

if __name__=="__main__":
  # Minimum fraction of "full" readings for a (station, weekday, hour)
  # timeslot to be kept as critical.
  threshold = 0.6

  conf = SparkConf().setAppName("Spark Lab07")
  sc = SparkContext.getOrCreate(conf=conf)

  try:
    # Keep the data rows only. The header row is the one that starts with the
    # column name "station", so it has to be excluded, not selected.
    registerRDD = sc.textFile('registerSample.csv').filter(lambda line: not line.startswith('station'))

    # Discard only the readings where used and free slots are BOTH 0.
    # Readings with free == 0 alone must be kept: checkFull counts exactly
    # those as "full", so removing them would make every criticality 0.
    registerRDD = registerRDD.filter(lambda l: int(l.split('\t')[2])!=0 or int(l.split('\t')[3])!=0)

    # ((station, weekday, hour), (readings, full readings)), summed per key.
    stationWeekDayHour = registerRDD.map(checkFull)
    stationWeekDayHourCounts = stationWeekDayHour.reduceByKey(lambda p1, p2: (p1[0]+p2[0], p1[1]+p2[1]))

    # Criticality = full readings / total readings; keep slots at or above the threshold.
    stationWeekDayHourCriticality = stationWeekDayHourCounts.mapValues(lambda value: value[1]/value[0])
    selectedPairs = stationWeekDayHourCriticality.filter(lambda pair: pair[1]>= threshold)

    # Re-key by station and keep, per station, the timeslot preferred by compare.
    stationTimeslotCrit = selectedPairs.map(lambda s: (s[0][0],(s[0][1],s[0][2],s[1])))
    resultRDD = stationTimeslotCrit.reduceByKey(compare)

    # Attach the coordinates of each selected station and render the KML markers.
    stationLocation = sc.textFile('stations.csv').map(extractStationLongLat)
    resultLocations = resultRDD.join(stationLocation)
    resultKML = resultLocations.map(formatKMLMarker)

    # Everything above is a lazy transformation: without an action nothing is
    # computed, and the RDD cannot be evaluated once the context is stopped.
    # Bring the markers to the driver while the context is still alive.
    kmlMarkers = resultKML.collect()
  finally:
    # Release the context even when the job fails.
    sc.stop()

  for marker in kmlMarkers:
    print(marker)