__Libraries__

In [1]:
from pyspark import SparkContext
from pyspark.sql import SparkSession, Row, SQLContext
from pyspark.streaming import StreamingContext
from pyspark.sql.types import StructType
from pyspark.sql.functions import monotonically_increasing_id, row_number
from pyspark.sql.window import Window

In [2]:
from geopy.geocoders import Nominatim
import geopy
from geopy.extra.rate_limiter import RateLimiter
geopy.geocoders.options.default_user_agent = "BDBA"
from tqdm import tqdm

__Functions__

In [3]:
def road_from_coord(vehicle):
    lat = vehicle.select(vehicle['latitude'])
    lon = vehicle.select(vehicle['longitude'])
    coordinates= str(lat)+ ','+str(lon)
    #10 mins maximum timeout to prevent being blocked
    locator = Nominatim(timeout=10)
    rgeocode = RateLimiter(locator.reverse, min_delay_seconds=0.001)
    location = rgeocode(coordinates)
    return(location.raw['address']['road'])

__Open Context__

In [4]:
sc = SparkContext(appName="SparkforEachRDDapp")
spark = SparkSession(sc)
sqlContext = SQLContext(sc)
ssc = StreamingContext(sc, 10)

__Read Data__

In [5]:
inputPath = "./vehicdir"
df=spark.readStream.format('json').schema( StructType()
                .add("vehicle_id", "string")
                .add("datetime_utc", "float")
                .add("latitude", "float")
                .add("longitude", "float") 
                .add("accelerometer", "integer")
                .add("vehicle_type", 'string')
             ).load(inputPath)

In [6]:
print(df)

DataFrame[vehicle_id: string, datetime_utc: float, latitude: float, longitude: float, accelerometer: int, vehicle_type: string]


__Transform Data__

In [7]:
query1 = df.writeStream.foreach(road_from_coord).queryName('getroads').start()
query1.status

{'message': 'Initializing sources',
 'isDataAvailable': False,
 'isTriggerActive': False}

In [8]:
df.createOrReplaceTempView("df")
spark.sql("select * from df").show(1000, truncate=False)

AnalysisException: Queries with streaming sources must be executed with writeStream.start();;
FileSource[./vehicdir]

In [9]:
query2 = df.writeStream.foreach(lambda rdd: road_from_coord(rdd)).start()
query2.status

{'message': 'Initializing sources',
 'isDataAvailable': False,
 'isTriggerActive': False}

In [10]:
query3 = df.foreachRDD(lambda rdd: road_from_coord(rdd))
query3.status

AttributeError: 'DataFrame' object has no attribute 'foreachRDD'

### Other Things Tested

__Read Data 1__

In [11]:
inputdir = "/work/students/Project Cars (Camila, Paula, Juan, Rocio, Oriol, Valeria)/Project/roadsdir/roads.json"

In [12]:
lines = spark.readStream.format('json').schema(StructType()
               .add("road_name", "string", True)
               .add("number_lanes", "integer")
               .add("min_speed", "integer")
               .add("has_bus_lane", "string")).option('path', "./roadsdir/roads.json").load()

In [13]:
import json

In [14]:
lines = ssc.textFileStream("data/roads.json") # process files as they appear
data = lines.map(json.loads) # map DStream and return new DStream

In [15]:
lines.pprint()

In [16]:
data.pprint()

**Read Data 2**

In [17]:
road_schema = ( StructType()
               .add("road_name", "string", True)
               .add("number_lanes", "integer")
               .add("min_speed", "integer")
               .add("has_bus_lane", "string")
             )
vehic_schema = ( StructType()
                .add("vehicle_id", "string")
                .add("datetime_utc", "float")
                .add("latitude", "float")
                .add("longitude", "float") 
                .add("accelerometer", "integer")
                .add("vehicle_type", 'string')
             )


roadstream = spark.readStream.json('./roadsdir/roads.json', schema=road_schema)
vehicstream = spark.readStream.json('./vehicdir/vehicles.json', schema=vehic_schema)

In [18]:
display(vehicstream)

DataFrame[vehicle_id: string, datetime_utc: float, latitude: float, longitude: float, accelerometer: int, vehicle_type: string]

In [19]:
vehicstream.select(vehicstream['latitude'], vehicstream['longitude']).show()

AnalysisException: Queries with streaming sources must be executed with writeStream.start();;
FileSource[./vehicdir/vehicles.json]

__Read Data 3__

In [20]:
roads = spark.read.json('roads.json')
sc.parallelize(roads)

TypeError: cannot pickle '_thread.RLock' object