In [1]:
# findspark.init() is run before any pyspark import (the pyspark imports are in
# the next cell). NOTE(review): per findspark's docs this makes the local Spark
# installation importable -- confirm SPARK_HOME is set on the target machine.
import findspark
findspark.init()

In [2]:
import pyspark
import folium
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark import SparkContext
from pyspark.sql import SparkSession, SQLContext , functions
from folium import plugins


In [3]:
# Build (or reuse, via getOrCreate) the SparkSession for this notebook.
# The master URL 'local[6]' runs Spark on this machine rather than a cluster.
spark = (
    SparkSession.builder
    .appName('ReadFromParquet')
    .master('local[6]')
    .getOrCreate()
)

In [4]:
# Show the session object as the cell result to confirm it was created.
spark

In [5]:
# Keep handles to the session's SparkContext and to a SQLContext built on it.
# The SQLContext is what the cells below use to read Parquet files and run SQL.
sc = spark.sparkContext
sqlContext = SQLContext(sc)

# Data Feature

## osmWay

In [6]:
# Ask Spark SQL to read Parquet binary columns as strings; the schema printed
# in the next cell shows the tag keys/values as string.
sqlContext.setConf("spark.sql.parquet.binaryAsString", "true")

# Load the OSM ways, keep only the columns used in this notebook, and register
# the result as the temp view "osmWay" so it can be queried with SQL.
osmWay = (
    sqlContext.read
    .parquet("../data/20190531-hungary.osm.pbf.way.parquet")
    .select('id', 'tags', 'nodes')
)
osmWay.createOrReplaceTempView("osmWay")




In [7]:
# Print the column names, types and nesting of the way DataFrame.
osmWay.printSchema()

root
 |-- id: long (nullable = true)
 |-- tags: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- key: string (nullable = true)
 |    |    |-- value: string (nullable = true)
 |-- nodes: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- index: integer (nullable = true)
 |    |    |-- nodeId: long (nullable = true)



In [8]:
# Preview the first 5 ways; long cell values are truncated in the printed table.
osmWay.show(5)

+-------+--------------------+--------------------+
|     id|                tags|               nodes|
+-------+--------------------+--------------------+
|3175810|[[highway, reside...|[[0, 15231786], [...|
|3175943|[[highway, reside...|[[0, 15234255], [...|
|3175983|[[highway, reside...|[[0, 15232076], [...|
|3192356|[[highway, second...|[[0, 1259548666],...|
|3212111|[[highway, second...|[[0, 15475952], [...|
+-------+--------------------+--------------------+
only showing top 5 rows



In [9]:
# Total number of rows in the way DataFrame.
osmWay.count()

2509434

## osmNode

In [10]:
# Load the OSM nodes, keep only the tag list and the coordinates, and register
# the result as the temp view "osmNode" so it can be queried with SQL.
osmNode = (
    sqlContext.read
    .parquet("../data/20190531-hungary.osm.pbf.node.parquet")
    .select('tags', 'latitude', 'longitude')
)
osmNode.createOrReplaceTempView("osmNode")

In [11]:
# Print the column names, types and nesting of the node DataFrame.
osmNode.printSchema()

root
 |-- tags: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- key: string (nullable = true)
 |    |    |-- value: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)



In [12]:
# Preview the first 5 nodes (tag list plus latitude/longitude).
osmNode.show(5)

+----+------------------+------------------+
|tags|          latitude|         longitude|
+----+------------------+------------------+
|  []|        47.5135549|19.047102000000002|
|  []|        47.5135409|19.058013900000002|
|  []|        47.5146097|19.058831700000002|
|  []|        47.5166629|        19.0613926|
|  []|47.514618000000006|        19.0435418|
+----+------------------+------------------+
only showing top 5 rows



In [13]:
# Total number of rows in the node DataFrame.
osmNode.count()

17916477

In [14]:
# List nodes tagged amenity=taxi (taxi stands) with their coordinates.
#
# The tag array is exploded so the key/value pair is matched wherever it sits
# in the array. Indexing tags[0] would only find nodes whose *first* tag is
# amenity=taxi and silently skip any node that lists another tag before it.
# Assumes a key occurs at most once per node, so exploding cannot duplicate a
# node in the result -- TODO confirm for this extract.
sqlContext.sql("""
    SELECT tags, latitude, longitude
    FROM osmNode
    LATERAL VIEW explode(tags) tag_view AS tag
    WHERE tag.key = 'amenity' AND tag.value = 'taxi'
""").show()

+--------------------+------------------+------------------+
|                tags|          latitude|         longitude|
+--------------------+------------------+------------------+
|[[amenity, taxi],...|        47.5470381|         19.028375|
|   [[amenity, taxi]]|        47.4982064|        19.0704767|
|   [[amenity, taxi]]|47.496360800000005|19.070999500000003|
|   [[amenity, taxi]]|47.507562500000006|        19.0729668|
|   [[amenity, taxi]]|47.499501300000006|        19.0646431|
|   [[amenity, taxi]]|47.782211200000006|        19.1324766|
|   [[amenity, taxi]]|        47.5420646|        19.1225384|
|   [[amenity, taxi]]|        48.2466887|        20.6166425|
|   [[amenity, taxi]]|46.353023300000004|        17.7953511|
|   [[amenity, taxi]]|46.075580200000005|        18.2048794|
|[[amenity, taxi],...|47.511096200000004|19.080156300000002|
|   [[amenity, taxi]]|        47.9000818|        20.3767527|
|   [[amenity, taxi]]|46.370169600000004|18.149269800000003|
|   [[amenity, taxi]]|47

## osmRelation

In [15]:
# Load the OSM relations, keep only the id, tags and member list, and register
# the result as the temp view "osmRelation" so it can be queried with SQL.
osmRelation = (
    sqlContext.read
    .parquet("../data/20190531-hungary.osm.pbf.relation.parquet")
    .select('id', 'tags', 'members')
)
osmRelation.createOrReplaceTempView("osmRelation")

In [17]:
# Print the column names, types and nesting of the relation DataFrame.
osmRelation.printSchema()

root
 |-- id: long (nullable = true)
 |-- tags: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- key: string (nullable = true)
 |    |    |-- value: string (nullable = true)
 |-- members: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- role: string (nullable = true)
 |    |    |-- type: string (nullable = true)



In [18]:
# Preview the first 5 relations; long cell values are truncated in the printed table.
osmRelation.show(5)

+-----+--------------------+--------------------+
|   id|                tags|             members|
+-----+--------------------+--------------------+
|11772|[[addr:city, Buda...|[[24026306, inner...|
|11832|[[building, yes],...|[[24035233, outer...|
|11898|[[building, offic...|[[24345565, inner...|
|12697|[[addr:city, Buda...|[[24260435, outer...|
|12939|[[addr:conscripti...|[[24320909, outer...|
+-----+--------------------+--------------------+
only showing top 5 rows



In [19]:
# Total number of rows in the relation DataFrame.
osmRelation.count()

12983