In [0]:
import os
# Extra spark-submit arguments: pull in the Elasticsearch-Hadoop connector
# used by read_elastic / write_to_elastic.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.elasticsearch:elasticsearch-hadoop:7.4.2 pyspark-shell'

# Elasticsearch host. Override with the ES_HOST environment variable, e.g.
# ES_HOST='da2019w-1019.eastus.cloudapp.azure.com'; defaults to the address
# that was previously hard-coded here.
IP = os.environ.get('ES_HOST', '10.0.0.25')

# The Google geocoding key that used to sit here in a comment was removed:
# credentials do not belong in the notebook. Supply it through the environment
# if the Google geocoder is ever re-enabled.

# HERE reverse-geocoding key. Prefer the HERE_API_KEY environment variable.
# TODO(review): the literal fallback keeps existing runs working but is a
# committed secret - rotate it and drop the default.
HERE_API_KEY = os.environ.get('HERE_API_KEY', 'zIvimm4hN9bfwzhEz-6BNn8tZyhxWU2762k0am6fIq4')

from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml.feature import VectorAssembler
from elasticsearch import Elasticsearch, helpers
import requests


# Build (or reuse) the Spark session shared by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.executor.memory", "2g")
    .config("spark.driver.memory", "1g")
    .getOrCreate()
)

# Use GMT as the Spark SQL session time zone.
spark.conf.set("spark.sql.session.timeZone", "GMT")

# Elasticsearch client used for index management (exists / delete / create).
es = Elasticsearch([{'host': IP}])

## Helper Functions:

In [2]:

def read_elastic(index, query="", scroll_size="10000", array_field=""):
    """Load an Elasticsearch index as a Spark DataFrame.

    Args:
        index: Name of the index to read.
        query: Value passed through as the connector's ``es.query`` option.
        scroll_size: Value passed through as ``es.scroll.size``.
        array_field: Value passed through as
            ``es.read.field.as.array.include`` (fields to read as arrays).

    Returns:
        The DataFrame produced by the ``org.elasticsearch.spark.sql`` reader.

    Raises:
        Exception: If the index does not exist.
    """
    # Keyword form matches the other es.indices calls in this notebook and is
    # the only form accepted by newer elasticsearch-py clients.
    if not es.indices.exists(index=index):
        raise Exception("Index doesn't exist!")

    return (
        spark.read
        .format("org.elasticsearch.spark.sql")
        .option("es.nodes.wan.only", "true")
        .option("es.port", "9200")
        .option("es.nodes", IP)
        .option("es.nodes.client.only", "false")
        .option("pushdown", "true")
        .option("es.query", query)
        .option("es.scroll.size", scroll_size)
        .option("es.scroll.keepalive", "120m")
        .option("es.read.field.as.array.include", array_field)
        .load(index)
    )

        
# Default index definition (settings + field mappings) used by
# write_to_elastic when the caller does not pass its own. The constant's
# spelling is kept as-is because write_to_elastic refers to it by this name.
DEFUALT_SCEHMA = {
    "settings": dict(number_of_shards=1, number_of_replicas=0),
    "mappings": {
        "properties": {
            # Every field is {"type": <es_type>}; only "timestamp" also
            # carries an explicit date format.
            field: (
                {"type": es_type, "format": "epoch_millis"}
                if field == "timestamp"
                else {"type": es_type}
            )
            for field, es_type in (
                ("actualDelay", "long"),
                ("areaId", "long"),
                ("areaId1", "long"),
                ("areaId2", "long"),
                ("areaId3", "long"),
                ("atStop", "boolean"),
                ("busStop", "long"),
                ("congestion", "boolean"),
                ("gridID", "keyword"),
                ("journeyPatternId", "keyword"),
                ("lineId", "keyword"),
                ("coordinates", "geo_point"),
                ("timestamp", "date"),
                ("vehicleId", "long"),
                ("dateTime", "date"),
            )
        }
    },
}

def write_to_elastic(df, index: str, settings=DEFUALT_SCEHMA, append=True):
    """Write a Spark DataFrame into an Elasticsearch index.

    Args:
        df: DataFrame to store.
        index: Target index name.
        settings: Index body (settings + mappings) used when creating the
            index. It is only read here, never mutated.
        append: When False an existing index is deleted first, so the write
            starts from an empty index. When True rows are added to whatever
            the index already holds.
    """
    if not append and es.indices.exists(index=index):
        es.indices.delete(index=index)

    # ignore=400 keeps this call from raising when the index already exists
    # (the append case). NOTE(review): it also hides other 400 responses such
    # as an invalid mapping body.
    es.indices.create(index=index, ignore=400, body=settings)

    # Without an explicit mode Spark uses "error if exists", which the
    # connector rejects once the index contains documents - so append=True
    # could never actually append. "append" is also correct for the
    # append=False path because the index was just recreated empty.
    (
        df.write
        .format("org.elasticsearch.spark.sql")
        .option("es.resource", index)
        .option("es.nodes.wan.only", "true")
        .option("es.port", "9200")
        .option("es.nodes", IP)
        .option("es.nodes.client.only", "false")
        .mode("append")
        .save()
    )



def calculate_centroids(df):
    """Average the coordinates of every bus stop into a single centroid.

    Groups ``df`` by ``busStop`` and takes the mean of ``coordinates[0]`` and
    ``coordinates[1]`` separately. The result has one row per stop with the
    two means packed back into a ``coordinates`` array, in the same order.
    """
    mean_first = F.mean(df.coordinates[0]).alias('centroid_longitude')
    mean_second = F.mean(df.coordinates[1]).alias('centroid_latitude')
    per_stop = df.groupBy('busStop').agg(mean_first, mean_second)

    # Re-pack the two means into one array column and drop the temporaries.
    packed = per_stop.withColumn("coordinates", F.array('centroid_longitude', 'centroid_latitude'))
    return packed.drop('centroid_longitude', 'centroid_latitude')

from math import radians, cos, sin, asin, sqrt

@F.udf("float")
def get_distance(coord_a, coord_b):
    """Haversine distance between two ``[longitude, latitude]`` points.

    Uses a sphere of radius 6371 (Earth's mean radius in kilometres), so the
    result is in kilometres, rounded to 4 decimals.

    Returns the sentinel ``9999.0`` when either point, or any of its
    components, is missing.
    """
    # The sentinel must be a Python float: a value whose Python type does not
    # match the declared "float" return type comes back from the UDF as null.
    missing = 9999.0
    if coord_a is None or coord_b is None:
        return missing
    longit_a, latit_a = coord_a
    longit_b, latit_b = coord_b
    if None in (longit_a, latit_a, longit_b, latit_b):
        return missing

    # Transform to radians
    longit_a, latit_a, longit_b, latit_b = map(radians, (longit_a, latit_a, longit_b, latit_b))
    dist_longit = longit_b - longit_a
    dist_latit = latit_b - latit_a

    # Haversine term; clamp to 1.0 so rounding error on near-antipodal points
    # cannot push sqrt(area) outside asin's domain.
    area = sin(dist_latit / 2) ** 2 + cos(latit_a) * cos(latit_b) * sin(dist_longit / 2) ** 2
    area = min(1.0, area)

    # Central angle between the two points, scaled by the sphere radius.
    central_angle = 2 * asin(sqrt(area))
    radius = 6371
    distance = central_angle * radius
    return float(abs(round(distance, 4)))

def add_distance_to_centroid(centroid_df, stop_df, drop_centroid_col=True):
    """Attach to every stop row its distance from that stop's centroid.

    Args:
        centroid_df: DataFrame with ``busStop`` and ``coordinates`` columns.
        stop_df: DataFrame with ``busStop`` and ``coordinates`` columns.
        drop_centroid_col: When True the joined centroid coordinates
            (``c_coordinates``) are removed from the result.

    Returns:
        ``stop_df`` rows that have a matching centroid (inner join), with a
        ``distance`` column computed by ``get_distance``.
    """
    # Prefix the centroid columns so they do not clash with the stop columns.
    centroids = centroid_df.selectExpr("coordinates as c_coordinates", "busStop as c_busStop")
    same_stop = stop_df['busStop'] == centroids['c_busStop']
    matched = stop_df.join(centroids, same_stop, how='inner')

    distance_col = get_distance(matched.c_coordinates, matched.coordinates)
    with_distance = matched.withColumn('distance', distance_col).drop('c_busStop')

    return with_distance.drop('c_coordinates') if drop_centroid_col else with_distance

# Where am I?

In [3]:

def _levenshtein(left, right):
    """Edit distance between two strings (insert/delete/substitute cost 1)."""
    if len(left) < len(right):
        left, right = right, left
    previous = list(range(len(right) + 1))
    for i, ch_left in enumerate(left, start=1):
        current = [i]
        for j, ch_right in enumerate(right, start=1):
            substitution = previous[j - 1] + (0 if ch_left == ch_right else 1)
            current.append(min(previous[j] + 1, current[j - 1] + 1, substitution))
        previous = current
    return previous[-1]


@F.udf('double')
def get_text_distance(station_name, reverse_gecode):
    """Smallest normalised edit distance between a stop name and its addresses.

    Args:
        station_name: The stop's name.
        reverse_gecode: Candidate address strings for the stop's location.

    Returns:
        ``min(levenshtein / max(len(name), len(address)))`` over the
        candidates, a value in [0, 1]. Returns the sentinel ``1024.0`` when
        the name or the candidate list is missing or empty.

    The return type is ``double`` because the score is a ratio; a float
    returned from a UDF declared ``long`` does not match the declared type.
    The distance is computed in plain Python because ``F.levenshtein`` builds
    a Column expression and cannot score Python strings inside a UDF.
    """
    lev_dist = float(2**10)
    if not station_name or not reverse_gecode:
        return lev_dist

    # station_name is non-empty here, so the denominator is never zero.
    scores = [
        _levenshtein(station_name, address) / max(len(station_name), len(address))
        for address in reverse_gecode
        if address is not None
    ]
    return min(scores) if scores else lev_dist


# Street-type words and abbreviations stripped from addresses before they are
# compared with stop names.
STOPWORDS = ['avenue', 'ave', 'blvd', 'boulevard', 'box', 'cir', 'court', 'ct', 'drive', 'dr', 'lane', 'ln', 'loop', 'lp', 'pl', 'place', 'po', 'pob', 'pt', 'rd', 'road', 'route', 'rr', 'rte', 'rural', 'sq', 'st', 'ste', 'street', 'suit', 'trl', 'way', 'wy']

def extract_address(result):
    """Return the street text of one reverse-geocoding result, minus stop words.

    Reads ``result['Location']['Address']['Street']`` and falls back to the
    address ``Label`` when no street is present. Words whose lower-cased form
    (ignoring trailing dots) is in ``STOPWORDS`` are removed.

    Raises:
        KeyError: If the result has neither a usable street nor a label.
    """
    location_address = result['Location']['Address']
    try:
        address = location_address['Street']
    except (KeyError, TypeError):
        # Only a missing street triggers the fallback; unrelated errors
        # propagate instead of being swallowed by a bare except.
        address = location_address['Label']
    return ' '.join(
        word for word in address.split()
        if word.lower().rstrip('.') not in STOPWORDS
    )

@F.udf(ArrayType(StringType()))
def reverse_gecode(coords):
    """Reverse-geocode a ``[longitude, latitude]`` pair with the HERE API.

    Returns:
        A sorted list of the distinct cleaned street strings (see
        ``extract_address``) found near the point; an empty list when the
        coordinates are missing or the service returns no view.
    """
    if coords is None:
        return []
    lng, lat = coords
    params = {'prox' : f"{lat}, {lng}, 5", 'mode' : 'retrieveAddresses', 'apiKey' : HERE_API_KEY}
    # A timeout keeps one stalled request from hanging the Spark task forever.
    response = requests.get("https://reverse.geocoder.ls.hereapi.com/6.2/reversegeocode.json",
                            params=params, timeout=30)
    views = response.json()['Response']['View']
    if not views:
        # No view at all: nothing to extract (previously an IndexError).
        return []

    # An ArrayType UDF must return a list, not a set; sorting also makes the
    # output deterministic.
    return sorted(set(map(extract_address, views[0]['Result'])))


@F.udf(BooleanType())
def is_approx_near(coords_a, coords_b, decimal=5):
    """Tell whether two coordinate pairs agree after rounding.

    Both components of each pair are rounded to ``decimal`` places; the
    points count as near only when both rounded components are equal.
    """
    (lng_a, lat_a), (lng_b, lat_b) = coords_a, coords_b

    if round(lng_a, decimal) != round(lng_b, decimal):
        return False
    return round(lat_a, decimal) == round(lat_b, decimal)

In [4]:
# Load the coordinates stored in 'agg1-coords-index' and preview the first
# five rows.
agg_stop_df = read_elastic('agg1-coords-index')
agg_stop_df.head(5)

[Row(coordinates=[-6.2623, 53.35344]),
 Row(coordinates=[-6.2623, 53.3536]),
 Row(coordinates=[-6.2623, 53.35357]),
 Row(coordinates=[-6.2623, 53.35362]),
 Row(coordinates=[-6.2623, 53.35358])]

In [5]:
# Reverse-geocode every stored coordinate (one HERE request per row) and keep
# the resulting street names next to the coordinates.
agg_stop_df = (
    read_elastic('agg1-coords-index', scroll_size="100")
    .withColumn('reverse_gecode', reverse_gecode(F.col('coordinates')))
)

# Index body for the geocoded coordinates.
settings = {
    "settings": dict(number_of_shards=1, number_of_replicas=0),
    "mappings": {
        "properties": {
            "coordinates": {"type": "geo_point"},
            "reverse_gecode": {"type": "keyword"},
        }
    },
}

# The target index used to be "reverse-gecode-index".
write_to_elastic(agg_stop_df, 'agg1-street-index', settings=settings, append=False)

In [0]:
agg_stop_df.head(5)

In [0]:
agg_stop_df

In [0]:
# Geocoded coordinates, renamed so they do not clash with the stop coordinates.
agg_stop_df = (
    read_elastic("agg1-street-index", array_field="reverse_gecode")
    .withColumnRenamed('coordinates', 'agg_coords')
)

stop_df = read_elastic('stop-index')

# Attach the street names to every stop row whose coordinates match a geocoded
# point once both components are rounded to 5 decimals. The left join keeps
# stop rows that have no match.
reverse_gecode_df = stop_df.join(
    agg_stop_df,
    (F.round(F.element_at(stop_df.coordinates, 1), 5) == F.round(F.element_at(agg_stop_df.agg_coords, 1), 5))
    & (F.round(F.element_at(stop_df.coordinates, 2), 5) == F.round(F.element_at(agg_stop_df.agg_coords, 2), 5)),
    how='left',
).drop('agg_coords')

# Same definition as DEFUALT_SCEHMA plus the street-name field.
settings = {
    **DEFUALT_SCEHMA,
    "mappings": {
        "properties": {
            **DEFUALT_SCEHMA["mappings"]["properties"],
            "reverse_gecode": {"type": "keyword"},
        }
    },
}

write_to_elastic(reverse_gecode_df, 'reverse-gecode-index', settings=settings, append=False)

In [0]:
reverse_gecode_df = read_elastic('reverse-gecode-index', array_field="reverse_gecode")

stop_info_df = read_elastic('stop-information-index').select('stopid', 'shortname')

# Score how closely each stop's official short name matches the street names
# found at the reported position. The column is called 'lev_dist' because that
# is the name used by the 'lev-dist-index' mapping and by the later
# "lev_dist < 0.5" filter; it was previously written as 'lev_distance', which
# neither of them would find.
joined_df = (
    reverse_gecode_df
    .join(stop_info_df, reverse_gecode_df['busStop'] == stop_info_df['stopid'], how='inner')
    .withColumn('lev_dist', get_text_distance(F.col('shortname'), F.col('reverse_gecode')))
    .drop('stopid')
)



In [0]:
joined_df

In [0]:
# Same definition as DEFUALT_SCEHMA plus the stop name, the street names and
# the name-match score. lev_dist is mapped as a double: the filter applied to
# this index compares it against 0.5, so it is a fractional score and must not
# be stored as a whole number.
settings = {
    **DEFUALT_SCEHMA,
    "mappings": {
        "properties": {
            **DEFUALT_SCEHMA["mappings"]["properties"],
            "shortname": {"type": "keyword"},
            "reverse_gecode": {"type": "keyword"},
            "lev_dist": {"type": "double"},
        }
    },
}

write_to_elastic(joined_df, 'lev-dist-index', settings=settings, append=False)

## Filter by Levenshtein:

In [0]:
stop_df = read_elastic('lev-dist-index', array_field="reverse_gecode")

# Keep only the rows whose name-match score is below 0.5, then drop the score.
filter_stop = stop_df.where("lev_dist < 0.5").drop('lev_dist')

# Same definition as DEFUALT_SCEHMA plus the stop name and street names.
settings = {
    **DEFUALT_SCEHMA,
    "mappings": {
        "properties": {
            **DEFUALT_SCEHMA["mappings"]["properties"],
            "shortname": {"type": "keyword"},
            "reverse_gecode": {"type": "keyword"},
        }
    },
}

write_to_elastic(filter_stop, 'filter-lev-dist-index', settings=settings, append=False)


## Calculate Centroids:

In [0]:
filtered_df = read_elastic('filter-lev-dist-index')

# One centroid per bus stop, computed from the rows that passed the filter.
stop_points = filtered_df.select("busStop", "coordinates")
filtered_df_centroids = calculate_centroids(stop_points)

# Index body: just the stop id and its centroid.
settings = {
    "settings": dict(number_of_shards=1, number_of_replicas=0),
    "mappings": {
        "properties": {
            "busStop": {"type": "long"},
            "coordinates": {"type": "geo_point"},
        }
    },
}

write_to_elastic(filtered_df_centroids, index="filter-lev-dist-centroid-index", settings=settings, append=False)

In [0]:
stop_centroid_df = read_elastic("filter-lev-dist-centroid-index")
true_coord_df = read_elastic('true-centroid-index')

# Pair every reference position with the computed centroid of the same stop.
eval_df = (
    true_coord_df
    .withColumnRenamed('coordinates', 'true_coordinates')
    .join(stop_centroid_df, on='busStop', how='inner')
    .withColumnRenamed('coordinates', 'centroid_coordinates')
)

# Mean of the squared get_distance values over all matched stops.
mse_centroid = eval_df.agg(
    F.mean(
        F.pow(get_distance(eval_df.true_coordinates, eval_df.centroid_coordinates), 2)
    ).alias('mse')
).collect()[0]['mse']

print(f"The MSE Value for Centroid is {mse_centroid}")

In [0]:
# Manual probe of the BigDataCloud reverse-geocoding endpoint. Example URL:
# https://api.bigdatacloud.net/data/reverse-geocode-client?latitude=37.42159&longitude=-122.0837&localityLanguage=en
params = dict(latitude='53.367157', longitude='-6.255481', localityLanguage='en')
res = requests.get(
    "https://api.bigdatacloud.net/data/reverse-geocode-client",
    params=params,
).json()
res

In [11]:
	# https://reverse.geocoder.ls.hereapi.com/6.2/reversegeocode.{format}
import requests
# Manual probe of the HERE reverse geocoder for one known point. It reuses the
# notebook-wide HERE_API_KEY instead of embedding a second literal key in this
# cell.
params = {'prox' : '53.367157, -6.255481, 5', 'mode' : 'retrieveAddresses', 'apiKey' : HERE_API_KEY}
res = requests.get("https://reverse.geocoder.ls.hereapi.com/6.2/reversegeocode.json", params=params).json()['Response']['View'][0]['Result']
res

[{'Relevance': 1.0,
  'Distance': 2.1,
  'MatchLevel': 'houseNumber',
  'MatchQuality': {'Country': 1.0,
   'County': 1.0,
   'City': 1.0,
   'District': 1.0,
   'Street': [1.0],
   'HouseNumber': 1.0,
   'PostalCode': 1.0},
  'MatchType': 'interpolated',
  'Location': {'LocationId': 'NT_2.7cJESPpQprlm8DnwqrIC_xMTO',
   'LocationType': 'point',
   'DisplayPosition': {'Latitude': 53.36716, 'Longitude': -6.25545},
   'MapView': {'TopLeft': {'Latitude': 53.3682842, 'Longitude': -6.257334},
    'BottomRight': {'Latitude': 53.3660358, 'Longitude': -6.253566}},
   'Address': {'Label': '139 Drumcondra Road Lower, Dublin, County Dublin, D09 FN44, Ireland',
    'Country': 'IRL',
    'County': 'County Dublin',
    'City': 'Dublin',
    'District': 'Dublin 9',
    'Street': 'Drumcondra Road Lower',
    'HouseNumber': '139',
    'PostalCode': 'D09 FN44',
    'AdditionalData': [{'value': 'Ireland', 'key': 'CountryName'},
     {'value': 'County Dublin', 'key': 'CountyName'}]},
   'MapReference': {'R

In [15]:
res[0]['Location']['Address']

{'Label': '139 Drumcondra Road Lower, Dublin, County Dublin, D09 FN44, Ireland',
 'Country': 'IRL',
 'County': 'County Dublin',
 'City': 'Dublin',
 'District': 'Dublin 9',
 'Street': 'Drumcondra Road Lower',
 'HouseNumber': '139',
 'PostalCode': 'D09 FN44',
 'AdditionalData': [{'value': 'Ireland', 'key': 'CountryName'},
  {'value': 'County Dublin', 'key': 'CountyName'}]}

In [0]:
"st.a".rstrip('.')

In [0]:
# NOTE(review): `_` is IPython's "last output" variable, so `text` depends on
# whichever cell happened to run just before this one. Assign the intended
# string explicitly so the cell is reproducible - TODO confirm which value was
# meant.
text = _

In [0]:
# Scratch check of the stop-word filter on `text`. The word list is identical
# to STOPWORDS, so take an independent copy rather than repeating the literal.
stopwords = list(STOPWORDS)
' '.join(word for word in text.split() if word.lower().rstrip('.') not in stopwords)

In [0]:
text.lower().replace('road', '')

In [0]:
# Manual probe of the Google Geocoding API for one known point.
# NOTE(review): API_KEY is not defined anywhere in the visible notebook, so
# this cell presumably raises NameError on a fresh kernel - TODO confirm and
# supply the key explicitly.
params = {'latlng' : '53.367157, -6.255481', 'key' : API_KEY}
res = requests.get("https://maps.googleapis.com/maps/api/geocode/json", params=params).json()['results']
res

In [0]:
reverse_geocode_result = res

In [0]:
# Keep the formatted address of every result typed as a street address or a
# transit station.
filtered = [
    candidate['formatted_address']
    for candidate in reverse_geocode_result
    if any(kind in candidate['types'] for kind in ['street_address', 'transit_station'])
]

In [0]:
filtered

In [0]:
# Insert one sample coordinate document.
# doc_type is no longer passed: the 7.x client already defaults to '_doc' and
# later clients dropped the argument.
# NOTE(review): this writes to 'agg-coords-index', while the cells above read
# 'agg1-coords-index' - confirm the target index is intended.
es.index(
    index='agg-coords-index',
    body={"coordinates": [-6.255481, 53.367157]},
)