In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- \ | done
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l- \ | / - \ | / - \ | / - done
[?25h  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317146 sha256=50b2ff7fb903492d06338db595fc43c1ea428212bb015624e7f3ef9e3a170f90
  Stored in directory: /root/.cache/pip/wheels/7b/1b/4b/3363a1d04368e7ff0d408e57ff57966fcdf00583774e761327
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0
[0m

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext

# Start (or reuse) the Spark session used by the rest of the notebook.
# An explicit SparkConf / SparkContext is not set up here; the session
# builder alone is enough for the cells below.
spark = (
    SparkSession.builder
    .appName("Geohash")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/05/16 10:57:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
from pyspark.sql.functions import concat, lit 

In [5]:
# Two sample points as (ID, latitude, longitude) rows.
sample_points = [
    (1, 40.7128, -74.0060),
    (2, 37.7749, -122.4194),
]
column_names = ['ID', 'Lat', 'Long']
df = spark.createDataFrame(sample_points, column_names)

In [6]:
# Concatenate the Lat and Long columns into a single "(Lat, Long)" string column
geom_expr = concat(lit("("), df["Lat"], lit(", "), df["Long"], lit(")"))
df = df.withColumn("Geom", geom_expr)

In [7]:
# Print the DataFrame to check the newly added Geom column
df.show()

                                                                                

+---+-------+---------+--------------------+
| ID|    Lat|     Long|                Geom|
+---+-------+---------+--------------------+
|  1|40.7128|  -74.006|  (40.7128, -74.006)|
|  2|37.7749|-122.4194|(37.7749, -122.4194)|
+---+-------+---------+--------------------+



In [8]:
!pip install pygeohash

Collecting pygeohash
  Downloading pygeohash-1.2.0.tar.gz (5.0 kB)
  Preparing metadata (setup.py) ... [?25l- \ done
[?25hBuilding wheels for collected packages: pygeohash
  Building wheel for pygeohash (setup.py) ... [?25l- \ | done
[?25h  Created wheel for pygeohash: filename=pygeohash-1.2.0-py2.py3-none-any.whl size=6168 sha256=6fbafdda7ed035f06cf9894c80b4c299bf2e3e31c7276e0e9b9c049bb308df33
  Stored in directory: /root/.cache/pip/wheels/28/ec/b6/beadf7295a623f528507691fb0d471b50d064ae9bbad420b8f
Successfully built pygeohash
Installing collected packages: pygeohash
Successfully installed pygeohash-1.2.0
[0m

# ENCODE GEOHASH

In [9]:
import pygeohash as pgh
import pyspark.sql.functions as F 

GEOHASH_PRECISION = 7  # number of characters in the generated geohash


def _encode_geohash(lat, lon):
    """Encode a latitude/longitude pair as a geohash string.

    Returns None when either coordinate is null so that a missing value
    produces a null geohash instead of raising inside the Spark worker.
    """
    if lat is None or lon is None:
        return None
    return pgh.encode(lat, lon, precision=GEOHASH_PRECISION)


# The return type is spelled out ("string" is also the default for F.udf)
# so the schema of the new column is explicit.
geohash_udf = F.udf(_encode_geohash, "string")

# Keep the existing columns and append the geohash of each (Lat, Long) pair.
df = df.select(
    "ID",
    "Geom",
    "Lat",
    "Long",
    geohash_udf("Lat", "Long").alias("encoded_val7"),
)

In [10]:
# Print the DataFrame, now including the encoded_val7 geohash column
df.show()

                                                                                

+---+--------------------+-------+---------+------------+
| ID|                Geom|    Lat|     Long|encoded_val7|
+---+--------------------+-------+---------+------------+
|  1|  (40.7128, -74.006)|40.7128|  -74.006|     dr5regw|
|  2|(37.7749, -122.4194)|37.7749|-122.4194|     9q8yyk8|
+---+--------------------+-------+---------+------------+



                                                                                

# DECODE GEOHASH

In [11]:
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType,ArrayType
import pygeohash as pgh 

In [12]:
def _decode_geohash(geohash):
    """Decode a geohash string into a [latitude, longitude] list.

    Returns None for a null geohash so that a missing value produces a
    null array instead of raising inside the Spark worker.
    """
    if geohash is None:
        return None
    # Convert the decoded pair to a plain list to match the ArrayType schema.
    return list(pgh.decode(geohash))


# NOTE(review): FloatType is 32-bit; consider DoubleType if downstream code
# needs more coordinate precision -- kept as-is to preserve the column schema.
udf2 = F.udf(_decode_geohash, ArrayType(FloatType()))

In [13]:
# Decode each geohash back to coordinates and append the result as decodedVal.
decoded_col = udf2('encoded_val7').alias('decodedVal')
df_new = df.select(
    'ID',
    'Geom',
    'Lat',
    'Long',
    'encoded_val7',
    decoded_col,
)

In [14]:
# Print df_new to compare each geohash with its decoded value
df_new.show()

+---+--------------------+-------+---------+------------+----------------+
| ID|                Geom|    Lat|     Long|encoded_val7|      decodedVal|
+---+--------------------+-------+---------+------------+----------------+
|  1|  (40.7128, -74.006)|40.7128|  -74.006|     dr5regw| [40.71, -74.01]|
|  2|(37.7749, -122.4194)|37.7749|-122.4194|     9q8yyk8|[37.77, -122.42]|
+---+--------------------+-------+---------+------------+----------------+



In [15]:
# List the column names of df_new
df_new.columns

['ID', 'Geom', 'Lat', 'Long', 'encoded_val7', 'decodedVal']

- The decoded value is an array (`[latitude, longitude]`), so it needs to be split into separate latitude and longitude columns

In [16]:
from pyspark.sql.functions import split
from pyspark.sql.functions import col
# Pull the two elements of decodedVal out into their own float columns:
# index 0 becomes latitude, index 1 becomes longitude.
decoded = col("decodedVal")
df_new = (
    df_new
    .withColumn("latitude", decoded.getItem(0).cast("float"))
    .withColumn("longitude", decoded.getItem(1).cast("float"))
)


In [17]:
# Print df_new with the separate latitude and longitude columns
df_new.show()

+---+--------------------+-------+---------+------------+----------------+--------+---------+
| ID|                Geom|    Lat|     Long|encoded_val7|      decodedVal|latitude|longitude|
+---+--------------------+-------+---------+------------+----------------+--------+---------+
|  1|  (40.7128, -74.006)|40.7128|  -74.006|     dr5regw| [40.71, -74.01]|   40.71|   -74.01|
|  2|(37.7749, -122.4194)|37.7749|-122.4194|     9q8yyk8|[37.77, -122.42]|   37.77|  -122.42|
+---+--------------------+-------+---------+------------+----------------+--------+---------+

