In [3]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as f

In [4]:
# Single-machine Spark session: "local[*]" runs Spark in-process with one
# worker thread per available core. Memory limits are set explicitly.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "1g")
    .getOrCreate()
)

In [8]:
from typing import Optional

import s2sphere
from pyspark.sql.types import StringType, FloatType

def cell_id(level: int, lat: Optional[float], lng: Optional[float]) -> Optional[str]:
    """Return the token of the S2 cell at ``level`` that contains a point.

    Args:
        level: S2 level handed to ``CellId.parent``.
        lat: Latitude in degrees, or ``None`` for a missing value.
        lng: Longitude in degrees, or ``None`` for a missing value.

    Returns:
        The token string of the enclosing cell, or ``None`` when either
        coordinate is ``None``.
    """
    # Spark passes SQL NULLs to Python UDFs as None (e.g. a coordinate that
    # could not be cast to double). Propagate the null instead of letting
    # s2sphere raise and abort the whole job.
    if lat is None or lng is None:
        return None
    point = s2sphere.LatLng.from_degrees(lat, lng)
    return s2sphere.CellId.from_lat_lng(point).parent(level).to_token()

# Spark UDF wrapper around cell_id; the resulting column holds string tokens.
cell_id_udf = f.udf(cell_id, returnType=StringType())

def sphere_distance(token_from: Optional[str], token_to: Optional[str]) -> Optional[float]:
    """Return the great-circle distance between two S2 cells given by token.

    The distance is measured between the points returned by
    ``CellId.to_lat_lng()`` for each cell (one representative point per
    cell), not between the original trip coordinates.

    Args:
        token_from: Token of the origin cell, or ``None``.
        token_to: Token of the destination cell, or ``None``.

    Returns:
        The angular distance in radians multiplied by the sphere radius,
        or ``None`` when either token is ``None``.
    """
    # A null cell token arrives here as None; return a null distance rather
    # than failing inside CellId.from_token.
    if token_from is None or token_to is None:
        return None
    # Sphere radius; the result is in the same unit as this value.
    # NOTE(review): presumably an approximate Earth radius in kilometres - confirm.
    r = 6373.0
    cell_from = s2sphere.CellId.from_token(token_from)
    cell_to = s2sphere.CellId.from_token(token_to)
    return cell_from.to_lat_lng().get_distance(cell_to.to_lat_lng()).radians * r

# Spark UDF wrapper around sphere_distance; FloatType makes the resulting
# column a 32-bit float.
sphere_distance_udf = f.udf(sphere_distance, returnType=FloatType())

# S2 level used for both pickup and dropoff cells.
S2_LEVEL = 18


def _load_trips(path):
    """Read a trips CSV and attach pickup/dropoff S2 cell tokens.

    The CSV is read with a header row. The four coordinate columns are cast
    to double and renamed to ``pickup_lon``, ``pickup_lat``, ``dropoff_lon``
    and ``dropoff_lat``. Two columns, ``pickup_cell`` and ``dropoff_cell``,
    are then appended using ``cell_id_udf`` at ``S2_LEVEL``.

    Args:
        path: Location of the CSV file, passed to ``spark.read.csv``.

    Returns:
        A DataFrame with columns, in this order: pickup_datetime, pickup_lon,
        pickup_lat, dropoff_lon, dropoff_lat, pickup_cell, dropoff_cell.
    """
    return (
        spark.read.option('header', 'true').csv(path)
        # Column order is kept stable because the frames are later combined
        # with a positional union.
        .select(
            'pickup_datetime',
            f.col('pickup_longitude').cast('double').alias('pickup_lon'),
            f.col('pickup_latitude').cast('double').alias('pickup_lat'),
            f.col('dropoff_longitude').cast('double').alias('dropoff_lon'),
            f.col('dropoff_latitude').cast('double').alias('dropoff_lat'),
        )
        .withColumn('pickup_cell', cell_id_udf(f.lit(S2_LEVEL), f.col('pickup_lat'), f.col('pickup_lon')))
        .withColumn('dropoff_cell', cell_id_udf(f.lit(S2_LEVEL), f.col('dropoff_lat'), f.col('dropoff_lon')))
    )


# Both files go through the identical pipeline so their schemas line up.
train_df = _load_trips('../data/raw/train.csv')
test_df = _load_trips('../data/raw/test.csv')

# Distance matrix: every distinct (pickup_cell, dropoff_cell) pair seen in
# either dataset, with the distance between the two cells.
df = (
    train_df.union(test_df)
    .select('pickup_cell', 'dropoff_cell')
    # Deduplicate before applying the UDF so each pair is computed once.
    .dropDuplicates()
    .withColumn(
        'distance',
        sphere_distance_udf(f.col('pickup_cell'), f.col('dropoff_cell')),
    )
)

#df.write.parquet('../data/processed/distance_matrix', mode='overwrite')


In [9]:
# Sanity check of the distance matrix: a few sample rows, the schema, and
# the number of distinct cell pairs (the cell's displayed value).
df.show(n=3)
df.printSchema()
df.count()

+-----------+------------+---------+
|pickup_cell|dropoff_cell| distance|
+-----------+------------+---------+
| 89c259ae67|  89c258ec75|3.0498326|
| 89c259bd67|  89c258596f| 3.545008|
| 89c2599283|  89c2585891|4.7375374|
+-----------+------------+---------+
only showing top 3 rows

root
 |-- pickup_cell: string (nullable = true)
 |-- dropoff_cell: string (nullable = true)
 |-- distance: float (nullable = true)



2000986