In [1]:
import math
import os
import subprocess

import gdown
import pyspark
from pyspark.sql import SparkSession

In [4]:
# Location of the demo archive on Google Drive and where it is stored locally.
source_url = 'https://drive.google.com/uc?id=1vcb_HBWsOSKW4XxhLfRpGlLzBLwHlGWJ'
output_path = 'source_data/multi_source_data'
output_file = os.path.join(output_path, 'multi_source_demo.zip')

# Make sure the target directory exists so the download also works on a fresh
# checkout; exist_ok=True keeps the cell safe to re-run.
os.makedirs(output_path, exist_ok=True)

# Last expression of the cell: the path of the downloaded file is displayed.
gdown.download(source_url, output_file, quiet=False)

Downloading...
From (original): https://drive.google.com/uc?id=1vcb_HBWsOSKW4XxhLfRpGlLzBLwHlGWJ
From (redirected): https://drive.google.com/uc?id=1vcb_HBWsOSKW4XxhLfRpGlLzBLwHlGWJ&confirm=t&uuid=0ec89e14-43e6-41f6-bfeb-271900c9f504
To: /home/datatalks_jan/Data_Eden/8_pySpark_pilot/source_data/multi_source_data/multi_source_demo.zip
100%|█████████████████████████████████████████████████████████████| 154M/154M [00:00<00:00, 187MB/s]


'source_data/multi_source_data/multi_source_demo.zip'

In [5]:
# Extract the archive into output_path, overwriting earlier extractions (-o).
# Passing an argument list (no shell) keeps paths containing spaces or shell
# metacharacters intact, and check=True raises CalledProcessError on a failed
# extraction instead of silently returning a non-zero status.
subprocess.run(['unzip', '-o', output_file, '-d', output_path], check=True).returncode

Archive:  source_data/multi_source_data/multi_source_demo.zip
  inflating: source_data/multi_source_data/distribution_centers.csv  
  inflating: source_data/multi_source_data/events.csv  
  inflating: source_data/multi_source_data/inventory_items.csv  
  inflating: source_data/multi_source_data/order_items.csv  
  inflating: source_data/multi_source_data/orders.csv  
  inflating: source_data/multi_source_data/products.csv  
  inflating: source_data/multi_source_data/readme.txt  
  inflating: source_data/multi_source_data/users.csv  


0

In [6]:
# Start a local Spark session, or reuse the one that is already running.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('conrad_test')
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/03 10:50:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [7]:
# Load the two CSV sources. header=True takes column names from the first
# line; inferSchema=True lets Spark detect numeric columns instead of reading
# every column as a string, at the cost of one extra pass over each file.
df_1_users = spark.read.csv(
    f'{output_path}/users.csv', header=True, inferSchema=True
)
df_1_distri_centers = spark.read.csv(
    f'{output_path}/distribution_centers.csv', header=True, inferSchema=True
)

                                                                                

In [8]:
from pyspark.sql.functions import col, acos, cos, sin, radians, atan2, sqrt

In [13]:
# Mean Earth radius in kilometres used by the haversine formula.
EARTH_RADIUS_KM = 6371.0


def haversine_distance(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    The function is written in plain Python (``math``) because it is
    registered as a Spark Python UDF: inside a UDF it is called once per row
    with ordinary Python values, where the Column-building helpers from
    ``pyspark.sql.functions`` cannot be used.

    Args:
        lat1, lon1: Latitude and longitude of the first point, in degrees.
        lat2, lon2: Latitude and longitude of the second point, in degrees.
            Each value may be a number or a numeric string (as produced by a
            CSV read without a schema).

    Returns:
        The distance in kilometres as a float, or ``None`` when any
        coordinate is missing or cannot be parsed as a number.
    """
    try:
        lat1, lon1, lat2, lon2 = (
            math.radians(float(value)) for value in (lat1, lon1, lat2, lon2)
        )
    except (TypeError, ValueError):
        # None (NULL in SQL) or a non-numeric string: no distance available.
        return None

    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = (
        math.sin(dlat / 2) ** 2
        + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2
    )
    # Clamp against floating-point drift so sqrt(1 - a) never sees a negative.
    a = min(1.0, max(0.0, a))
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return EARTH_RADIUS_KM * c

In [14]:
# Expose the function to Spark SQL. spark.udf.register defaults to a string
# return type, which would make "ORDER BY ... distance" compare distances as
# text, so the result is declared as a double.
spark.udf.register("haversine_distance", haversine_distance, "double") # for spark SQL

<function __main__.haversine_distance(lat1, lon1, lat2, lon2)>

In [15]:
# Publish both DataFrames as temporary views so they can be queried by name
# from Spark SQL.
for view_name, frame in (
    ("users", df_1_users),
    ("distribution_centers", df_1_distri_centers),
):
    frame.createOrReplaceTempView(view_name)

In [16]:
# Pair every user with every distribution center (CROSS JOIN) and compute the
# distance between them with the registered haversine_distance UDF. Rows are
# ordered by user id and then by distance.
df_1_sql_query = """
    SELECT u.id, u.age, u.country, u.state,
           dc.id AS distribution_center_id,
           haversine_distance(u.latitude, u.longitude, dc.latitude, dc.longitude) AS distance
    FROM users u
    CROSS JOIN distribution_centers dc
    ORDER BY u.id, distance
"""

In [19]:
# Build the DataFrame for the query; Spark evaluates lazily, so nothing is
# computed until an action (e.g. show/collect/write) is called on it.
# NOTE(review): the query returns every user/center pair, not only the closest
# center per user - confirm whether a per-user filter is intended.
nearest_centers_df = spark.sql(df_1_sql_query)