In [65]:
import pyspark
from pyspark.sql import SparkSession
import gdown
import os
from pyspark.sql.functions import *

In [4]:
source_url = 'https://drive.google.com/uc?id=1vcb_HBWsOSKW4XxhLfRpGlLzBLwHlGWJ'
output_path = 'source_data/multi_source_data'
output_file = os.path.join(output_path, 'multi_source_demo.zip') 

gdown.download(source_url, output_file, quiet=False)

Downloading...
From (original): https://drive.google.com/uc?id=1vcb_HBWsOSKW4XxhLfRpGlLzBLwHlGWJ
From (redirected): https://drive.google.com/uc?id=1vcb_HBWsOSKW4XxhLfRpGlLzBLwHlGWJ&confirm=t&uuid=089e7409-dbfd-43d6-ae68-bdcdc16b18c5
To: /home/datatalks_jan/Data_Eden/8_pySpark_pilot/source_data/multi_source_data/multi_source_demo.zip
100%|█████████████████████████████████████████████████████████████| 154M/154M [00:00<00:00, 229MB/s]


'source_data/multi_source_data/multi_source_demo.zip'

In [5]:
os.system(f'unzip -o {output_file} -d {output_path}')

Archive:  source_data/multi_source_data/multi_source_demo.zip
  inflating: source_data/multi_source_data/distribution_centers.csv  
  inflating: source_data/multi_source_data/events.csv  
  inflating: source_data/multi_source_data/inventory_items.csv  
  inflating: source_data/multi_source_data/order_items.csv  
  inflating: source_data/multi_source_data/orders.csv  
  inflating: source_data/multi_source_data/products.csv  
  inflating: source_data/multi_source_data/readme.txt  
  inflating: source_data/multi_source_data/users.csv  


0

In [6]:
spark = SparkSession.builder.master("local[*]").appName('conrad_test').getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/03 10:57:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [50]:
df_1_users = spark.read.option("header", "true").csv(f'{output_path}/users.csv')
df_1_distri_centers = spark.read.option("header", "true").csv(f'{output_path}/distribution_centers.csv')

                                                                                

In [52]:
df_1_users = df_1_users.withColumn("latitude", df_1_users["latitude"].cast("float"))
df_1_users = df_1_users.withColumn("longitude", df_1_users["longitude"].cast("float"))
df_1_distri_centers = df_1_distri_centers.withColumn("latitude", df_1_distri_centers["latitude"].cast("float"))
df_1_distri_centers = df_1_distri_centers.withColumn("longitude", df_1_distri_centers["longitude"].cast("float"))

In [67]:
df_1_distri_centers.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- latitude: float (nullable = true)
 |-- longitude: float (nullable = true)



In [110]:
from pyspark.sql.functions import col, acos, cos, sin, radians, atan2, sqrt

In [111]:
def haversine_distance(lat1, lon1, lat2, lon2):
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    distance = 6371 * c 
    return distance

In [135]:
df_1_users.createOrReplaceTempView("users")
df_1_distri_centers.createOrReplaceTempView("distribution_centers")

In [136]:
df_1_sql_query = """
    SELECT u.id as user_id, u.age as user_age, u.country, u.state,
           dc.id AS distribution_center_id,dc.name AS distribution_center_name,
           u.latitude as user_latitude, u.longitude as user_longitude, dc.latitude as center_latitude, dc.longitude as center_longitude
    FROM users u
    CROSS JOIN distribution_centers dc
    ORDER BY u.id
"""

In [146]:
df_1_with_distance = spark.sql(df_1_sql_query).withColumn("distance", haversine_distance(col("user_latitude"), col("user_longitude"), col("center_latitude"), col("center_longitude")))
df_1_with_distance.show()



+-------+--------+-------+---------+----------------------+------------------------+-------------+--------------+---------------+----------------+------------------+
|user_id|user_age|country|    state|distribution_center_id|distribution_center_name|user_latitude|user_longitude|center_latitude|center_longitude|          distance|
+-------+--------+-------+---------+----------------------+------------------------+-------------+--------------+---------------+----------------+------------------+
|      1|      70|  China|   Shanxi|                     1|              Memphis TN|    36.147415|      113.1227|        35.1174|        -89.9711|11737.842558916114|
|      1|      70|  China|   Shanxi|                     2|              Chicago IL|    36.147415|      113.1227|        41.8369|        -87.6847|11089.054948665944|
|      1|      70|  China|   Shanxi|                     3|              Houston TX|    36.147415|      113.1227|        29.7604|        -95.3698|12104.974024220499|
|   

                                                                                

In [150]:
from pyspark.sql.window import Window
windowSpec = Window.partitionBy("user_id").orderBy("distance")
df_with_row_number = df_1_with_distance.withColumn("row_number", row_number().over(windowSpec))
df_1_final = df_with_row_number.filter(col("row_number") == 1)
df_1 = df_1_final.select("user_id","user_age","country","state","distribution_center_id","distribution_center_name","distance").orderBy("user_id")
df_1.show()
df_1.printSchema()

[Stage 108:>                                                        (0 + 2) / 2]

+-------+--------+--------------+--------------------+----------------------+------------------------+------------------+
|user_id|user_age|       country|               state|distribution_center_id|distribution_center_name|          distance|
+-------+--------+--------------+--------------------+----------------------+------------------------+------------------+
|      1|      70|         China|              Shanxi|                     4|          Los Angeles CA|10564.978611447585|
|     10|      18|         China|           Guangdong|                     4|          Los Angeles CA|11683.981878339073|
|    100|      32|         China|           Guangdong|                     4|          Los Angeles CA|11433.559166566682|
|   1000|      49|         China|           Guangdong|                     4|          Los Angeles CA|11628.109441907302|
|  10000|      38|         China|           Guangdong|                     4|          Los Angeles CA|11375.162419660232|
| 100000|      34|      

[Stage 111:>                                                        (0 + 2) / 2]                                                                                