## Data Setup and Loading
First, let's load taxi trip data and examine its structure.<br />
Source: Developing Applications with Apache Sparkâ„¢ - self-paced training Databricks

In [0]:
from pyspark.sql.functions import *

trips_df = spark.read.table("samples.nyctaxi.trips")
display(trips_df.limit(10))


tpep_pickup_datetime,tpep_dropoff_datetime,trip_distance,fare_amount,pickup_zip,dropoff_zip
2016-02-13T21:47:53.000Z,2016-02-13T21:57:15.000Z,1.4,8.0,10103,10110
2016-02-13T18:29:09.000Z,2016-02-13T18:37:23.000Z,1.31,7.5,10023,10023
2016-02-06T19:40:58.000Z,2016-02-06T19:52:32.000Z,1.8,9.5,10001,10018
2016-02-12T19:06:43.000Z,2016-02-12T19:20:54.000Z,2.3,11.5,10044,10111
2016-02-23T10:27:56.000Z,2016-02-23T10:58:33.000Z,2.6,18.5,10199,10022
2016-02-13T00:41:43.000Z,2016-02-13T00:46:52.000Z,1.4,6.5,10023,10069
2016-02-18T23:49:53.000Z,2016-02-19T00:12:53.000Z,10.4,31.0,11371,10003
2016-02-18T20:21:45.000Z,2016-02-18T20:38:23.000Z,10.15,28.5,11371,11201
2016-02-03T10:47:50.000Z,2016-02-03T11:07:06.000Z,3.27,15.0,10014,10023
2016-02-19T01:26:39.000Z,2016-02-19T01:40:01.000Z,4.42,15.0,10003,11222


In [0]:
#count trips by pickup location
location_counts = trips_df\
    .groupBy("pickup_zip")\
    .count()\
    .orderBy(desc("count"))

display(location_counts.limit(10))

pickup_zip,count
10001,1227
10003,1181
10011,1129
10021,1021
10018,1012
10023,1008
10028,929
10012,834
10110,763
10065,702


In [0]:
# Perfom multiple aggregations by location, order by most popular pickup location

locations_stats = trips_df\
    .groupBy("pickup_zip")\
    .agg(
        count("*").alias("total_trips"), 
        round(avg("trip_distance")).alias("avg_distance"), 
        round(avg("fare_amount")).alias("avg_fare"), 
        round(sum("fare_amount")).alias("total_fare")
        )\
    .orderBy(desc("total_trips"))

display(location_counts.limit(10))

pickup_zip,count
10001,1227
10003,1181
10011,1129
10021,1021
10018,1012
10023,1008
10028,929
10012,834
10110,763
10065,702


In [0]:
from pyspark.sql.window import Window

# Create Window Specs for different ranking methods
window_by_trips = Window.partitionBy("pickup_zip").orderBy(desc("total_trips"))
window_by_fare = Window.partitionBy("pickup_zip").orderBy(desc("avg_fare"))

# Add different types of rankings
ranked_locations = locations_stats\
    .withColumn("rank_trips", rank().over(window_by_trips))\
    .withColumn("rank_fare", rank().over(window_by_fare))\
    .withColumn("fare_quintile", ntile(5).over(window_by_fare))

display(ranked_locations.limit(10))


pickup_zip,total_trips,avg_distance,avg_fare,total_fare,rank_trips,rank_fare,fare_quintile
7002,2,2.0,11.0,21.0,1,1,1
7030,1,6.0,40.0,40.0,1,1,1
7086,1,0.0,40.0,40.0,1,1,1
7087,1,9.0,31.0,31.0,1,1,1
7114,1,0.0,105.0,105.0,1,1,1
7310,1,0.0,105.0,105.0,1,1,1
7311,1,2.0,60.0,60.0,1,1,1
7718,1,1.0,5.0,5.0,1,1,1
7737,1,3.0,16.0,16.0,1,1,1
7974,1,0.0,188.0,188.0,1,1,1


In [0]:
# Display the results

display(ranked_locations.select("pickup_zip",\
    "total_trips",  "avg_fare", "avg_distance", \
        "rank_trips", "rank_fare", "fare_quintile"
    ).limit(100)
)

pickup_zip,total_trips,avg_fare,avg_distance,rank_trips,rank_fare,fare_quintile
7002,2,11.0,2.0,1,1,1
7030,1,40.0,6.0,1,1,1
7086,1,40.0,0.0,1,1,1
7087,1,31.0,9.0,1,1,1
7114,1,105.0,0.0,1,1,1
7310,1,105.0,0.0,1,1,1
7311,1,60.0,2.0,1,1,1
7718,1,5.0,1.0,1,1,1
7737,1,16.0,3.0,1,1,1
7974,1,188.0,0.0,1,1,1


## Relational Operations
First, it will load same sample of retail data tables

In [0]:
transactions_df = spark.read.table("samples.bakehouse.sales_transactions")
customers_df = spark.read.table("samples.bakehouse.sales_customers")
franchises_df = spark.read.table("samples.bakehouse.sales_franchises")
supplies_df = spark.read.table("samples.bakehouse.sales_suppliers")


In [0]:
transactions_df.printSchema()

root
 |-- transactionID: long (nullable = true)
 |-- customerID: long (nullable = true)
 |-- franchiseID: long (nullable = true)
 |-- dateTime: timestamp (nullable = true)
 |-- product: string (nullable = true)
 |-- quantity: long (nullable = true)
 |-- unitPrice: long (nullable = true)
 |-- totalPrice: long (nullable = true)
 |-- paymentMethod: string (nullable = true)
 |-- cardNumber: long (nullable = true)



In [0]:
customers_df.printSchema()

root
 |-- customerID: long (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- email_address: string (nullable = true)
 |-- phone_number: string (nullable = true)
 |-- address: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- country: string (nullable = true)
 |-- continent: string (nullable = true)
 |-- postal_zip_code: long (nullable = true)
 |-- gender: string (nullable = true)



In [0]:
franchises_df.printSchema()

root
 |-- franchiseID: long (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- district: string (nullable = true)
 |-- zipcode: string (nullable = true)
 |-- country: string (nullable = true)
 |-- size: string (nullable = true)
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- supplierID: long (nullable = true)



In [0]:
supplies_df.printSchema()

root
 |-- supplierID: long (nullable = true)
 |-- name: string (nullable = true)
 |-- ingredient: string (nullable = true)
 |-- continent: string (nullable = true)
 |-- city: string (nullable = true)
 |-- district: string (nullable = true)
 |-- size: string (nullable = true)
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- approved: string (nullable = true)



## Basic Join Operations

In [0]:
# inner join to enrich the transactions with store info

enrichd_transactions = franchises_df.join(
    transactions_df, 
    on="franchiseID", # on=transactions_df.franchiseID == franchises_df.franchiseID, - it will result in the same condition
    how="inner")

display(enrichd_transactions.limit(10))

franchiseID,name,city,district,zipcode,country,size,longitude,latitude,supplierID,transactionID,customerID,dateTime,product,quantity,unitPrice,totalPrice,paymentMethod,cardNumber
3000047,Sweet Sinsations,Stockholm,Sodermalm,116 45,Sweden,S,18.072,59.3144,4000047,1002961,2000253,2024-05-14T12:17:01.495Z,Golden Gate Ginger,8,3,24,amex,378154478982993
3000047,Sweet Sinsations,Stockholm,Sodermalm,116 45,Sweden,S,18.072,59.3144,4000047,1003007,2000226,2024-05-10T23:10:10.239Z,Austin Almond Biscotti,36,3,108,mastercard,2244626981238094
3000047,Sweet Sinsations,Stockholm,Sodermalm,116 45,Sweden,S,18.072,59.3144,4000047,1003017,2000108,2024-05-16T16:34:10.613Z,Austin Almond Biscotti,40,3,120,mastercard,2490570234487424
3000047,Sweet Sinsations,Stockholm,Sodermalm,116 45,Sweden,S,18.072,59.3144,4000047,1003068,2000173,2024-05-02T04:31:51.612Z,Pearly Pies,28,3,84,amex,343808569426192
3000047,Sweet Sinsations,Stockholm,Sodermalm,116 45,Sweden,S,18.072,59.3144,4000047,1003103,2000075,2024-05-04T23:44:26.902Z,Pearly Pies,28,3,84,visa,4377080942201798
3000047,Sweet Sinsations,Stockholm,Sodermalm,116 45,Sweden,S,18.072,59.3144,4000047,1003147,2000295,2024-05-15T16:17:06.259Z,Austin Almond Biscotti,32,3,96,amex,371093774812677
3000047,Sweet Sinsations,Stockholm,Sodermalm,116 45,Sweden,S,18.072,59.3144,4000047,1003196,2000237,2024-05-07T11:13:22.469Z,Tokyo Tidbits,40,3,120,mastercard,5538807345848392
3000047,Sweet Sinsations,Stockholm,Sodermalm,116 45,Sweden,S,18.072,59.3144,4000047,1003329,2000272,2024-05-06T03:32:16.017Z,Outback Oatmeal,28,3,84,visa,4872480716880043
3000047,Sweet Sinsations,Stockholm,Sodermalm,116 45,Sweden,S,18.072,59.3144,4000047,1001264,2000209,2024-05-16T17:32:28.547Z,Pearly Pies,28,3,84,mastercard,5287105980593305
3000047,Sweet Sinsations,Stockholm,Sodermalm,116 45,Sweden,S,18.072,59.3144,4000047,1001287,2000120,2024-05-15T08:41:28.406Z,Austin Almond Biscotti,40,3,120,amex,376211012259783


In [0]:
# it will alias some columns to disambigue columns names

enrichd_transactions = franchises_df \
    .select("franchiseID",
            col("name").alias("store_name"),
            col("city").alias("store_city"),
            col("country").alias("store_country")
            )\
    .join(transactions_df, 
          on="franchiseID",
          how="inner"
    )

display(enrichd_transactions.limit(10))

franchiseID,store_name,store_city,store_country,transactionID,customerID,dateTime,product,quantity,unitPrice,totalPrice,paymentMethod,cardNumber
3000047,Sweet Sinsations,Stockholm,Sweden,1002961,2000253,2024-05-14T12:17:01.495Z,Golden Gate Ginger,8,3,24,amex,378154478982993
3000047,Sweet Sinsations,Stockholm,Sweden,1003007,2000226,2024-05-10T23:10:10.239Z,Austin Almond Biscotti,36,3,108,mastercard,2244626981238094
3000047,Sweet Sinsations,Stockholm,Sweden,1003017,2000108,2024-05-16T16:34:10.613Z,Austin Almond Biscotti,40,3,120,mastercard,2490570234487424
3000047,Sweet Sinsations,Stockholm,Sweden,1003068,2000173,2024-05-02T04:31:51.612Z,Pearly Pies,28,3,84,amex,343808569426192
3000047,Sweet Sinsations,Stockholm,Sweden,1003103,2000075,2024-05-04T23:44:26.902Z,Pearly Pies,28,3,84,visa,4377080942201798
3000047,Sweet Sinsations,Stockholm,Sweden,1003147,2000295,2024-05-15T16:17:06.259Z,Austin Almond Biscotti,32,3,96,amex,371093774812677
3000047,Sweet Sinsations,Stockholm,Sweden,1003196,2000237,2024-05-07T11:13:22.469Z,Tokyo Tidbits,40,3,120,mastercard,5538807345848392
3000047,Sweet Sinsations,Stockholm,Sweden,1003329,2000272,2024-05-06T03:32:16.017Z,Outback Oatmeal,28,3,84,visa,4872480716880043
3000047,Sweet Sinsations,Stockholm,Sweden,1001264,2000209,2024-05-16T17:32:28.547Z,Pearly Pies,28,3,84,mastercard,5287105980593305
3000047,Sweet Sinsations,Stockholm,Sweden,1001287,2000120,2024-05-15T08:41:28.406Z,Austin Almond Biscotti,40,3,120,amex,376211012259783


In [0]:
# Analyze the relationshio between franchises and suppliers using a full outer join

full_join = franchises_df\
    .withColumnRenamed("name", "franchise_name")\
    .join(
        supplies_df.select("supplierID", col("name").alias("supplier_name")),
        on="supplierID",
        how="full_outer"
    )

# records that would NOT apper in an inner join
non_matching_records = full_join.filter(
    col("franchise_name").isNull() | 
    col("supplier_name").isNull()

)\
    .select("franchiseID", "franchise_name", col("supplierID").alias("orphaned_supplier_id"))

display(non_matching_records)


franchiseID,franchise_name,orphaned_supplier_id
3000047,Sweet Sinsations,4000047
3000040,Sweetie Pies,4000040
3000046,Baked Bliss,4000046
3000030,Caramel Cravings,4000030
3000038,Dough Delights,4000038
3000044,Sugar Rush,4000044
3000037,Kagoshima Confections,4000037
3000041,Okayama Ovens,4000041
3000043,Matsuyama Morsels,4000043
3000032,Sweet Temptations,4000032


## Using Spark SQL 

In [0]:
# Creates temp views
franchises_df.createOrReplaceTempView("franchises")
supplies_df.createOrReplaceTempView("supplies")

In [0]:
%sql
-- same outer join using SQL

SELECT
  f.franchiseID,
  f.name as franchise_name,
  s.supplierID as orphaned_supplier_id
FROM franchises as f

FULL OUTER JOIN supplies as s

ON f.supplierID = s.supplierID
WHERE f.franchiseID IS NULL OR s.supplierID IS NULL

franchiseID,franchise_name,orphaned_supplier_id
3000047,Sweet Sinsations,
3000040,Sweetie Pies,
3000046,Baked Bliss,
3000030,Caramel Cravings,
3000038,Dough Delights,
3000044,Sugar Rush,
3000037,Kagoshima Confections,
3000041,Okayama Ovens,
3000043,Matsuyama Morsels,
3000032,Sweet Temptations,


In [0]:
# Identify supplier IDs in each DataFrame

franchise_suppliers = franchises_df.select("supplierID").distinct()
all_suppliers = supplies_df.select("supplierID").distinct()

# Find the supplierIDs that are in the franchise_df but not in the supplies_df
franchise_without_valid_suppliers = franchise_suppliers.subtract(all_suppliers)
display(franchise_without_valid_suppliers)

supplierID
4000047
4000040
4000046
4000030
4000038
4000044
4000037
4000041
4000043
4000032


In [0]:
# Find the overlap - suppliers that exist in both DataFrames
valid_suppliers = franchise_suppliers.intersect(all_suppliers)
display(valid_suppliers)

supplierID
4000025
4000012
4000011
4000018
4000023
4000022
4000002
4000014
4000005
4000001


In [0]:
# Source: https://www.applegreenstores.com/fuel-prices/data.json
raw_gas_price_Applegreen = spark.read.json("/Volumes/csv_storage/default/csvs/2026_01_24.json")

display(raw_gas_price_Applegreen)

last_updated,stations
26/02/2025 11:45:37,"List(List(High Street, Uckfield, AppleGreen, List(50.9669, 0.096173), TN22 5DL, List(145.8, 138.8, 153.8, null), u101fnnp8yzb), List(23 Woodbridge Road East, Ipswich, AppleGreen, List(52.061264, 1.197083), IP4 5QN, List(141.8, 134.8, 149.8, null), u12b5jrwy51b), List(A10 Bypass, Ely, AppleGreen, List(52.462489, 0.28666), CB6 1SE, List(148.8, 140.8, 154.8, null), u124qw88psv0), List(Salop Road, Welshpool, AppleGreen, List(52.662139, -3.139685), SY21 7ET, List(145.8, null, null, null), gcmg8dcg0n1p), List(Crossways, Church Stretton, AppleGreen, List(52.537839, -2.802029), SY6 6PQ, List(148.8, 143.8, 158.8, null), gcq4bhpx5gp9), List(Ross Road, Hereford, AppleGreen, List(52.04677, -2.720323), HR2 7RJ, List(143.8, 134.8, 149.8, null), gcq044g47vhh), List(160 Cromer Road, Norwich, AppleGreen, List(52.665784, 1.2727), NR6 6XA, List(143.8, null, null, null), u12gsgwb2j9y), List(3-5 Fakenham Road, Drayton, Norwich, AppleGreen, List(52.675966, 1.223087), NR8 6PL, List(143.8, null, null, null), u12gev6eezzb), List(Bicester Road, Aylesbury, AppleGreen, List(51.823365, -0.825005), HP19 3BB, List(144.8, 135.8, 150.8, null), gcpqv4phzzrt), List(Tremains Road, Bridgend, AppleGreen, List(51.504362, -3.570213), CF31 1TZ, List(140.8, 135.8, null, null), gcjmnb068hbn), List(1091 Greenford Road, Greenford, AppleGreen, List(51.551779, -0.339203), UB6 0EJ, List(146.8, 138.8, 153.8, null), gcpv228yzupu), List(Swaythling Rd, Southampton, AppleGreen, List(50.933875, -1.348057), SO30 3AG, List(144.8, 137.8, null, null), gcp1c24r7frz), List(275 Streatham Common High Rd, Streatham, AppleGreen, List(51.419443, -0.128059), SW16 3BS, List(147.8, null, null, null), gcput0dgbnbm), List(157-159 Castle Street, Luton, AppleGreen, List(51.873204, -0.417639), LU1 3AA, List(145.8, null, null, null), gcpxn7pzk61f), List(243 North Circular Road, Palmers Green, AppleGreen, List(51.615032, -0.101109), N13 5JF, List(145.8, null, null, null), gcpvtsnr7grd), List(Derby Road, Ilkeston, AppleGreen, List(52.967187, -1.314899), DE7 5FH, List(142.8, 137.8, 155.8, null), gcrj646s0xk1), List(North Street, Ashford, AppleGreen, List(51.14651, 0.875301), TN24 8LQ, List(147.8, 139.8, 154.8, null), u10dgyvq5bx8), List(Station Road, Wadhurst, AppleGreen, List(51.0718, 0.315937), TN5 6RP, List(148.8, 140.8, 155.8, null), u104x1q4cce9), List(22 West Park Street, Chatteris, AppleGreen, List(52.4525, 0.04721), PE16 6AJ, List(143.8, 137.8, 152.8, null), u1243hdmundq), List(Banbury Road, Bicester, AppleGreen, List(51.9036, -1.15394), OX26 2HJ, List(140.8, null, null, null), gcppm8z8rhrf), List(Holloway Bank, Wednesbury, AppleGreen, List(52.545683, -2.017702), WS10 0NP, List(145.8, null, null, null), gcqdfjdub1k2), List(Bythesea Road, Trowbridge, AppleGreen, List(51.3189, -2.21108), BA14 8HR, List(143.8, 136.8, 149.8, null), gcn7vwmfq076), List(Small Heath Highway, Birmingham, AppleGreen, List(52.4662, -1.86386), B10 0AE, List(146.8, 139.8, null, null), gcqdmx4vbbt3), List(Ashley Green Road, Chesham, AppleGreen, List(51.7249, -0.598553), HP5 3PG, List(null, 142.8, null, null), gcpw62hpksnx), List(Whittlesey Road, Peterborough, AppleGreen, List(52.5503, -0.211631), PE2 8RR, List(null, 136.8, 152.8, null), gcrfgnmzstum), List(74 Main Road, Broomfield, AppleGreen, List(51.7582, 0.47456), CM1 7DH, List(146.8, null, null, null), u10q6y3965c9), List(A44 Bargates, Leominster, AppleGreen, List(52.227117, -2.742558), HR6 8EY, List(148.8, 141.8, 156.8, null), gcq11edvjb28), List(A5 Watling Street, Hinckley, AppleGreen, List(52.533601, -1.40986), LE10 3ED, List(144.8, 138.8, null, null), gcqfzgmq7vh0), List(Neath Road, Swansea, AppleGreen, List(51.663034, -3.920273), SA6 8EF, List(null, 136.8, 151.8, null), gcjjyucqgp8d), List(Port Talbot Road, Swansea, AppleGreen, List(51.591409, -3.779302), SA13 1HN, List(143.8, 136.8, null, null), gcjm3zzvxzsn), List(2 Bounds Green Road, Bowes Park, AppleGreen, List(51.605789, -0.122233), N11 2QH, List(146.8, 139.8, 154.8, null), gcpvt4wcxb26), List(96-106 Camberwell Road, Camberwell, AppleGreen, List(51.480946, -0.094468), SE5 0EG, List(142.8, 136.8, 151.8, null), gcpuvgg4nxzs), List(Dame Dorothy Street, Sunderland, AppleGreen, List(54.916445, -1.37036), SR6 0EZ, List(null, 136.8, 151.8, null), gcz02v4pkr3d), List(Shawbirch Crossroads, Telford, AppleGreen, List(52.717713, -2.524885), TF1 3QA, List(142.8, null, null, null), gcq5yscqvvvr), List(Lichfield Road, Walsall, AppleGreen, List(52.612486, -1.956587), WS4 1PQ, List(143.8, 137.8, null, null), gcqe73z3bnnz), List(Wrexham Road, Mold, AppleGreen, List(53.137161, -3.083567), CH7 4HL, List(144.8, 138.8, null, null), gcmy3c6d7svb), List(124 Berrow Road, Burnham on Sea, AppleGreen, List(51.251495, -2.998185), TA8 2PG, List(146.8, 139.8, null, null), gcjgef0c95ph), List(Barnhorn Road, Bexhill, AppleGreen, List(50.845435, 0.409505), TN39 4QR, List(144.8, null, null, null), u103324hnhj1), List(122 Holyhead Road, Birmingham, AppleGreen, List(52.509237, -1.955522), B21 0LL, List(144.8, 137.8, 151.8, null), gcqdex003p1p), List(121-129 Great Howard Street, Liverpool, AppleGreen, List(53.417318, -2.996594), L3 7AT, List(144.8, null, null, 159.8), gcmzgu3fqmc9), List(Preston Road, St Annes, Lytham, AppleGreen, List(53.743956, -2.93978), FY8 5AT, List(142.8, 138.8, null, null), gctctpg34pt1), List(Mains Lane, Little Singleton, Poulton Le Fylde, AppleGreen, List(53.857028, -2.977856), FY6 7LJ, List(144.8, null, null, null), gctfkhrefvtt), List(Hagley Road West, Birmingham, AppleGreen, List(52.462879, -1.998259), B32 2AL, List(null, 136.8, 151.8, null), gcqd6w8fmjqp), List(551 Kingstanding Road, Birmingham, AppleGreen, List(52.549405, -1.88377), B44 9TD, List(143.8, 136.8, null, null), gcqdvnk62zp6), List(Roundabout Service Station King Street, Clwyd, AppleGreen, List(53.170418, -3.138778), CH7 1LB, List(143.8, 137.8, null, null), gcmy2x6kjhm3), List(Seaways Service Station, The Roe, Clwyd, AppleGreen, List(53.260123, -3.45236), LL17 OLY, List(146.8, 141.8, null, null), gcmwcrwndntb), List(Summerhill Service Station, Basildon, AppleGreen, List(51.592036, 0.479075), SS14 3AS, List(142.8, 135.8, 150.8, null), u10mdbhc7k84), List(232 Whitehall road, Tipton, AppleGreen, List(52.528289, -2.031486), DY4 7EX, List(142.8, 135.8, null, null), gcqdcf2xqyy1), List(Colchester Road, Colchester, AppleGreen, List(51.868374, 1.074838), CO7 8RX, List(147.8, 139.8, 154.8, null), u10z06qejt9q), List(Bedworth Service Station, Nuneaton, AppleGreen, List(52.508706, -1.475005), CV10 7DA, List(147.8, null, null, 162.8), gcqfwqvvmuhx), List(Back Corkickle, Cumbria, AppleGreen, List(54.543568, -3.581624), CA28 7TS, List(146.8, 137.8, 153.8, null), gctqq3rgq458), List(Hull Road, North Humberside, AppleGreen, List(53.730004, -0.064847), HU12 8DJ, List(146.8, null, null, 161.8), gcxcwt0zbyww), List(Edlington Lane, Doncaster, AppleGreen, List(53.48486, -1.188221), DN12 1BS, List(144.8, 135.8, null, 159.8), gcx0kbwgbkw1), List(Small Heath Highway, Birmingham, AppleGreen, List(52.465736, -1.863702), B10 0AE, List(146.8, 139.8, null, null), gcqdmx4ghd51), List(Doncaster St George Service Station, Doncaster, AppleGreen, List(53.527394, -1.131019), DN1 2RE, List(145.8, 136.8, null, 160.8), gcx0w227tm7n), List(High Road, Epping, AppleGreen, List(51.729901, 0.123629), CM16 6LX, List(140.8, 136.8, null, null), u10n6c4h8b2r), List(Bromfield Road, Ludlow, AppleGreen, List(52.377698, -2.726326), SY8 1DN, List(146.8, 139.8, 154.8, null), gcq1czndpyxf), List(Waterloo Road, Stoke on Trent, AppleGreen, List(53.042816, -2.193355), ST6 2EH, List(142.8, null, null, null), gcqmy04ug1ef), List(Cockton Hill Road, Co Durham, AppleGreen, List(54.651083, -1.678374), DL14 6JN, List(142.8, 135.8, null, null), gcwycucveh38), List(Holyhead Road Service Station, Coventry, AppleGreen, List(52.418392, -1.545995), CV5 8ND, List(146.8, 140.8, null, null), gcqfhy6pr8qx), List(Frodsham Sevice Station, Frodsham, AppleGreen, List(53.299106, -2.713201), Wa6 7HN, List(143.8, 136.8, 152.8, null), gcqp4qb3cc6x), List(Stratford Road, Birmingham, AppleGreen, List(52.4421, -1.849825), B28 8BN, List(143.8, null, null, null), gcqdmfucrgyx), List(131 Humberstone Road, Leicester, AppleGreen, List(52.638934, -1.120383), LE5 3AD, List(145.8, 140.8, 155.8, null), gcr5qw8hj0tb), List(1 Wellington Road, Burton, AppleGreen, List(52.803078, -1.650772), DE14 2AP, List(147.9, 140.9, 156.9, 165.9), gcqu6ktbnppj), List(Applegreen Patchway Service Station, Bristol, AppleGreen, List(51.536079, -2.567768), BS34 6NA, List(144.8, 137.8, 152.8, null), gcnjjtf5qtmm))"


In [0]:
raw_gas_price_Applegreen.printSchema()

root
 |-- last_updated: string (nullable = true)
 |-- stations: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- address: string (nullable = true)
 |    |    |-- brand: string (nullable = true)
 |    |    |-- location: struct (nullable = true)
 |    |    |    |-- latitude: double (nullable = true)
 |    |    |    |-- longitude: double (nullable = true)
 |    |    |-- postcode: string (nullable = true)
 |    |    |-- prices: struct (nullable = true)
 |    |    |    |-- B7: double (nullable = true)
 |    |    |    |-- E10: double (nullable = true)
 |    |    |    |-- E5: double (nullable = true)
 |    |    |    |-- SDV: double (nullable = true)
 |    |    |-- site_id: string (nullable = true)



In [0]:
from pyspark.sql.functions import explode

gas_stations = raw_gas_price_Applegreen.select(explode("stations").alias("stations"))
display(gas_stations.count())

65

In [0]:
gas_stations.printSchema()

root
 |-- stations: struct (nullable = true)
 |    |-- address: string (nullable = true)
 |    |-- brand: string (nullable = true)
 |    |-- location: struct (nullable = true)
 |    |    |-- latitude: double (nullable = true)
 |    |    |-- longitude: double (nullable = true)
 |    |-- postcode: string (nullable = true)
 |    |-- prices: struct (nullable = true)
 |    |    |-- B7: double (nullable = true)
 |    |    |-- E10: double (nullable = true)
 |    |    |-- E5: double (nullable = true)
 |    |    |-- SDV: double (nullable = true)
 |    |-- site_id: string (nullable = true)



In [0]:
display(gas_stations.limit(10))

stations
"List(High Street, Uckfield, AppleGreen, List(50.9669, 0.096173), TN22 5DL, List(145.8, 138.8, 153.8, null), u101fnnp8yzb)"
"List(23 Woodbridge Road East, Ipswich, AppleGreen, List(52.061264, 1.197083), IP4 5QN, List(141.8, 134.8, 149.8, null), u12b5jrwy51b)"
"List(A10 Bypass, Ely, AppleGreen, List(52.462489, 0.28666), CB6 1SE, List(148.8, 140.8, 154.8, null), u124qw88psv0)"
"List(Salop Road, Welshpool, AppleGreen, List(52.662139, -3.139685), SY21 7ET, List(145.8, null, null, null), gcmg8dcg0n1p)"
"List(Crossways, Church Stretton, AppleGreen, List(52.537839, -2.802029), SY6 6PQ, List(148.8, 143.8, 158.8, null), gcq4bhpx5gp9)"
"List(Ross Road, Hereford, AppleGreen, List(52.04677, -2.720323), HR2 7RJ, List(143.8, 134.8, 149.8, null), gcq044g47vhh)"
"List(160 Cromer Road, Norwich, AppleGreen, List(52.665784, 1.2727), NR6 6XA, List(143.8, null, null, null), u12gsgwb2j9y)"
"List(3-5 Fakenham Road, Drayton, Norwich, AppleGreen, List(52.675966, 1.223087), NR8 6PL, List(143.8, null, null, null), u12gev6eezzb)"
"List(Bicester Road, Aylesbury, AppleGreen, List(51.823365, -0.825005), HP19 3BB, List(144.8, 135.8, 150.8, null), gcpqv4phzzrt)"
"List(Tremains Road, Bridgend, AppleGreen, List(51.504362, -3.570213), CF31 1TZ, List(140.8, 135.8, null, null), gcjmnb068hbn)"


In [0]:
json_list_stations = raw_gas_price_Applegreen.select(
    "last_updated",
    explode("stations").alias("stations")
)
#display(json_list_stations.count())
display(json_list_stations.limit(10))

last_updated,stations
26/02/2025 11:45:37,"List(High Street, Uckfield, AppleGreen, List(50.9669, 0.096173), TN22 5DL, List(145.8, 138.8, 153.8, null), u101fnnp8yzb)"
26/02/2025 11:45:37,"List(23 Woodbridge Road East, Ipswich, AppleGreen, List(52.061264, 1.197083), IP4 5QN, List(141.8, 134.8, 149.8, null), u12b5jrwy51b)"
26/02/2025 11:45:37,"List(A10 Bypass, Ely, AppleGreen, List(52.462489, 0.28666), CB6 1SE, List(148.8, 140.8, 154.8, null), u124qw88psv0)"
26/02/2025 11:45:37,"List(Salop Road, Welshpool, AppleGreen, List(52.662139, -3.139685), SY21 7ET, List(145.8, null, null, null), gcmg8dcg0n1p)"
26/02/2025 11:45:37,"List(Crossways, Church Stretton, AppleGreen, List(52.537839, -2.802029), SY6 6PQ, List(148.8, 143.8, 158.8, null), gcq4bhpx5gp9)"
26/02/2025 11:45:37,"List(Ross Road, Hereford, AppleGreen, List(52.04677, -2.720323), HR2 7RJ, List(143.8, 134.8, 149.8, null), gcq044g47vhh)"
26/02/2025 11:45:37,"List(160 Cromer Road, Norwich, AppleGreen, List(52.665784, 1.2727), NR6 6XA, List(143.8, null, null, null), u12gsgwb2j9y)"
26/02/2025 11:45:37,"List(3-5 Fakenham Road, Drayton, Norwich, AppleGreen, List(52.675966, 1.223087), NR8 6PL, List(143.8, null, null, null), u12gev6eezzb)"
26/02/2025 11:45:37,"List(Bicester Road, Aylesbury, AppleGreen, List(51.823365, -0.825005), HP19 3BB, List(144.8, 135.8, 150.8, null), gcpqv4phzzrt)"
26/02/2025 11:45:37,"List(Tremains Road, Bridgend, AppleGreen, List(51.504362, -3.570213), CF31 1TZ, List(140.8, 135.8, null, null), gcjmnb068hbn)"


In [0]:
json_list_stations.printSchema()

root
 |-- last_updated: string (nullable = true)
 |-- stations: struct (nullable = true)
 |    |-- address: string (nullable = true)
 |    |-- brand: string (nullable = true)
 |    |-- location: struct (nullable = true)
 |    |    |-- latitude: double (nullable = true)
 |    |    |-- longitude: double (nullable = true)
 |    |-- postcode: string (nullable = true)
 |    |-- prices: struct (nullable = true)
 |    |    |-- B7: double (nullable = true)
 |    |    |-- E10: double (nullable = true)
 |    |    |-- E5: double (nullable = true)
 |    |    |-- SDV: double (nullable = true)
 |    |-- site_id: string (nullable = true)



In [0]:
flat_stations = json_list_stations.select(
    "last_updated",
    "stations.address",
    "stations.brand",
    "stations.location.latitude",
    "stations.location.longitude",
    "stations.postcode",
    "stations.prices.B7",
    "stations.prices.E10",
    "stations.prices.E5",
    "stations.prices.SDV",
    "stations.site_id"
)
display(flat_stations.limit(10))

last_updated,address,brand,latitude,longitude,postcode,B7,E10,E5,SDV,site_id
26/02/2025 11:45:37,"High Street, Uckfield",AppleGreen,50.9669,0.096173,TN22 5DL,145.8,138.8,153.8,,u101fnnp8yzb
26/02/2025 11:45:37,"23 Woodbridge Road East, Ipswich",AppleGreen,52.061264,1.197083,IP4 5QN,141.8,134.8,149.8,,u12b5jrwy51b
26/02/2025 11:45:37,"A10 Bypass, Ely",AppleGreen,52.462489,0.28666,CB6 1SE,148.8,140.8,154.8,,u124qw88psv0
26/02/2025 11:45:37,"Salop Road, Welshpool",AppleGreen,52.662139,-3.139685,SY21 7ET,145.8,,,,gcmg8dcg0n1p
26/02/2025 11:45:37,"Crossways, Church Stretton",AppleGreen,52.537839,-2.802029,SY6 6PQ,148.8,143.8,158.8,,gcq4bhpx5gp9
26/02/2025 11:45:37,"Ross Road, Hereford",AppleGreen,52.04677,-2.720323,HR2 7RJ,143.8,134.8,149.8,,gcq044g47vhh
26/02/2025 11:45:37,"160 Cromer Road, Norwich",AppleGreen,52.665784,1.2727,NR6 6XA,143.8,,,,u12gsgwb2j9y
26/02/2025 11:45:37,"3-5 Fakenham Road, Drayton, Norwich",AppleGreen,52.675966,1.223087,NR8 6PL,143.8,,,,u12gev6eezzb
26/02/2025 11:45:37,"Bicester Road, Aylesbury",AppleGreen,51.823365,-0.825005,HP19 3BB,144.8,135.8,150.8,,gcpqv4phzzrt
26/02/2025 11:45:37,"Tremains Road, Bridgend",AppleGreen,51.504362,-3.570213,CF31 1TZ,140.8,135.8,,,gcjmnb068hbn


In [0]:
flat_stations.printSchema()

root
 |-- last_updated: string (nullable = true)
 |-- address: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- postcode: string (nullable = true)
 |-- B7: double (nullable = true)
 |-- E10: double (nullable = true)
 |-- E5: double (nullable = true)
 |-- SDV: double (nullable = true)
 |-- site_id: string (nullable = true)



In [0]:
#using get field
flat_stations_get_field = json_list_stations.select(
    "last_updated",
    col("stations").getField("address").alias("address"),
    col("stations").getField("brand").alias("brand"),
    col("stations").getField("location").getField("latitude").alias("latitude"),
    col("stations").getField("location").getField("longitude").alias("longitude"),
    col("stations").getField("postcode").alias("postcode"),
    col("stations").getField("prices").getField("B7").alias("B7"),
    col("stations").getField("prices").getField("E10").alias("E10"),
    col("stations").getField("prices").getField("E5").alias("E5"),
    col("stations").getField("prices").getField("SDV").alias("SDV"),
    col("stations").getField("site_id").alias("site_id")
)
display(flat_stations_get_field.limit(10))


last_updated,address,brand,latitude,longitude,postcode,B7,E10,E5,SDV,site_id
26/02/2025 11:45:37,"High Street, Uckfield",AppleGreen,50.9669,0.096173,TN22 5DL,145.8,138.8,153.8,,u101fnnp8yzb
26/02/2025 11:45:37,"23 Woodbridge Road East, Ipswich",AppleGreen,52.061264,1.197083,IP4 5QN,141.8,134.8,149.8,,u12b5jrwy51b
26/02/2025 11:45:37,"A10 Bypass, Ely",AppleGreen,52.462489,0.28666,CB6 1SE,148.8,140.8,154.8,,u124qw88psv0
26/02/2025 11:45:37,"Salop Road, Welshpool",AppleGreen,52.662139,-3.139685,SY21 7ET,145.8,,,,gcmg8dcg0n1p
26/02/2025 11:45:37,"Crossways, Church Stretton",AppleGreen,52.537839,-2.802029,SY6 6PQ,148.8,143.8,158.8,,gcq4bhpx5gp9
26/02/2025 11:45:37,"Ross Road, Hereford",AppleGreen,52.04677,-2.720323,HR2 7RJ,143.8,134.8,149.8,,gcq044g47vhh
26/02/2025 11:45:37,"160 Cromer Road, Norwich",AppleGreen,52.665784,1.2727,NR6 6XA,143.8,,,,u12gsgwb2j9y
26/02/2025 11:45:37,"3-5 Fakenham Road, Drayton, Norwich",AppleGreen,52.675966,1.223087,NR8 6PL,143.8,,,,u12gev6eezzb
26/02/2025 11:45:37,"Bicester Road, Aylesbury",AppleGreen,51.823365,-0.825005,HP19 3BB,144.8,135.8,150.8,,gcpqv4phzzrt
26/02/2025 11:45:37,"Tremains Road, Bridgend",AppleGreen,51.504362,-3.570213,CF31 1TZ,140.8,135.8,,,gcjmnb068hbn


In [0]:
flat_stations_get_field.printSchema()

root
 |-- last_updated: string (nullable = true)
 |-- address: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- postcode: string (nullable = true)
 |-- B7: double (nullable = true)
 |-- E10: double (nullable = true)
 |-- E5: double (nullable = true)
 |-- SDV: double (nullable = true)
 |-- site_id: string (nullable = true)

