##### Access Azure Data Lake Storage (ADLS) and read the data using
1. A Databricks secret scope
2. Azure Key Vault

In [0]:
# Print the built-in help for the dbutils.secrets utility (exploratory).
dbutils.secrets.help()

In [0]:
# List the secret scopes that are visible from this notebook.
dbutils.secrets.listScopes()

Out[2]: [SecretScope(name='kijiji-scope')]

In [0]:
# List the secret keys held in the 'kijiji-scope' scope (metadata only, not the values).
dbutils.secrets.list(scope='kijiji-scope')

Out[3]: [SecretMetadata(key='account-key')]

In [0]:
# Read the storage account key from the secret scope so it is never hardcoded in the notebook.
account_key = dbutils.secrets.get(scope='kijiji-scope',key='account-key')

In [0]:
# Register the account key with the Spark session so that abfs:// paths on the
# kijijidata storage account can be read and written.
account_key_conf = "fs.azure.account.key.kijijidata.dfs.core.windows.net"
spark.conf.set(account_key_conf, account_key)

##### check the access to ADLS

In [0]:
# List the root of the raw container to confirm the configured key grants access.
display(dbutils.fs.ls("abfs://raw@kijijidata.dfs.core.windows.net"))

path,name,size,modificationTime
abfs://raw@kijijidata.dfs.core.windows.net/apt/,apt/,0,1680633481000
abfs://raw@kijijidata.dfs.core.windows.net/house/,house/,0,1680633487000


In [0]:
# List the apartment folder; the scraper output is organised in one sub-folder per date.
display(dbutils.fs.ls("abfs://raw@kijijidata.dfs.core.windows.net/apt/"))

path,name,size,modificationTime
abfs://raw@kijijidata.dfs.core.windows.net/apt/2023-04-04/,2023-04-04/,0,1680638934000
abfs://raw@kijijidata.dfs.core.windows.net/apt/2023-04-05/,2023-04-05/,0,1680688934000


In [0]:
# List the CSV files scraped on a single day.
display(dbutils.fs.ls("abfs://raw@kijijidata.dfs.core.windows.net/apt/2023-04-04/"))

path,name,size,modificationTime
abfs://raw@kijijidata.dfs.core.windows.net/apt/2023-04-04/kijiji_apt_21_2023-04-04.csv,kijiji_apt_21_2023-04-04.csv,41974,1680638935000
abfs://raw@kijijidata.dfs.core.windows.net/apt/2023-04-04/kijiji_apt_41_2023-04-04.csv,kijiji_apt_41_2023-04-04.csv,40532,1680638933000
abfs://raw@kijijidata.dfs.core.windows.net/apt/2023-04-04/kijiji_apt_61_2023-04-04.csv,kijiji_apt_61_2023-04-04.csv,42355,1680638933000
abfs://raw@kijijidata.dfs.core.windows.net/apt/2023-04-04/kijiji_apt_81_2023-04-04.csv,kijiji_apt_81_2023-04-04.csv,41033,1680638930000


In [0]:
# Preview one file with default reader options: the header row is read as data
# and the columns receive the default _c0.._c8 names.
display(spark.read.csv("abfs://raw@kijijidata.dfs.core.windows.net/apt/2023-04-04/kijiji_apt_21_2023-04-04.csv"))

_c0,_c1,_c2,_c3,_c4,_c5,_c6,_c7,_c8
,id,title,location,rent,bed_rooms,date_posted,url,scraped_on
0.0,1569609232,RENT TO OWN A HOUSE OR BUY WITH OUR ZERO DOWN PROGRAM,City of Toronto,Please Contact,3,02/04/2023,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/rent-to-own-a-house-or-buy-with-our-zero-down-program/1569609232,2023-04-04
1.0,1636404248,1 Bedroom Large Apt. for Rent Steps to Victoria Park Station!,Toronto,"$2,450.00",1,09/03/2023,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/1-bedroom-large-apt-for-rent-steps-to-victoria-park-station/1636404248,2023-04-04
2.0,1655269031,"5 BDRM HOUSE, availaible immediately, Christie and St. Clair",Toronto,"$4,500.00",5+,02/04/2023,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/5-bdrm-house-availaible-immediately-christie-and-st-clair/1655269031,2023-04-04
3.0,1651994882,"PROMO! 1 Bedroom Suite SAVE $3,400 - $6,900.",City of Toronto,"$2,389.00",1,30/03/2023,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/promo-1-bedroom-suite-save-3-400-6-900/1651994882,2023-04-04
4.0,1640614569,2 Bedroom Apartments in the Heart of Yonge and Lawrene,Toronto,"$2,699.00",2,< 21 hours ago,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/2-bedroom-apartments-in-the-heart-of-yonge-and-lawrene/1640614569,2023-04-04
5.0,1655420902,Fabulous 3B House w Double Garage near Wilson Subway Station,Toronto,"$2,900.00",3,< 17 hours ago,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/fabulous-3b-house-w-double-garage-near-wilson-subway-station/1655420902,2023-04-04
6.0,1505831086,NEW PRICE!! 1bd fully RENOVATED suite $1625,City of Toronto,"$1,695.00",1,< 17 hours ago,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/new-price-1bd-fully-renovated-suite-1625/1505831086,2023-04-04
7.0,1653559891,One large bedroom available for rent with attached bath.,Toronto,"$1,600.00",1,< 17 hours ago,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/one-large-bedroom-available-for-rent-with-attached-bath/1653559891,2023-04-04
8.0,1655420602,End Unit Executive Townhouse in Hamilton/Ancaster,Brampton,"$2,800.00",3,< 17 hours ago,https://www.kijiji.ca/v-apartments-condos/mississauga-peel-region/end-unit-executive-townhouse-in-hamilton-ancaster/1655420602,2023-04-04


###### Perform necessary transformations and save the result as parquet to ADLS
1. Read the apartment listings and house rental listings from ADLS
2. Specify the schema
3. Drop the `null` column
4. Convert the title column to lowercase
5. Clean location - map variants of the same place to one value, e.g. City of Toronto, Toronto --> toronto
6. Convert the rent column to integer, replacing the text "Please Contact" with 0
7. Clean the bed_rooms column and cast it to int
8. Recalculate the date_posted column, e.g. "< 21 hours ago" should be converted to a proper date
9. Create two additional columns: year and month

In [0]:
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, DateType
from pyspark.sql.functions import lower, udf, col, when, regexp_replace, trim, to_date, date_add
from dateutil.relativedelta import relativedelta

In [0]:
# Explicit schema for the scraped listing CSV files. The free-text columns
# (rent, bed_rooms, date_posted) are read as strings so that they can be
# cleaned before being cast to their final types.
rental_schema = StructType(fields=[
    # Unnamed leading column of the CSV (presumably a DataFrame index written
    # by the scraper). The files store it as "0.0", "1.0", ..., which cannot be
    # parsed as an integer, so declaring it IntegerType makes the CSV reader
    # treat every record as malformed. It is declared as a string instead; the
    # column is dropped immediately after the read, so its type has no
    # downstream effect.
    StructField("null", StringType(), True),
    # NOTE(review): listing ids are currently around 1.65e9, close to the
    # 32-bit IntegerType limit (2,147,483,647) - consider LongType before ids
    # exceed it, otherwise they would be read as null.
    StructField("id", IntegerType(), True),
    StructField("title", StringType(), True),
    StructField("location", StringType(), True),
    StructField("rent", StringType(), True),
    StructField("bed_rooms", StringType(), True),
    StructField("date_posted", StringType(), True),
    StructField("url", StringType(), True),
    StructField("scraped_on", DateType(), True),
])

In [0]:
# Read every apartment CSV below raw/apt/ using the explicit schema, then
# discard the unnamed leading column, which is not used anywhere downstream.
apt_source_path = "abfs://raw@kijijidata.dfs.core.windows.net/apt/**/*.csv"
apt_df = (
    spark.read
    .option("header", True)
    .schema(rental_schema)
    .csv(apt_source_path)
    .drop("null")
)

In [0]:
# Row count before rows with a null id are removed.
apt_df.count()

Out[13]: 2661

In [0]:
# Discard rows whose id is null.
apt_df = apt_df.dropna(subset=["id"])

In [0]:
# Row count after the null-id rows have been removed.
apt_df.count()

Out[15]: 2655

In [0]:
# Inspect the typed, still uncleaned data.
display(apt_df)

id,title,location,rent,bed_rooms,date_posted,url,scraped_on
1653664103,Story of Brampton Central - 2 Bed + 2 Bath in the Heart of Bramp,Brampton,"$2,640.00",2,< 9 hours ago,https://www.kijiji.ca/v-apartments-condos/mississauga-peel-region/story-of-brampton-central-2-bed-2-bath-in-the-heart-of-bramp/1653664103,2023-04-05
1648230148,Brand New 3-Bedroom in North York! York Mills Rd. & DVP!,City of Toronto,"$3,023.00",3,24/03/2023,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/brand-new-3-bedroom-in-north-york-york-mills-rd-dvp/1648230148,2023-04-05
1649321807,Studio 1BR & 2BR Brand New Condo unit at LakeShore & Queens Quay,City of Toronto,"$2,050.00",2,< 10 hours ago,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/studio-1br-2br-brand-new-condo-unit-at-lakeshore-queens-quay/1649321807,2023-04-05
1635479573,3 BEDROOM + 1.5 BATHROOM TOWNHOUSE FOR RENT!,Oshawa / Durham Region,"$2,695.00",3,01/04/2023,https://www.kijiji.ca/v-apartments-condos/oshawa-durham-region/3-bedroom-1-5-bathroom-townhouse-for-rent/1635479573,2023-04-05
1618391411,2 Bedroom Apartment for Rent Steps to Victoria Park Station!,City of Toronto,"$2,945.00",2,19/03/2023,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/2-bedroom-apartment-for-rent-steps-to-victoria-park-station/1618391411,2023-04-05
1655532535,2 beds basement apt for rent from May 1,Churchville,"$2,400.00",2,< 2 minutes ago,https://www.kijiji.ca/v-apartments-condos/mississauga-peel-region/2-beds-basement-apt-for-rent-from-may-1/1655532535,2023-04-05
1654804799,Ground Floor - Bachelor Unit - Mimico,Mississauga / Peel Region,"$1,545.00",Bachelor/Studio,< 8 minutes ago,https://www.kijiji.ca/v-apartments-condos/mississauga-peel-region/ground-floor-bachelor-unit-mimico/1654804799,2023-04-05
1653373622,Executive Heritage Suite - Rosedale/Moore Park,City of Toronto,"$4,100.00",3,< 8 minutes ago,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/executive-heritage-suite-rosedale-moore-park/1653373622,2023-04-05
1648282950,Luxury Living calling your name.,City of Toronto,"$2,824.00",2,< 8 minutes ago,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/luxury-living-calling-your-name/1648282950,2023-04-05
1648283207,Luxury Living calling your name.,City of Toronto,"$2,399.00",1,< 8 minutes ago,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/luxury-living-calling-your-name/1648283207,2023-04-05


In [0]:
# Normalise the free-text columns:
#   title     -> lower case
#   location  -> lower case; nulls become "Not Available"
#   rent      -> "Please Contact" becomes 0, "$" and "," are removed, then cast to int
#   bed_rooms -> lower case; nulls become 0
title_lower = lower(col("title"))
location_filled = when(col("location").isNull(), "Not Available").otherwise(lower(col("location")))
rent_contact_zeroed = when(col("rent") == "Please Contact", 0).otherwise(col("rent"))
# col("rent") is resolved when the expression is applied, i.e. after the
# "Please Contact" replacement in the step before it.
rent_as_int = trim(regexp_replace(col("rent"), "[$,]", "")).cast("int")
bed_rooms_filled = when(col("bed_rooms").isNull(), 0).otherwise(lower(col("bed_rooms")))

apt_df = (
    apt_df
    .withColumn("title", title_lower)
    .withColumn("location", location_filled)
    .withColumn("rent", rent_contact_zeroed)
    .withColumn("rent", rent_as_int)
    .withColumn("bed_rooms", bed_rooms_filled)
)

In [0]:
# Collect the distinct location values to the driver for inspection.
unique_locations = apt_df.select("location").distinct().collect()

In [0]:
# Print every distinct location to spot spelling variants of the same place.
for location_row in unique_locations:
    print(location_row["location"])

town of caledon
concord
acton
oshawa / durham region
orangeville
markham / york region
bradford
caledon
toronto
burlington
milton
cobourg
blackstock
nobleton
brampton
city of toronto
churchville
mississauga
east york
kettleby
vaughan
orillia
etobicoke
whitchurch-stouffville
streetsville
erin mills
newmarket
oshawa
georgetown
ajax
bolton
new tecumseth
markham
hamilton
donevan
north york
whitby
mississauga / peel region
bowmanville
deerfield
richmond hill
halton hills
east gwillimbury
thornhill
newcastle
oakville / halton region
georgina
meadowvale village
cambridge
courtice
uxbridge
eastdale
scarborough
regional municipality of york
lefroy
old toronto
barrie
Not Available
hampton
oakville
campbellville
maple
pickering
aurora
bradford west gwillimbury
kitchener
meadowvale
township of guelph/eramosa
east credit
woodbridge
township of melancthon
valley creek
york
grimsby
malton
caledon east
ottawa
palermo west
king city


In [0]:
# Look at the rows labelled "city of toronto", one of the variants to be merged into "toronto".
apt_df.filter(col("location") == "city of toronto").show()

+----------+--------------------+---------------+----+---------------+---------------+--------------------+----------+
|        id|               title|       location|rent|      bed_rooms|    date_posted|                 url|scraped_on|
+----------+--------------------+---------------+----+---------------+---------------+--------------------+----------+
|1648230148|brand new 3-bedro...|city of toronto|3023|              3|     24/03/2023|https://www.kijij...|2023-04-05|
|1649321807|studio 1br & 2br ...|city of toronto|2050|              2| < 10 hours ago|https://www.kijij...|2023-04-05|
|1618391411|2 bedroom apartme...|city of toronto|2945|              2|     19/03/2023|https://www.kijij...|2023-04-05|
|1653373622|executive heritag...|city of toronto|4100|              3|< 8 minutes ago|https://www.kijij...|2023-04-05|
|1648282950|luxury living cal...|city of toronto|2824|              2|< 8 minutes ago|https://www.kijij...|2023-04-05|
|1648283207|luxury living cal...|city of toronto

In [0]:
@udf(returnType=StringType())
def clean_location(location):
    """Reduce a raw location string to a single canonical place name.

    The text before the first "/" is kept and trimmed, e.g.
    "mississauga / peel region" -> "mississauga". Known aliases are then
    mapped to one spelling, e.g. "city of toronto" -> "toronto".

    Args:
        location: Location text, or None. The alias lookup is case-sensitive
            and its keys are lower case, so the input is expected to be
            lower-cased already.

    Returns:
        The canonical location, or "N/A" when the input is None.
    """
    # Alias table: variant spelling -> canonical name.
    location_dict = {"city of toronto": "toronto"}

    # NOTE(review): nulls are replaced with "Not Available" before this UDF is
    # applied in this notebook, so this branch is only a safety net. The two
    # placeholders differ ("N/A" vs "Not Available") - consider unifying them.
    if location is None or str(location) == "None":
        return "N/A"

    # Trim *before* the alias lookup; otherwise a value such as
    # "city of toronto / gta" yields "city of toronto " (trailing space) and
    # misses the alias table.
    primary = location.split("/")[0].strip()
    return location_dict.get(primary, primary)

In [0]:
# Replace the location column with its cleaned form.
apt_df = apt_df.withColumn("location", clean_location(col("location")))

In [0]:
# Inspect the distinct bed_rooms values before they are cleaned.
unique_beds = apt_df.select("bed_rooms").distinct().collect()
for bed_row in unique_beds:
    print(bed_row["bed_rooms"])

2 + den
1 + den
3
4 + den
0
3 + den
5+
bachelor/studio
1
4
2


In [0]:
@udf(returnType=IntegerType())
def clean_beds(beds):
    """Convert a raw bed_rooms string to an integer bedroom count.

    Examples: "bachelor/studio" -> 0, "2 + den" -> 2, "5+" -> 5, "3" -> 3.

    Args:
        beds: Bedroom text, or None.

    Returns:
        The bedroom count; 0 for bachelor/studio units; None when the input
        is None or does not start with a whole number. Returning None keeps
        one unexpected value from failing the whole Spark job.
    """
    if beds is None:
        return None

    text = str(beds).lower()
    if 'bachelor' in text or 'studio' in text:
        return 0

    # Keep only what precedes "+", which drops "+ den" and the "+" of "5+".
    leading = text.split("+")[0].strip()
    try:
        return int(leading)
    except ValueError:
        return None

In [0]:
# Replace the bed_rooms text with its integer bedroom count.
apt_df = apt_df.withColumn("bed_rooms", clean_beds(col("bed_rooms")))

In [0]:
# Check the distinct bed_rooms values again after cleaning.
unique_beds = apt_df.select("bed_rooms").distinct().collect()
for bed_row in unique_beds:
    print(bed_row["bed_rooms"])

1
3
5
4
2
0


In [0]:
from pyspark.sql.functions import year, month, to_date, datediff
from dateutil.relativedelta import relativedelta
from datetime import datetime, date, timedelta

In [0]:
# Inspect the distinct date_posted values to see which formats must be parsed.
unique_dates = apt_df.select("date_posted").distinct().collect()
for date_row in unique_dates:
    print(date_row["date_posted"])

08/03/2023
< 9 hours ago
< 19 minutes ago
31/03/2023
< 7 hours ago
29/03/2023
13/03/2023
None
< 8 minutes ago
< 6 hours ago
< 19 hours ago
26/03/2023
25/03/2023
11/03/2023
< 2 minutes ago
06/03/2023
22/02/2023
22/03/2023
14/03/2023
< 5 hours ago
21/03/2023
< 11 hours ago
< 16 hours ago
< 18 hours ago
23/03/2023
27/03/2023
17/03/2023
< 2 hours ago
01/03/2023
< 4 hours ago
30/03/2023
< 17 hours ago
20/03/2023
18/03/2023
< 3 hours ago
09/03/2023
< 12 hours ago
19/03/2023
< 10 hours ago
24/03/2023
28/03/2023
10/03/2023
< 15 hours ago
< 8 hours ago
02/04/2023
07/03/2023
< 14 hours ago
03/04/2023
01/04/2023
< 13 hours ago
< 20 hours ago
27/02/2023
< 21 hours ago
< 23 hours ago
< 24 hours ago
02/03/2023
< 22 hours ago


In [0]:
@udf(returnType=DateType())
def parse_date_string(date_string, date_=None):
    """Convert a raw date_posted string into a calendar date.

    Handled inputs:
      * None (or the literal text "None") -> the reference date, or None when
        no reference date was supplied.
      * "yesterday"                       -> the day before the reference date.
      * "< N minutes ago", "< N hours ago" -> the reference date.
      * "dd/mm/YYYY"                      -> that date.
      * anything else                     -> None, so that one unexpected
        value does not fail the whole Spark job.

    Relative values are resolved against the reference (scrape) date, never
    against the wall clock, so reprocessing old files gives the same result.
    The reference carries no time of day, so a sub-day offset cannot be
    placed on an earlier day reliably; such postings are attributed to the
    reference date itself.

    Args:
        date_string: Raw posting-date text, or None.
        date_: Reference date as a date/datetime object or a 'YYYY-MM-DD'
            string. Defaults to today's date when omitted.

    Returns:
        A datetime.date, or None.
    """
    # Resolve the reference date to a plain date object.
    if date_ is None:
        reference = datetime.now().date()
    elif isinstance(date_, datetime):
        reference = date_.date()
    elif isinstance(date_, date):
        reference = date_
    else:
        reference = datetime.strptime(str(date_), '%Y-%m-%d').date()

    if date_string is None or str(date_string) == "None":
        return reference if date_ is not None else None

    text = str(date_string).strip().lower()
    if text == 'yesterday':
        return reference - timedelta(days=1)
    if text.startswith('<') or 'minute' in text or 'hour' in text:
        # Relative, sub-day age such as "< 9 hours ago".
        return reference
    try:
        return datetime.strptime(text, '%d/%m/%Y').date()
    except ValueError:
        return None

In [0]:
# Resolve date_posted into a real date, using scraped_on as the reference for relative values.
apt_df = apt_df.withColumn(
    "date_posted_calculated",
    parse_date_string(col("date_posted"), col("scraped_on")),
)

In [0]:
# Derive year and month columns from the resolved posting date.
posted_on = col("date_posted_calculated")
apt_df = (
    apt_df
    .withColumn("year", year(posted_on))
    .withColumn("month", month(posted_on))
)

In [0]:
# Sanity check: the transformations above should not change the row count.
apt_df.count()

Out[33]: 2655

In [0]:
# Rename the columns to their final output names (old name -> new name).
column_renames = {
    'id': 'listing_id',
    'bed_rooms': 'bedrooms',
    'date_posted': 'post_date',
    'scraped_on': 'scrape_date',
    "date_posted_calculated": "calculated_date",
}
for old_name, new_name in column_renames.items():
    apt_df = apt_df.withColumnRenamed(old_name, new_name)


In [0]:
# Inspect the final, cleaned data before it is written out.
display(apt_df)

listing_id,title,location,rent,bedrooms,post_date,url,scrape_date,calculated_date,year,month
1653664103,story of brampton central - 2 bed + 2 bath in the heart of bramp,brampton,2640.0,2,< 9 hours ago,https://www.kijiji.ca/v-apartments-condos/mississauga-peel-region/story-of-brampton-central-2-bed-2-bath-in-the-heart-of-bramp/1653664103,2023-04-05,2023-04-05,2023.0,4.0
1648230148,brand new 3-bedroom in north york! york mills rd. & dvp!,toronto,3023.0,3,24/03/2023,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/brand-new-3-bedroom-in-north-york-york-mills-rd-dvp/1648230148,2023-04-05,2023-03-24,2023.0,3.0
1649321807,studio 1br & 2br brand new condo unit at lakeshore & queens quay,toronto,2050.0,2,< 10 hours ago,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/studio-1br-2br-brand-new-condo-unit-at-lakeshore-queens-quay/1649321807,2023-04-05,2023-04-05,2023.0,4.0
1635479573,3 bedroom + 1.5 bathroom townhouse for rent!,oshawa,2695.0,3,01/04/2023,https://www.kijiji.ca/v-apartments-condos/oshawa-durham-region/3-bedroom-1-5-bathroom-townhouse-for-rent/1635479573,2023-04-05,2023-04-01,2023.0,4.0
1618391411,2 bedroom apartment for rent steps to victoria park station!,toronto,2945.0,2,19/03/2023,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/2-bedroom-apartment-for-rent-steps-to-victoria-park-station/1618391411,2023-04-05,2023-03-19,2023.0,3.0
1655532535,2 beds basement apt for rent from may 1,churchville,2400.0,2,< 2 minutes ago,https://www.kijiji.ca/v-apartments-condos/mississauga-peel-region/2-beds-basement-apt-for-rent-from-may-1/1655532535,2023-04-05,2023-04-05,2023.0,4.0
1654804799,ground floor - bachelor unit - mimico,mississauga,1545.0,0,< 8 minutes ago,https://www.kijiji.ca/v-apartments-condos/mississauga-peel-region/ground-floor-bachelor-unit-mimico/1654804799,2023-04-05,2023-04-05,2023.0,4.0
1653373622,executive heritage suite - rosedale/moore park,toronto,4100.0,3,< 8 minutes ago,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/executive-heritage-suite-rosedale-moore-park/1653373622,2023-04-05,2023-04-05,2023.0,4.0
1648282950,luxury living calling your name.,toronto,2824.0,2,< 8 minutes ago,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/luxury-living-calling-your-name/1648282950,2023-04-05,2023-04-05,2023.0,4.0
1648283207,luxury living calling your name.,toronto,2399.0,1,< 8 minutes ago,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/luxury-living-calling-your-name/1648283207,2023-04-05,2023-04-05,2023.0,4.0


###### write the processed data back to ADLS

In [0]:
# Persist the cleaned listings as Parquet, partitioned by location.
#
# 'append' is used instead of 'overwrite': the raw apt/ folder is moved to the
# raw-processed container at the end of this notebook, so each run only reads
# the files that arrived since the previous run. Overwriting would replace
# every previously processed batch with just the current one.
# NOTE(review): if a run is retried after this write succeeded but before the
# raw files were archived, remove the partial batch first to avoid duplicates.
processed_apt_path = 'abfs://processed@kijijidata.dfs.core.windows.net/apt/'
(
    apt_df.write
    .format('parquet')
    .mode('append')
    .partitionBy("location")
    .save(processed_apt_path)
)


##### move processed data from raw to raw-processed

In [0]:
# Archive the raw apartment files: move the whole apt/ folder from the raw
# container to the raw-processed container.
raw_apt_dir = "abfs://raw@kijijidata.dfs.core.windows.net/apt/"
archived_apt_dir = "abfs://raw-processed@kijijidata.dfs.core.windows.net/apt/"
dbutils.fs.mv(raw_apt_dir, archived_apt_dir, recurse=True)

Out[38]: True