##### Access Azure Data Lake and read the data using:
1. Databricks secret scope
2. Azure Key Vault

In [0]:
# Print the help text for the dbutils.secrets utility.
dbutils.secrets.help()

In [0]:
# List the secret scopes available to this notebook.
dbutils.secrets.listScopes()

Out[39]: [SecretScope(name='kijiji-data-scope')]

In [0]:
# List the secret keys stored in the 'kijiji-data-scope' scope.
dbutils.secrets.list(scope='kijiji-data-scope')

Out[40]: [SecretMetadata(key='account-key')]

In [0]:
# Read the storage account key from the secret scope rather than hard-coding it.
account_key = dbutils.secrets.get(scope='kijiji-data-scope',key='account-key')

In [0]:
# Register the account key for the kijijidata storage account on this Spark session.
account_key_conf = "fs.azure.account.key.kijijidata.dfs.core.windows.net"
spark.conf.set(account_key_conf, account_key)

##### check the access to ADLS

In [0]:
# Access check: list the top level of the "raw" container.
display(dbutils.fs.ls("abfs://raw@kijijidata.dfs.core.windows.net"))

path,name,size,modificationTime
abfs://raw@kijijidata.dfs.core.windows.net/apt/,apt/,0,1680729431000
abfs://raw@kijijidata.dfs.core.windows.net/house/,house/,0,1680729436000


In [0]:
# List the entries under raw/apt/.
display(dbutils.fs.ls("abfs://raw@kijijidata.dfs.core.windows.net/apt/"))

path,name,size,modificationTime
abfs://raw@kijijidata.dfs.core.windows.net/apt/2023-04-06/,2023-04-06/,0,1680788559000


In [0]:
# List the CSV files inside the 2023-04-06 folder.
display(dbutils.fs.ls("abfs://raw@kijijidata.dfs.core.windows.net/apt/2023-04-06/"))

path,name,size,modificationTime
abfs://raw@kijijidata.dfs.core.windows.net/apt/2023-04-06/kijiji_apt_1_2023-04-06.csv,kijiji_apt_1_2023-04-06.csv,203763,1680788489000
abfs://raw@kijijidata.dfs.core.windows.net/apt/2023-04-06/kijiji_apt_21_2023-04-06.csv,kijiji_apt_21_2023-04-06.csv,82250,1680788557000
abfs://raw@kijijidata.dfs.core.windows.net/apt/2023-04-06/kijiji_apt_41_2023-04-06.csv,kijiji_apt_41_2023-04-06.csv,72137,1680788559000
abfs://raw@kijijidata.dfs.core.windows.net/apt/2023-04-06/kijiji_apt_61_2023-04-06.csv,kijiji_apt_61_2023-04-06.csv,92029,1680788552000
abfs://raw@kijijidata.dfs.core.windows.net/apt/2023-04-06/kijiji_apt_81_2023-04-06.csv,kijiji_apt_81_2023-04-06.csv,82485,1680788555000


In [0]:
# Preview one raw file with no header/schema options, so columns show up as _c0.._c8
# and the header line appears as the first data row.
display(spark.read.csv("abfs://raw@kijijidata.dfs.core.windows.net/apt/2023-04-06/kijiji_apt_1_2023-04-06.csv").limit(5))

_c0,_c1,_c2,_c3,_c4,_c5,_c6,_c7,_c8
,id,title,location,rent,bed_rooms,date_posted,url,scraped_on
0.0,1649856293,Large Luxury 2-Bedroom Toronto Apartment South West Facing,Toronto,"$2,799.00",2,01/04/2023,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/large-luxury-2-bedroom-toronto-apartment-south-west-facing/1649856293,2023-04-06
1.0,1649856299,"1 MONTH FREE! Deluxe Open Concept 1 Bedroom Apartment, Toronto",Toronto,"$2,349.00",1,< 15 hours ago,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/1-month-free-deluxe-open-concept-1-bedroom-apartment-toronto/1649856299,2023-04-06
2.0,1653158018,"Sunny, spacious 2 bdrm. circa 1898 Edwardian Roncy Village",Toronto,"$2,595.00",2,< 22 hours ago,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/sunny-spacious-2-bdrm-circa-1898-edwardian-roncy-village/1653158018,2023-04-06
3.0,1651533067,1 Bedroom Large Renovated Apartment For Rent in Toronto - 90 Eas,Toronto,"$2,195.00",1,26/03/2023,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/1-bedroom-large-renovated-apartment-for-rent-in-toronto-90-eas/1651533067,2023-04-06


###### Perform necessary transformations and save the result as parquet to ADLS
1. Read apartment listings and house rental listings from ADLS
2. Specify the schema
3. Drop the null column
4. Convert the title column to lowercase
5. Clean location - group equivalent names into one, e.g. City of Toronto, Toronto --> toronto
6. Convert the rent column to integer and replace the text "Please Contact" with 0
7. Clean the bed_rooms column and cast it to int
8. Recalculate the date_posted column, e.g. "21 hours ago" should be converted to a proper date
9. Create two additional columns: year and month

In [0]:
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, DateType
from pyspark.sql.functions import lower, udf, col, when, regexp_replace, trim, to_date, date_add
from dateutil.relativedelta import relativedelta

In [0]:
# Explicit schema for the raw rental CSV files; every column is nullable.
# "null" is a placeholder name for the unnamed leading column of the CSV.
# NOTE(review): id is a 32-bit IntegerType; listing ids above 2,147,483,647
# would not fit - confirm the expected id range.
rental_schema = (
    StructType()
    .add("null", IntegerType(), True)
    .add("id", IntegerType(), True)
    .add("title", StringType(), True)
    .add("location", StringType(), True)
    .add("rent", StringType(), True)
    .add("bed_rooms", StringType(), True)
    .add("date_posted", StringType(), True)
    .add("url", StringType(), True)
    .add("scraped_on", DateType(), True)
)

In [0]:
# Read every CSV under raw/apt/ using the explicit schema, then discard the
# placeholder "null" column.
apt_csv_glob = "abfs://raw@kijijidata.dfs.core.windows.net/apt/**/*.csv"
apt_reader = spark.read.option("header", True).schema(rental_schema)
apt_df = apt_reader.csv(apt_csv_glob).drop("null")

In [0]:
# Row count right after loading, before rows without an id are removed.
apt_df.count()

Out[50]: 2304

In [0]:
# Discard rows whose id is null.
apt_df = apt_df.na.drop(subset=["id"])

In [0]:
# Row count after removing rows with a null id.
apt_df.count()

Out[52]: 2295

In [0]:
# Preview the first rows of the loaded listings.
display(apt_df.limit(5))

id,title,location,rent,bed_rooms,date_posted,url,scraped_on
1649856293,Large Luxury 2-Bedroom Toronto Apartment South West Facing,Toronto,"$2,799.00",2,01/04/2023,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/large-luxury-2-bedroom-toronto-apartment-south-west-facing/1649856293,2023-04-06
1649856299,"1 MONTH FREE! Deluxe Open Concept 1 Bedroom Apartment, Toronto",Toronto,"$2,349.00",1,< 15 hours ago,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/1-month-free-deluxe-open-concept-1-bedroom-apartment-toronto/1649856299,2023-04-06
1653158018,"Sunny, spacious 2 bdrm. circa 1898 Edwardian Roncy Village",Toronto,"$2,595.00",2,< 22 hours ago,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/sunny-spacious-2-bdrm-circa-1898-edwardian-roncy-village/1653158018,2023-04-06
1651533067,1 Bedroom Large Renovated Apartment For Rent in Toronto - 90 Eas,Toronto,"$2,195.00",1,26/03/2023,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/1-bedroom-large-renovated-apartment-for-rent-in-toronto-90-eas/1651533067,2023-04-06
1654957801,3Bdrm+2 Bath-Penthouse (1660 sqft) at Queen and Spadina,Toronto,"$3,950.00",3,30/03/2023,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/3bdrm-2-bath-penthouse-1660-sqft-at-queen-and-spadina/1654957801,2023-04-06


In [0]:
# Normalise the text columns and convert rent to an integer.
apt_df = (
    apt_df
    .withColumn("title", lower(col("title")))
    .withColumn("date_posted", lower(col("date_posted")))
    # Missing locations get an explicit placeholder; the rest are lower-cased.
    .withColumn("location", when(col("location").isNull(), "Not Available").otherwise(lower(col("location"))))
    # "Please Contact" means no price was given; treat it as 0.
    .withColumn("rent", when(col("rent") == "Please Contact", 0).otherwise(col("rent")))
    # Remove "$" and thousands separators before casting to int.
    .withColumn("rent", trim(regexp_replace(col("rent"), "[$,]", "")).cast("int"))
    # Missing bedroom info becomes 0; the rest are lower-cased.
    .withColumn("bed_rooms", when(col("bed_rooms").isNull(), 0).otherwise(lower(col("bed_rooms"))))
)

In [0]:
# Collect the distinct location values to the driver for inspection.
distinct_location_df = apt_df.select("location").distinct()
unique_locations = distinct_location_df.collect()

In [0]:
# Print one distinct location per line.
for location_row in unique_locations:
    print(location_row["location"])

kitchener
town of caledon
concord
acton
oshawa / durham region
orangeville
markham / york region
meadowvale
roseneath
bradford
ashburn
caledon
toronto
burlington
schomberg
milton
cobourg
city of toronto
brampton
township of guelph/eramosa
mississauga
east york
kettleby
vaughan
mineola
toronto m9m 0b5
etobicoke
whitchurch-stouffville
erin mills
newmarket
oshawa
east credit
georgetown
ajax
bolton
cooksville
markham
palermo west
unionville
north york
king
whitby
mississauga / peel region
woodbridge
bowmanville
richmond hill
halton hills
thornhill
newcastle
niagara on the lake
oakville / halton region
meadowvale village
georgina
courtice
uxbridge
missisauga
eastdale
scarborough
york
old toronto
lakeview
hampton
oakville
campbellville
maple
pickering
aurora
bradford west gwillimbury
inglewood
churchville
north yorks
hamilton
port perry
bramalea
rockwood
Not Available
regional municipality of peel
innisfil
donevan
lefroy
barrie
windfields


In [0]:
# Show a few rows whose location is still "city of toronto".
apt_df.filter(col("location") == "city of toronto").show(5)

+----------+--------------------+---------------+----+---------------+---------------+--------------------+----------+
|        id|               title|       location|rent|      bed_rooms|    date_posted|                 url|scraped_on|
+----------+--------------------+---------------+----+---------------+---------------+--------------------+----------+
|1627845120|1 bedroom east yo...|city of toronto|1949|              1|< 7 minutes ago|https://www.kijij...|2023-04-06|
|1635837021|large fully renov...|city of toronto|2039|              1|< 8 minutes ago|https://www.kijij...|2023-04-06|
|1635836521|newly renovated b...|city of toronto|1762|bachelor/studio|< 8 minutes ago|https://www.kijij...|2023-04-06|
|1626968899|east york reno 1 ...|city of toronto|2144|              1|< 8 minutes ago|https://www.kijij...|2023-04-06|
|1645423583|new renos, large ...|city of toronto|1979|              1|< 8 minutes ago|https://www.kijij...|2023-04-06|
+----------+--------------------+---------------

In [0]:
@udf(returnType=StringType())
def clean_location(location):
    """Reduce a raw listing location to a single canonical place name.

    Only the text before the first "/" is kept
    (e.g. "mississauga / peel region" -> "mississauga"), surrounding
    whitespace is trimmed, and known aliases are then mapped to their
    canonical spelling (e.g. "city of toronto" -> "toronto").

    Args:
        location: Raw location text, or None for a missing value.

    Returns:
        The cleaned location string, or "N/A" when ``location`` is None
        (or is the literal text "None").
    """
    # Alias -> canonical name. Keys are matched against the trimmed text.
    location_dict = {"city of toronto": "toronto"}
    if str(location) == "None":
        return "N/A"
    # Trim BEFORE the alias lookup: splitting "city of toronto / ..." leaves a
    # trailing space, which previously made the lookup miss the alias.
    primary = location.split("/")[0].strip()
    return location_dict.get(primary, primary)

In [0]:
# Replace the location column with its cleaned form.
apt_df = apt_df.withColumn("location", clean_location(col("location")))

In [0]:
# Inspect the distinct bed_rooms values currently in the frame.
distinct_beds_df = apt_df.select("bed_rooms").distinct()
unique_beds = distinct_beds_df.collect()
for bed_row in unique_beds:
    print(bed_row["bed_rooms"])

2 + den
1 + den
3
4 + den
0
3 + den
5+
bachelor/studio
1
4
2


In [0]:
@udf(returnType=IntegerType())
def clean_beds(beds):
    """Convert a bedroom description into a whole number of bedrooms.

    Values containing "bachelor" or "studio" (case-sensitive) count as 0.
    Otherwise the text before the first "+" is parsed as an integer, so
    "2 + den" -> 2 and "5+" -> 5.

    Args:
        beds: Bedroom description text, or None for a missing value.

    Returns:
        The bedroom count as an int, or None when ``beds`` is None.

    Raises:
        ValueError: If the text before the first "+" is not an integer.
    """
    # A null input would otherwise raise TypeError on the ``in`` checks below
    # and fail the whole Spark job; propagate it as null instead.
    if beds is None:
        return None
    if 'bachelor' in beds or 'studio' in beds:
        return 0

    # int() tolerates surrounding whitespace, so "2 " parses as 2.
    return int(beds.split("+")[0])

In [0]:
# Replace bed_rooms with its integer form.
apt_df = apt_df.withColumn("bed_rooms", clean_beds(col("bed_rooms")))

In [0]:
# Re-check the distinct bed_rooms values after cleaning.
distinct_beds_df = apt_df.select("bed_rooms").distinct()
unique_beds = distinct_beds_df.collect()
for bed_row in unique_beds:
    print(bed_row["bed_rooms"])

1
3
5
4
2
0


In [0]:
from pyspark.sql.functions import year, month, to_date, datediff
from dateutil.relativedelta import relativedelta
from datetime import datetime, date, timedelta

In [0]:
# Inspect the distinct date_posted strings to see which formats must be parsed.
distinct_dates_df = apt_df.select("date_posted").distinct()
unique_dates = distinct_dates_df.collect()
for date_row in unique_dates:
    print(date_row["date_posted"])

< 19 minutes ago
< 9 hours ago
< 30 minutes ago
< 9 minutes ago
< 14 minutes ago
31/03/2023
< 7 hours ago
< 41 minutes ago
< 40 minutes ago
< 58 minutes ago
13/03/2023
< 23 minutes ago
< 59 minutes ago
< 31 minutes ago
< 35 minutes ago
None
< 7 minutes ago
< 52 minutes ago
< 8 minutes ago
< 6 hours ago
< 37 minutes ago
< 19 hours ago
< 26 minutes ago
26/03/2023
< 33 minutes ago
< 10 minutes ago
25/03/2023
11/03/2023
< 49 minutes ago
< 2 minutes ago
< 46 minutes ago
22/03/2023
< 51 minutes ago
< 22 hours ago
< 1 minute ago
< 22 minutes ago
< 5 hours ago
< 11 hours ago
< 16 hours ago
< 18 minutes ago
< 18 hours ago
23/03/2023
27/03/2023
27/02/2023
< 16 minutes ago
17/03/2023
< 57 minutes ago
04/04/2023
< 60 minutes ago
01/03/2023
< 2 hours ago
< 4 hours ago
< 21 hours ago
30/03/2023
< 39 minutes ago
< 17 hours ago
20/03/2023
18/03/2023
yesterday
< 29 minutes ago
< 3 hours ago
< 55 minutes ago
09/03/2023
< 12 hours ago
< 36 minutes ago
< 13 minutes ago
< 21 minutes ago
19/03/2023
< 10 hou

In [0]:
@udf(returnType=DateType())
def parse_date_string(date_string, date_=None):
    """Turn a listing's "date posted" text into a calendar date.

    Recognised inputs (matched case-insensitively, ignoring outer whitespace):
      * None                      -> ``date_`` is returned unchanged
      * "yesterday"               -> the day before the reference day
      * "< N minutes/hours ago"   -> the reference day
      * "dd/mm/YYYY"              -> that date

    The reference day is ``date_`` (the scrape date) when given, otherwise
    today's date.

    Args:
        date_string: Raw "date posted" text, or None.
        date_: Reference day as a date or "YYYY-MM-DD" string, or None.

    Returns:
        A ``datetime.date`` (or ``date_`` itself when ``date_string`` is None).

    Raises:
        ValueError: If ``date_string`` matches none of the formats above, or
            ``date_`` is not in "YYYY-MM-DD" form.
    """
    if str(date_string) == "None":
        return date_

    if date_ is None:
        reference_day = datetime.now().date()
    else:
        reference_day = datetime.strptime(str(date_), '%Y-%m-%d').date()

    text = date_string.strip().lower()

    if text == 'yesterday':
        return reference_day - timedelta(days=1)

    # Relative ages ("< 15 hours ago", "< 7 minutes ago") are anchored on the
    # reference day. The previous code subtracted N *minutes* from the
    # wall-clock time for "hours" values and returned a datetime, so re-running
    # on a later day produced dates unrelated to the scrape date.
    # NOTE(review): the reference has day resolution only, so an "hours ago"
    # listing that was actually posted the previous evening is still assigned
    # the reference day.
    if 'minute' in text or text.startswith('<'):
        return reference_day

    return datetime.strptime(text, '%d/%m/%Y').date()

In [0]:
# Derive an absolute posting date from the raw date text and the scrape date.
posted_on = parse_date_string(col("date_posted"), col("scraped_on"))
apt_df = apt_df.withColumn("date_posted_calculated", posted_on)

In [0]:
# Add year and month columns taken from the calculated posting date.
apt_df = (
    apt_df
    .withColumn("year", year(col("date_posted_calculated")))
    .withColumn("month", month(col("date_posted_calculated")))
)

In [0]:
# Row count after the transformations, for comparison with the earlier counts.
apt_df.count()

Out[69]: 2295

In [0]:
# Rename columns to their final output names (old name -> new name).
column_renames = {
    'id': 'listing_id',
    'bed_rooms': 'bedrooms',
    'date_posted': 'post_date',
    'scraped_on': 'scrape_date',
    "date_posted_calculated": "calculated_date",
}
for old_name, new_name in column_renames.items():
    apt_df = apt_df.withColumnRenamed(old_name, new_name)


In [0]:
# Preview the first rows of the transformed, renamed listings.
display(apt_df.limit(5))

listing_id,title,location,rent,bedrooms,post_date,url,scrape_date,calculated_date,year,month
1649856293,large luxury 2-bedroom toronto apartment south west facing,toronto,2799,2,01/04/2023,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/large-luxury-2-bedroom-toronto-apartment-south-west-facing/1649856293,2023-04-06,2023-04-01,2023,4
1649856299,"1 month free! deluxe open concept 1 bedroom apartment, toronto",toronto,2349,1,< 15 hours ago,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/1-month-free-deluxe-open-concept-1-bedroom-apartment-toronto/1649856299,2023-04-06,2023-04-06,2023,4
1653158018,"sunny, spacious 2 bdrm. circa 1898 edwardian roncy village",toronto,2595,2,< 22 hours ago,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/sunny-spacious-2-bdrm-circa-1898-edwardian-roncy-village/1653158018,2023-04-06,2023-04-06,2023,4
1651533067,1 bedroom large renovated apartment for rent in toronto - 90 eas,toronto,2195,1,26/03/2023,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/1-bedroom-large-renovated-apartment-for-rent-in-toronto-90-eas/1651533067,2023-04-06,2023-03-26,2023,3
1654957801,3bdrm+2 bath-penthouse (1660 sqft) at queen and spadina,toronto,3950,3,30/03/2023,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/3bdrm-2-bath-penthouse-1660-sqft-at-queen-and-spadina/1654957801,2023-04-06,2023-03-30,2023,3


###### write the processed data back to ADLS

In [0]:
# Append the processed listings as parquet to the "processed" container,
# partitioned by location.
processed_apt_path = 'abfs://processed@kijijidata.dfs.core.windows.net/apt/'
apt_writer = apt_df.write.format('parquet').mode('append').partitionBy("location")
apt_writer.save(processed_apt_path)


##### move processed data from raw to raw-processed

In [0]:
# Move the raw apt/ directory into the raw-processed container; recurse=True is
# passed so the directory's contents are moved with it.
# NOTE(review): behaviour when the destination apt/ directory already exists is
# not visible here - confirm before re-running.
dbutils.fs.mv("abfs://raw@kijijidata.dfs.core.windows.net/apt/", "abfs://raw-processed@kijijidata.dfs.core.windows.net/apt/", recurse=True)