##### Access Azure Data Lake Storage (ADLS) and read the data using
1. Databricks secret scope
2. Azure Key Vault

In [0]:
# Print the built-in help for the Databricks secrets utility; the result is not used further.
dbutils.secrets.help()

In [0]:
# List the available secret scopes (the recorded output shows a single 'kijiji-scope').
dbutils.secrets.listScopes()

Out[41]: [SecretScope(name='kijiji-scope')]

In [0]:
# List the secret keys stored in 'kijiji-scope'; only metadata is returned, not the secret values.
dbutils.secrets.list(scope='kijiji-scope')

Out[42]: [SecretMetadata(key='account-key')]

In [0]:
# Fetch the storage account key from the secret scope rather than hard-coding it.
account_key = dbutils.secrets.get(
    scope='kijiji-scope',
    key='account-key',
)

In [0]:
# Hand the account key to Spark so reads/writes against the 'kijijidata'
# storage account (dfs endpoint) are authorised for this session.
adls_key_setting = "fs.azure.account.key.kijijidata.dfs.core.windows.net"
spark.conf.set(adls_key_setting, account_key)

##### check the access to ADLS

In [0]:
# Smoke test for the credentials: list the root of the 'raw' container.
display(dbutils.fs.ls("abfs://raw@kijijidata.dfs.core.windows.net"))

path,name,size,modificationTime
abfs://raw@kijijidata.dfs.core.windows.net/house/,house/,0,1680633487000


In [0]:
# List the house/ folder; the recorded output shows one sub-folder per scrape date.
display(dbutils.fs.ls("abfs://raw@kijijidata.dfs.core.windows.net/house/"))

path,name,size,modificationTime
abfs://raw@kijijidata.dfs.core.windows.net/house/2023-04-04/,2023-04-04/,0,1680638992000
abfs://raw@kijijidata.dfs.core.windows.net/house/2023-04-05/,2023-04-05/,0,1680688976000


In [0]:
# List the CSV files scraped on 2023-04-04.
display(dbutils.fs.ls("abfs://raw@kijijidata.dfs.core.windows.net/house/2023-04-04/"))

path,name,size,modificationTime
abfs://raw@kijijidata.dfs.core.windows.net/house/2023-04-04/kijiji_house_1_2023-04-04.csv,kijiji_house_1_2023-04-04.csv,142464,1680638914000
abfs://raw@kijijidata.dfs.core.windows.net/house/2023-04-04/kijiji_house_21_2023-04-04.csv,kijiji_house_21_2023-04-04.csv,10189,1680638962000
abfs://raw@kijijidata.dfs.core.windows.net/house/2023-04-04/kijiji_house_41_2023-04-04.csv,kijiji_house_41_2023-04-04.csv,37500,1680638987000
abfs://raw@kijijidata.dfs.core.windows.net/house/2023-04-04/kijiji_house_61_2023-04-04.csv,kijiji_house_61_2023-04-04.csv,37469,1680638992000
abfs://raw@kijijidata.dfs.core.windows.net/house/2023-04-04/kijiji_house_81_2023-04-04.csv,kijiji_house_81_2023-04-04.csv,43418,1680638989000


In [0]:
# Peek at one raw file with default reader options: no header/schema is applied,
# so columns come back as _c0.._c8 and the header line shows up as the first data row.
display(spark.read.csv("abfs://raw@kijijidata.dfs.core.windows.net/house/2023-04-04/kijiji_house_21_2023-04-04.csv"))

_c0,_c1,_c2,_c3,_c4,_c5,_c6,_c7,_c8
,id,title,location,rent,bed_rooms,date_posted,url,scraped_on
0.0,1652036258,Ranch Style Bungalow Walk Out Basement - For Rent!,Caledon,"$1,900.00",1 + Den,21/03/2023,https://www.kijiji.ca/v-apartments-condos/oakville-halton-region/ranch-style-bungalow-walk-out-basement-for-rent/1652036258,2023-04-04
1.0,1572249157,BUY HOUSE WITH OUR EXCLUSIVE $ 0 DOWN OR RENT TO OWN PROGRAMS,Mississauga / Peel Region,Please Contact,4,02/04/2023,https://www.kijiji.ca/v-apartments-condos/mississauga-peel-region/buy-house-with-our-exclusive-0-down-or-rent-to-own-programs/1572249157,2023-04-04
2.0,1654339130,"Main floor-bright, renovated, quiet court-May 1",Toronto,"$2,750.00",2 + Den,< 24 hours ago,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/main-floor-bright-renovated-quiet-court-may-1/1654339130,2023-04-04
3.0,1569609232,RENT TO OWN A HOUSE OR BUY WITH OUR ZERO DOWN PROGRAM,City of Toronto,Please Contact,3,02/04/2023,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/rent-to-own-a-house-or-buy-with-our-zero-down-program/1569609232,2023-04-04
4.0,1655040573,Bright and clean 3+1 Br semi in Bronte Oakville & steps to lake,Oakville,"$3,700.00",3,31/03/2023,https://www.kijiji.ca/v-apartments-condos/oakville-halton-region/bright-and-clean-3-1-br-semi-in-bronte-oakville-steps-to-lake/1655040573,2023-04-04
5.0,1651462381,3 Bedroom House Available for Rent at Prime Location,Brampton,Please Contact,3,24/02/2023,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/3-bedroom-house-available-for-rent-at-prime-location/1651462381,2023-04-04
6.0,1651438543,4 Bedrooms 4 bath Fully Detached House for Rent,Brampton,"$3,500.00",4,24/02/2023,https://www.kijiji.ca/v-apartments-condos/oakville-halton-region/4-bedrooms-4-bath-fully-detached-house-for-rent/1651438543,2023-04-04
7.0,1651427355,Spacious detached home with separate garage downtown unfurnished,Toronto,"$4,500.00",4,24/02/2023,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/spacious-detached-home-with-separate-garage-downtown-unfurnished/1651427355,2023-04-04
8.0,1651426445,Beautiful Modern Semi-detached 3 + Bedroom Caledon Home for Rent,Caledon,"$3,000.00",3 + Den,24/02/2023,https://www.kijiji.ca/v-apartments-condos/mississauga-peel-region/beautiful-modern-semi-detached-3-bedroom-caledon-home-for-rent/1651426445,2023-04-04


###### Perform necessary transformations and save the result as parquet to ADLS
1. Read apartment listings and house rental listings from ADLS
2. Specify the schema
3. Drop the null column
4. Convert the title column to lower case
5. Clean location - group equivalent text into one value, e.g. City of Toronto, Toronto --> toronto
6. Convert the rent column to integer and replace the text "Please Contact" with 0
7. Clean the bed_rooms column and cast it to int
8. Recalculate the date_posted column, e.g. "21 hours ago" should be converted to a proper date
9. Create two additional columns: year and month

In [0]:
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, DateType
from pyspark.sql.functions import lower, udf, col, when, regexp_replace, trim, to_date, date_add
from dateutil.relativedelta import relativedelta

In [0]:
# Set the timeParserPolicy to LEGACY so Spark uses its pre-3.0 date/time parsing rules.
# NOTE(review): nothing in this notebook shows which pattern requires it - confirm it is still needed.
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")

In [0]:
# Explicit schema for the raw rental CSVs. The first CSV column has an empty
# header; it is named "null" here so it can be dropped right after loading.
# rent, bed_rooms and date_posted stay as strings because they hold free text
# (e.g. "Please Contact", "3 + Den", "< 2 hours ago") that is cleaned later.
# NOTE(review): ids in the sample output are around 1.65e9, not far below the
# 32-bit IntegerType maximum (2,147,483,647) - consider LongType if ids keep growing.
rental_schema = (
    StructType()
    .add("null", IntegerType(), True)
    .add("id", IntegerType(), True)
    .add("title", StringType(), True)
    .add("location", StringType(), True)
    .add("rent", StringType(), True)
    .add("bed_rooms", StringType(), True)
    .add("date_posted", StringType(), True)
    .add("url", StringType(), True)
    .add("scraped_on", DateType(), True)
)

In [0]:
# Load every scraped house CSV (all date folders) with the explicit schema,
# then discard the unnamed index column.
raw_house_glob = "abfs://raw@kijijidata.dfs.core.windows.net/house/**/*.csv"
house_df = (
    spark.read
    .option("header", True)
    .schema(rental_schema)
    .csv(raw_house_glob)
    .drop("null")
)

In [0]:
# Row count right after loading (baseline for the null-id cleanup below).
house_df.count()

Out[68]: 2450

In [0]:
# Preview the loaded listings before any cleaning.
display(house_df)

id,title,location,rent,bed_rooms,date_posted,url,scraped_on
1560281969.0,BUY HOUSE WITH ZERO DOWN OR RENT TO OWN PROGRAM,Oshawa / Durham Region,Please Contact,4,30/03/2023,https://www.kijiji.ca/v-apartments-condos/oshawa-durham-region/buy-house-with-zero-down-or-rent-to-own-program/1560281969,2023-04-05
1652711864.0,Gorgeous 4 Bdrm 4 Baths Detached House for Lease in Bradford,Bradford,"$3,500.00",4,31/03/2023,https://www.kijiji.ca/v-apartments-condos/markham-york-region/gorgeous-4-bdrm-4-baths-detached-house-for-lease-in-bradford/1652711864,2023-04-05
1572249157.0,BUY HOUSE WITH OUR EXCLUSIVE $ 0 DOWN OR RENT TO OWN PROGRAMS,Mississauga / Peel Region,Please Contact,4,02/04/2023,https://www.kijiji.ca/v-apartments-condos/mississauga-peel-region/buy-house-with-our-exclusive-0-down-or-rent-to-own-programs/1572249157,2023-04-05
1569609232.0,RENT TO OWN A HOUSE OR BUY WITH OUR ZERO DOWN PROGRAM,City of Toronto,Please Contact,3,02/04/2023,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/rent-to-own-a-house-or-buy-with-our-zero-down-program/1569609232,2023-04-05
1568088018.0,BUY HOUSE WITH ZERO DOWN OR RENT TO OWN PROGRAM,Markham / York Region,Please Contact,4,14/03/2023,https://www.kijiji.ca/v-apartments-condos/markham-york-region/buy-house-with-zero-down-or-rent-to-own-program/1568088018,2023-04-05
1653373622.0,Executive Heritage Suite - Rosedale/Moore Park,City of Toronto,"$4,100.00",3,< 8 minutes ago,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/executive-heritage-suite-rosedale-moore-park/1653373622,2023-04-05
1654804696.0,"Modern, Bright & Spacious 3-BR Victorian House in Vibrant Annex/",City of Toronto,"$5,850.00",3 + Den,< 8 minutes ago,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/modern-bright-spacious-3-br-victorian-house-in-vibrant-annex/1654804696,2023-04-05
1655528202.0,Toronto Prime Beaches 3 bed 2 bath home south of queen!!,Toronto,"$4,200.00",3,< 2 hours ago,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/toronto-prime-beaches-3-bed-2-bath-home-south-of-queen/1655528202,2023-04-05
1633306682.0,House for rent in Georgina RENOVATED HIGH CEILING 4BR 3bath,Georgina,"$3,400.00",4,< 4 hours ago,https://www.kijiji.ca/v-apartments-condos/markham-york-region/house-for-rent-in-georgina-renovated-high-ceiling-4br-3bath/1633306682,2023-04-05
1655526036.0,2 bedrooms apartment in basement for rent,Brampton,"$2,000.00",2,< 5 hours ago,https://www.kijiji.ca/v-apartments-condos/mississauga-peel-region/2-bedrooms-apartment-in-basement-for-rent/1655526036,2023-04-05


In [0]:
# Keep only rows that carry a listing id; rows with a null id are discarded.
house_df = house_df.filter(col("id").isNotNull())

In [0]:
# Row count after removing rows without an id (recorded run: 2450 -> 2448).
house_df.count()

Out[71]: 2448

In [0]:
# Preview the listings after the null-id rows were removed.
display(house_df)

id,title,location,rent,bed_rooms,date_posted,url,scraped_on
1560281969,BUY HOUSE WITH ZERO DOWN OR RENT TO OWN PROGRAM,Oshawa / Durham Region,Please Contact,4,30/03/2023,https://www.kijiji.ca/v-apartments-condos/oshawa-durham-region/buy-house-with-zero-down-or-rent-to-own-program/1560281969,2023-04-05
1652711864,Gorgeous 4 Bdrm 4 Baths Detached House for Lease in Bradford,Bradford,"$3,500.00",4,31/03/2023,https://www.kijiji.ca/v-apartments-condos/markham-york-region/gorgeous-4-bdrm-4-baths-detached-house-for-lease-in-bradford/1652711864,2023-04-05
1572249157,BUY HOUSE WITH OUR EXCLUSIVE $ 0 DOWN OR RENT TO OWN PROGRAMS,Mississauga / Peel Region,Please Contact,4,02/04/2023,https://www.kijiji.ca/v-apartments-condos/mississauga-peel-region/buy-house-with-our-exclusive-0-down-or-rent-to-own-programs/1572249157,2023-04-05
1569609232,RENT TO OWN A HOUSE OR BUY WITH OUR ZERO DOWN PROGRAM,City of Toronto,Please Contact,3,02/04/2023,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/rent-to-own-a-house-or-buy-with-our-zero-down-program/1569609232,2023-04-05
1568088018,BUY HOUSE WITH ZERO DOWN OR RENT TO OWN PROGRAM,Markham / York Region,Please Contact,4,14/03/2023,https://www.kijiji.ca/v-apartments-condos/markham-york-region/buy-house-with-zero-down-or-rent-to-own-program/1568088018,2023-04-05
1653373622,Executive Heritage Suite - Rosedale/Moore Park,City of Toronto,"$4,100.00",3,< 8 minutes ago,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/executive-heritage-suite-rosedale-moore-park/1653373622,2023-04-05
1654804696,"Modern, Bright & Spacious 3-BR Victorian House in Vibrant Annex/",City of Toronto,"$5,850.00",3 + Den,< 8 minutes ago,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/modern-bright-spacious-3-br-victorian-house-in-vibrant-annex/1654804696,2023-04-05
1655528202,Toronto Prime Beaches 3 bed 2 bath home south of queen!!,Toronto,"$4,200.00",3,< 2 hours ago,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/toronto-prime-beaches-3-bed-2-bath-home-south-of-queen/1655528202,2023-04-05
1633306682,House for rent in Georgina RENOVATED HIGH CEILING 4BR 3bath,Georgina,"$3,400.00",4,< 4 hours ago,https://www.kijiji.ca/v-apartments-condos/markham-york-region/house-for-rent-in-georgina-renovated-high-ceiling-4br-3bath/1633306682,2023-04-05
1655526036,2 bedrooms apartment in basement for rent,Brampton,"$2,000.00",2,< 5 hours ago,https://www.kijiji.ca/v-apartments-condos/mississauga-peel-region/2-bedrooms-apartment-in-basement-for-rent/1655526036,2023-04-05


In [0]:
# Normalise the free-text columns:
#   * title / date_posted / location / bed_rooms are lower-cased
#   * a missing location becomes "Not Available", a missing bed_rooms becomes 0
#   * rent: "Please Contact" becomes 0, then "$" and "," are stripped and the
#     remaining text is cast to int
lowered_location = lower(col("location"))
lowered_bed_rooms = lower(col("bed_rooms"))
rent_or_zero = when(col("rent") == "Please Contact", 0).otherwise(col("rent"))
rent_as_int = trim(regexp_replace(col("rent"), "[$,]", "")).cast("int")

house_df = (
    house_df
    .withColumn("title", lower(col("title")))
    .withColumn("date_posted", lower(col("date_posted")))
    .withColumn("location", when(col("location").isNull(), "Not Available").otherwise(lowered_location))
    .withColumn("rent", rent_or_zero)
    .withColumn("rent", rent_as_int)
    .withColumn("bed_rooms", when(col("bed_rooms").isNull(), 0).otherwise(lowered_bed_rooms))
)

In [0]:
# Pull the distinct location values to the driver so they can be inspected by eye.
unique_locations = house_df.select("location").distinct().collect()

In [0]:
# Print one distinct location per line to spot values that should be merged.
for location_row in unique_locations:
    print(location_row["location"])

concord
oshawa / durham region
orangeville
markham / york region
queensville
roseneath
bradford
caledon
toronto
burlington
erin
milton
cobourg
blackstock
city of toronto
brampton
mississauga
churchville
vaughan
orillia
etobicoke
whitchurch-stouffville
erin mills
newmarket
oshawa
georgetown
ajax
bolton
new tecumseth
markham
hamilton
whitby
mississauga / peel region
bowmanville
deerfield
mount albert
richmond hill
halton hills
township of melancthon
east gwillimbury
norval
thornhill
newcastle
oakville / halton region
georgina
west oak trails
king city
courtice
valley creek
rockwood
scarborough
regional municipality of york
old toronto
Not Available
hampton
oakville
campbellville
aurora
pickering
windfields
newtonville
bradford west gwillimbury
caledon east
ashburn
streetsville
woodbridge
spring valley


In [0]:
# Look at the rows labelled "city of toronto" (to be merged into "toronto").
house_df.filter(col("location") == "city of toronto").show()

+----------+--------------------+---------------+----+---------+---------------+--------------------+----------+
|        id|               title|       location|rent|bed_rooms|    date_posted|                 url|scraped_on|
+----------+--------------------+---------------+----+---------+---------------+--------------------+----------+
|1569609232|rent to own a hou...|city of toronto|   0|        3|     02/04/2023|https://www.kijij...|2023-04-05|
|1653373622|executive heritag...|city of toronto|4100|        3|< 8 minutes ago|https://www.kijij...|2023-04-05|
|1654804696|modern, bright & ...|city of toronto|5850|  3 + den|< 8 minutes ago|https://www.kijij...|2023-04-05|
|1655525081| house for rent 2800|city of toronto|2800|  3 + den|  < 6 hours ago|https://www.kijij...|2023-04-05|
|1655519842|3 bedroom house i...|city of toronto|2850|        3|  < 8 hours ago|https://www.kijij...|2023-04-05|
|1655510231|female roomate wa...|city of toronto|1200|        4| < 10 hours ago|https://www.kiji

In [0]:
@udf(returnType=StringType())
def clean_location(location):
    """Reduce a location label to its primary place name.

    The text before the first "/" is kept and trimmed, e.g.
    "mississauga / peel region" -> "mississauga". Known aliases are then
    mapped to a single spelling, e.g. "city of toronto" -> "toronto".

    Args:
        location: Location text, or None.

    Returns:
        The cleaned location string, or "N/A" when the input is None
        (or the literal text "None").
    """
    location_dict = {"city of toronto": "toronto"}
    if location is None or str(location) == "None":
        return "N/A"
    # Trim BEFORE the alias lookup: "city of toronto / x" splits into
    # "city of toronto " (trailing space), which would otherwise miss the alias.
    primary = location.split("/")[0].strip()
    return location_dict.get(primary, primary)

In [0]:
# Replace the location column with its cleaned, de-duplicated form.
house_df = house_df.withColumn(
    "location",
    clean_location(col("location")),
)

In [0]:
# Inspect the distinct bed_rooms labels before converting them to integers.
unique_beds = house_df.select("bed_rooms").distinct().collect()
for bed_row in unique_beds:
    print(bed_row["bed_rooms"])

2 + den
1 + den
3
4 + den
0
3 + den
5+
bachelor/studio
1
4
2


In [0]:
@udf(returnType=IntegerType())
def clean_beds(beds):
    """Convert a bed_rooms label to a whole number of bedrooms.

    "bachelor"/"studio" labels count as 0 bedrooms. Otherwise the number
    before the first "+" is used, e.g. "3 + den" -> 3 and "5+" -> 5.

    Args:
        beds: Bedroom label, or None.

    Returns:
        The bedroom count as int; 0 when the input is None; None when the
        label does not start with a number (so Spark stores a null instead
        of failing the whole job on one bad row).
    """
    if beds is None:
        # Same convention as the upstream null handling, which maps missing values to 0.
        return 0
    label = str(beds).lower()
    if 'bachelor' in label or 'studio' in label:
        return 0
    try:
        # int() tolerates surrounding whitespace, so "3 " parses as 3.
        return int(label.split("+")[0])
    except ValueError:
        return None

In [0]:
# Replace the bed_rooms labels with their integer bedroom counts.
house_df = house_df.withColumn(
    "bed_rooms",
    clean_beds(col("bed_rooms")),
)

In [0]:
# Confirm that bed_rooms now only contains integer counts.
unique_beds = house_df.select("bed_rooms").distinct().collect()
for bed_row in unique_beds:
    print(bed_row["bed_rooms"])

1
3
5
4
2
0


In [0]:
from pyspark.sql.functions import year, month, to_date, datediff
from dateutil.relativedelta import relativedelta
from datetime import datetime, date, timedelta

In [0]:
# Inspect the distinct date_posted labels to see which formats must be parsed.
unique_dates = house_df.select("date_posted").distinct().collect()
for date_row in unique_dates:
    print(date_row["date_posted"])

< 48 minutes ago
< 9 hours ago
31/03/2023
05/02/2023
< 7 hours ago
29/03/2023
< 31 minutes ago
None
< 8 minutes ago
< 6 hours ago
< 19 hours ago
< 20 hours ago
< 22 hours ago
14/03/2023
< 5 hours ago
21/03/2023
< 11 hours ago
< 16 hours ago
< 18 hours ago
27/03/2023
04/02/2023
< 16 minutes ago
< 44 minutes ago
07/02/2023
< 60 minutes ago
06/02/2023
< 2 hours ago
< 4 hours ago
< 21 hours ago
30/03/2023
< 17 hours ago
yesterday
< 3 hours ago
< 12 hours ago
< 23 hours ago
< 36 minutes ago
< 10 hours ago
28/03/2023
< 24 hours ago
< 15 hours ago
< 8 hours ago
< 3 minutes ago
02/04/2023
< 54 minutes ago
< 14 hours ago
03/04/2023
01/04/2023
< 13 hours ago
03/02/2023
23/02/2023
24/02/2023
19/02/2023
26/03/2023
25/03/2023
22/02/2023
21/02/2023
16/02/2023
17/02/2023
18/02/2023


In [0]:
@udf(returnType=DateType())
def parse_date_string(date_string, date_=None):
    """Resolve a "date posted" label to a calendar date.

    Supported labels (matched case-insensitively):
      * absolute dates in ``dd/mm/YYYY`` form
      * ``yesterday``
      * relative labels such as ``< 8 minutes ago`` or ``< 5 hours ago``

    Args:
        date_string: The posted-date label, or None.
        date_: The date the listing was scraped (``datetime.date``,
            ``datetime.datetime`` or a ``YYYY-MM-DD`` string). Relative
            labels are resolved against it. When None, the current
            wall-clock time is used instead.

    Returns:
        A ``datetime.date``. A missing label falls back to the scrape date
        (which may itself be None). A label in none of the supported
        formats yields None, so one bad row does not fail the Spark job.
    """
    # Normalise the reference (scrape) date to a plain date object.
    if date_ is None:
        reference = None
    elif isinstance(date_, datetime):
        reference = date_.date()
    elif isinstance(date_, date):
        reference = date_
    else:
        reference = datetime.strptime(str(date_), '%Y-%m-%d').date()

    if date_string is None:
        return reference
    label = str(date_string).strip().lower()
    if label in ('', 'none'):
        return reference

    today = reference if reference is not None else datetime.now().date()

    if label == 'yesterday':
        return today - timedelta(days=1)
    if 'minute' in label:
        return today
    if label.startswith('<'):
        # Remaining relative labels are "< N hours ago".
        if reference is not None:
            # Only the scrape DATE is known, not its time of day, so the best
            # reproducible answer is the scrape date itself. Using the
            # wall-clock here would make the result depend on when the
            # notebook happens to be run.
            return reference
        try:
            hours_ago = int(label.split()[1])
        except (IndexError, ValueError):
            return None
        return (datetime.now() - timedelta(hours=hours_ago)).date()

    try:
        return datetime.strptime(label, '%d/%m/%Y').date()
    except ValueError:
        return None

In [0]:
# Resolve each date_posted label against the row's scrape date.
house_df = house_df.withColumn(
    "date_posted_calculated",
    parse_date_string(col("date_posted"), col("scraped_on")),
)

In [0]:
# Derive year and month columns from the calculated posting date.
posted_on = col("date_posted_calculated")
house_df = (
    house_df
    .withColumn("year", year(posted_on))
    .withColumn("month", month(posted_on))
)

In [0]:
# Sanity check: the transformations above should not change the row count.
house_df.count()

Out[88]: 2448

In [0]:
# Give the output columns their final names (old name -> new name).
column_renames = {
    'id': 'listing_id',
    'bed_rooms': 'bedrooms',
    'date_posted': 'post_date',
    'scraped_on': 'scrape_date',
    "date_posted_calculated": "calculated_date",
}
for old_name, new_name in column_renames.items():
    house_df = house_df.withColumnRenamed(old_name, new_name)


In [0]:
# Preview the fully cleaned and renamed listings before writing them out.
display(house_df)

listing_id,title,location,rent,bedrooms,post_date,url,scrape_date,calculated_date,year,month
1560281969,buy house with zero down or rent to own program,oshawa,0.0,4,30/03/2023,https://www.kijiji.ca/v-apartments-condos/oshawa-durham-region/buy-house-with-zero-down-or-rent-to-own-program/1560281969,2023-04-05,2023-03-30,2023.0,3.0
1652711864,gorgeous 4 bdrm 4 baths detached house for lease in bradford,bradford,3500.0,4,31/03/2023,https://www.kijiji.ca/v-apartments-condos/markham-york-region/gorgeous-4-bdrm-4-baths-detached-house-for-lease-in-bradford/1652711864,2023-04-05,2023-03-31,2023.0,3.0
1572249157,buy house with our exclusive $ 0 down or rent to own programs,mississauga,0.0,4,02/04/2023,https://www.kijiji.ca/v-apartments-condos/mississauga-peel-region/buy-house-with-our-exclusive-0-down-or-rent-to-own-programs/1572249157,2023-04-05,2023-04-02,2023.0,4.0
1569609232,rent to own a house or buy with our zero down program,toronto,0.0,3,02/04/2023,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/rent-to-own-a-house-or-buy-with-our-zero-down-program/1569609232,2023-04-05,2023-04-02,2023.0,4.0
1568088018,buy house with zero down or rent to own program,markham,0.0,4,14/03/2023,https://www.kijiji.ca/v-apartments-condos/markham-york-region/buy-house-with-zero-down-or-rent-to-own-program/1568088018,2023-04-05,2023-03-14,2023.0,3.0
1653373622,executive heritage suite - rosedale/moore park,toronto,4100.0,3,< 8 minutes ago,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/executive-heritage-suite-rosedale-moore-park/1653373622,2023-04-05,2023-04-05,2023.0,4.0
1654804696,"modern, bright & spacious 3-br victorian house in vibrant annex/",toronto,5850.0,3,< 8 minutes ago,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/modern-bright-spacious-3-br-victorian-house-in-vibrant-annex/1654804696,2023-04-05,2023-04-05,2023.0,4.0
1655528202,toronto prime beaches 3 bed 2 bath home south of queen!!,toronto,4200.0,3,< 2 hours ago,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/toronto-prime-beaches-3-bed-2-bath-home-south-of-queen/1655528202,2023-04-05,2023-04-05,2023.0,4.0
1633306682,house for rent in georgina renovated high ceiling 4br 3bath,georgina,3400.0,4,< 4 hours ago,https://www.kijiji.ca/v-apartments-condos/markham-york-region/house-for-rent-in-georgina-renovated-high-ceiling-4br-3bath/1633306682,2023-04-05,2023-04-05,2023.0,4.0
1655526036,2 bedrooms apartment in basement for rent,brampton,2000.0,2,< 5 hours ago,https://www.kijiji.ca/v-apartments-condos/mississauga-peel-region/2-bedrooms-apartment-in-basement-for-rent/1655526036,2023-04-05,2023-04-05,2023.0,4.0


###### write the processed data back to ADLS

In [0]:
# Persist the cleaned listings as Parquet, partitioned by location.
# NOTE(review): mode('overwrite') replaces whatever is already stored under this
# path, while the raw input files are moved out of the raw container afterwards -
# confirm that earlier processed batches are not meant to be kept here.
processed_house_path = 'abfs://processed@kijijidata.dfs.core.windows.net/house/'
(
    house_df.write
    .format('parquet')
    .mode('overwrite')
    .partitionBy("location")
    .save(processed_house_path)
)


##### move processed data from raw to raw-processed

In [0]:
# Archive the raw input: move the whole house/ folder (recursively) from the
# raw container to the raw-processed container.
raw_house_dir = "abfs://raw@kijijidata.dfs.core.windows.net/house/"
archived_house_dir = "abfs://raw-processed@kijijidata.dfs.core.windows.net/house/"
dbutils.fs.mv(raw_house_dir, archived_house_dir, recurse=True)

Out[93]: True