##### Access Azure data lake and read the data using
1. Databricks secretScope
2. Azure key-vault

In [0]:
# Show the built-in help for the dbutils.secrets utility (exploration only).
dbutils.secrets.help()

In [0]:
# List the secret scopes visible to this workspace.
dbutils.secrets.listScopes()

Out[85]: [SecretScope(name='kijiji-data-scope')]

In [0]:
# List the secret keys stored in the kijiji-data-scope scope (metadata only, not values).
dbutils.secrets.list(scope='kijiji-data-scope')

Out[86]: [SecretMetadata(key='account-key')]

In [0]:
# Read the storage account key from the secret scope so it is never hardcoded in the notebook.
account_key = dbutils.secrets.get(scope='kijiji-data-scope',key='account-key')

In [0]:
# Register the storage account key on the Spark session so that abfs:// paths
# on the kijijidata account can be accessed with the secret held in account_key.
account_key_setting = "fs.azure.account.key.kijijidata.dfs.core.windows.net"
spark.conf.set(account_key_setting, account_key)

##### check the access to ADLS

In [0]:
# Listing the root of the raw container confirms the configured account key grants access.
display(dbutils.fs.ls("abfs://raw@kijijidata.dfs.core.windows.net"))

path,name,size,modificationTime
abfs://raw@kijijidata.dfs.core.windows.net/apt/,apt/,0,1681309808000
abfs://raw@kijijidata.dfs.core.windows.net/house/,house/,0,1680861678000


In [0]:
# List the contents of the apt/ folder in the raw container.
display(dbutils.fs.ls("abfs://raw@kijijidata.dfs.core.windows.net/apt/"))

path,name,size,modificationTime
abfs://raw@kijijidata.dfs.core.windows.net/apt/2023-04-01/,2023-04-01/,0,1681309808000
abfs://raw@kijijidata.dfs.core.windows.net/apt/2023-04-03/,2023-04-03/,0,1681309810000
abfs://raw@kijijidata.dfs.core.windows.net/apt/2023-04-04/,2023-04-04/,0,1681309811000
abfs://raw@kijijidata.dfs.core.windows.net/apt/2023-04-05/,2023-04-05/,0,1681309813000
abfs://raw@kijijidata.dfs.core.windows.net/apt/2023-04-06/,2023-04-06/,0,1681309815000
abfs://raw@kijijidata.dfs.core.windows.net/apt/2023-04-07/,2023-04-07/,0,1681309817000


In [0]:
# List the CSV files inside one dated folder (2023-04-06).
display(dbutils.fs.ls("abfs://raw@kijijidata.dfs.core.windows.net/apt/2023-04-06/"))

path,name,size,modificationTime
abfs://raw@kijijidata.dfs.core.windows.net/apt/2023-04-06/kijiji_gta_1_2023-04-06.csv,kijiji_gta_1_2023-04-06.csv,200503,1681311680000
abfs://raw@kijijidata.dfs.core.windows.net/apt/2023-04-06/kijiji_gta_21_2023-04-06.csv,kijiji_gta_21_2023-04-06.csv,80942,1681311680000
abfs://raw@kijijidata.dfs.core.windows.net/apt/2023-04-06/kijiji_gta_41_2023-04-06.csv,kijiji_gta_41_2023-04-06.csv,71027,1681311681000
abfs://raw@kijijidata.dfs.core.windows.net/apt/2023-04-06/kijiji_gta_61_2023-04-06.csv,kijiji_gta_61_2023-04-06.csv,90544,1681311680000
abfs://raw@kijijidata.dfs.core.windows.net/apt/2023-04-06/kijiji_gta_81_2023-04-06.csv,kijiji_gta_81_2023-04-06.csv,81183,1681311680000


In [0]:
# Preview one raw CSV without header or schema options: columns come back as
# _c0.._c7 and the header line is read as a data row.
display(spark.read.csv("abfs://raw@kijijidata.dfs.core.windows.net/apt/2023-04-06/kijiji_gta_1_2023-04-06.csv").limit(5))

_c0,_c1,_c2,_c3,_c4,_c5,_c6,_c7
id,title,location,rent,bed_rooms,date_posted,url,scraped_on
1649856293,Large Luxury 2-Bedroom Toronto Apartment South West Facing,Toronto,"$2,799.00",2,01/04/2023,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/large-luxury-2-bedroom-toronto-apartment-south-west-facing/1649856293,2023-04-06
1649856299,"1 MONTH FREE! Deluxe Open Concept 1 Bedroom Apartment, Toronto",Toronto,"$2,349.00",1,< 15 hours ago,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/1-month-free-deluxe-open-concept-1-bedroom-apartment-toronto/1649856299,2023-04-06
1653158018,"Sunny, spacious 2 bdrm. circa 1898 Edwardian Roncy Village",Toronto,"$2,595.00",2,< 22 hours ago,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/sunny-spacious-2-bdrm-circa-1898-edwardian-roncy-village/1653158018,2023-04-06
1651533067,1 Bedroom Large Renovated Apartment For Rent in Toronto - 90 Eas,Toronto,"$2,195.00",1,26/03/2023,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/1-bedroom-large-renovated-apartment-for-rent-in-toronto-90-eas/1651533067,2023-04-06


###### Perform necessary transformations and save the result as parquet to ADLS
1. Read apartment listings and house rental listings from ADLS
2. Specify the schema
3. Drop the null column
4. Convert the title column to lowercase
5. Clean location - group equivalent values into one, e.g. City of Toronto, Toronto --> toronto
6. Convert the rent column to a numeric (decimal) type and replace the text "Please Contact" with 0
7. Clean the bed_rooms column and cast it to int
8. Recalculate the date_posted column, e.g. "21 hours ago" should be converted to a proper date
9. Create two additional columns: year and month

In [0]:
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, DateType
from pyspark.sql.functions import lower, udf, col, when, regexp_replace, trim, to_date, date_add
from dateutil.relativedelta import relativedelta

In [0]:
# Same raw preview as earlier in the notebook, repeated here as a reference
# for the column order used when defining the schema.
display(spark.read.csv("abfs://raw@kijijidata.dfs.core.windows.net/apt/2023-04-06/kijiji_gta_1_2023-04-06.csv").limit(5))

_c0,_c1,_c2,_c3,_c4,_c5,_c6,_c7
id,title,location,rent,bed_rooms,date_posted,url,scraped_on
1649856293,Large Luxury 2-Bedroom Toronto Apartment South West Facing,Toronto,"$2,799.00",2,01/04/2023,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/large-luxury-2-bedroom-toronto-apartment-south-west-facing/1649856293,2023-04-06
1649856299,"1 MONTH FREE! Deluxe Open Concept 1 Bedroom Apartment, Toronto",Toronto,"$2,349.00",1,< 15 hours ago,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/1-month-free-deluxe-open-concept-1-bedroom-apartment-toronto/1649856299,2023-04-06
1653158018,"Sunny, spacious 2 bdrm. circa 1898 Edwardian Roncy Village",Toronto,"$2,595.00",2,< 22 hours ago,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/sunny-spacious-2-bdrm-circa-1898-edwardian-roncy-village/1653158018,2023-04-06
1651533067,1 Bedroom Large Renovated Apartment For Rent in Toronto - 90 Eas,Toronto,"$2,195.00",1,26/03/2023,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/1-bedroom-large-renovated-apartment-for-rent-in-toronto-90-eas/1651533067,2023-04-06


In [0]:
# Explicit schema for the rental listing CSVs, in file column order.
# rent, bed_rooms and date_posted are read as strings because the raw values
# are free text (e.g. "$2,799.00", "2 + Den", "< 15 hours ago").
_rental_columns = [
    ("id", IntegerType()),
    ("title", StringType()),
    ("location", StringType()),
    ("rent", StringType()),
    ("bed_rooms", StringType()),
    ("date_posted", StringType()),
    ("url", StringType()),
    ("scraped_on", DateType()),
]
rental_schema = StructType(
    [StructField(column_name, column_type, True) for column_name, column_type in _rental_columns]
)

In [0]:
# Load every apartment CSV under raw/apt/ using the explicit schema; the first
# line of each file is treated as a header.
# NOTE(review): .drop("null") looks like a no-op here, because rental_schema
# defines no column named "null" -- confirm and remove if so.
apt_reader = spark.read.option("header", True).schema(rental_schema)
apt_df = apt_reader.csv("abfs://raw@kijijidata.dfs.core.windows.net/apt/**/*.csv").drop("null")

In [0]:
# Row count straight after loading, before any rows are filtered out.
apt_df.count()

Out[97]: 9785

In [0]:
# Discard rows whose id is null.
# NOTE(review): presumably these are rows whose id did not parse as an integer
# under the schema -- verify against the raw files.
apt_df = apt_df.dropna(subset=["id"])

In [0]:
# Row count after removing rows with a null id.
apt_df.count()

Out[99]: 9765

In [0]:
# Peek at the first rows of the typed DataFrame.
display(apt_df.limit(5))

id,title,location,rent,bed_rooms,date_posted,url,scraped_on
1653225832,1 Bedroom Apartment for Rent! Dufferin St./Lawrence Ave.!,Toronto,"$1,750.00",1,2023-03-13 00:00:00.000,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/1-bedroom-apartment-for-rent-dufferin-st-lawrence-ave/1653225832,2023-04-03
1652711690,Greenrock Davisville - Renovated 1 Bedroom Suite Available,TORONTO,"$2,500.00",1,2023-04-01 00:00:00.000,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/greenrock-davisville-renovated-1-bedroom-suite-available/1652711690,2023-04-03
1623036589,Brand New Luxury Two Bedroom Apartment Rental in Oakville!,Oakville / Halton Region,"$3,295.00",2,2023-03-23 00:00:00.000,https://www.kijiji.ca/v-apartments-condos/oakville-halton-region/brand-new-luxury-two-bedroom-apartment-rental-in-oakville/1623036589,2023-04-03
1560281969,BUY HOUSE WITH ZERO DOWN OR RENT TO OWN PROGRAM,Oshawa / Durham Region,Please Contact,4,2023-03-30 00:00:00.000,https://www.kijiji.ca/v-apartments-condos/oshawa-durham-region/buy-house-with-zero-down-or-rent-to-own-program/1560281969,2023-04-03
1653804711,Modern townhouse,Markham / York Region,"$2,800.00",2 + Den,2023-03-19 00:00:00.000,https://www.kijiji.ca/v-apartments-condos/markham-york-region/modern-townhouse/1653804711,2023-04-03


In [0]:
# Normalise the text columns and turn rent into a numeric column.
# - title / date_posted / location / bed_rooms are lower-cased
# - missing location becomes "Not Available", missing bed_rooms becomes 0
# - "Please Contact" rents become 0; "$" and "," are stripped before the decimal cast
location_or_default = when(col("location").isNull(), "Not Available").otherwise(lower(col("location")))
rent_or_zero = when(col("rent") == "Please Contact", 0).otherwise(col("rent"))
rent_as_decimal = trim(regexp_replace(col("rent"), "[$,]", "")).cast("decimal(10,2)")
bed_rooms_or_zero = when(col("bed_rooms").isNull(), 0).otherwise(lower(col("bed_rooms")))

apt_df = (
    apt_df
    .withColumn("title", lower(col("title")))
    .withColumn("date_posted", lower(col("date_posted")))
    .withColumn("location", location_or_default)
    .withColumn("rent", rent_or_zero)
    .withColumn("rent", rent_as_decimal)
    .withColumn("bed_rooms", bed_rooms_or_zero)
)

In [0]:
# Check the result of the normalisation step on a few rows.
display(apt_df.limit(5))

id,title,location,rent,bed_rooms,date_posted,url,scraped_on
1653225832,1 bedroom apartment for rent! dufferin st./lawrence ave.!,toronto,1750.0,1,2023-03-13 00:00:00.000,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/1-bedroom-apartment-for-rent-dufferin-st-lawrence-ave/1653225832,2023-04-03
1652711690,greenrock davisville - renovated 1 bedroom suite available,toronto,2500.0,1,2023-04-01 00:00:00.000,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/greenrock-davisville-renovated-1-bedroom-suite-available/1652711690,2023-04-03
1623036589,brand new luxury two bedroom apartment rental in oakville!,oakville / halton region,3295.0,2,2023-03-23 00:00:00.000,https://www.kijiji.ca/v-apartments-condos/oakville-halton-region/brand-new-luxury-two-bedroom-apartment-rental-in-oakville/1623036589,2023-04-03
1560281969,buy house with zero down or rent to own program,oshawa / durham region,0.0,4,2023-03-30 00:00:00.000,https://www.kijiji.ca/v-apartments-condos/oshawa-durham-region/buy-house-with-zero-down-or-rent-to-own-program/1560281969,2023-04-03
1653804711,modern townhouse,markham / york region,2800.0,2 + den,2023-03-19 00:00:00.000,https://www.kijiji.ca/v-apartments-condos/markham-york-region/modern-townhouse/1653804711,2023-04-03


In [0]:
# Confirm the column types after normalisation (rent should now be decimal(10,2)).
apt_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- location: string (nullable = true)
 |-- rent: decimal(10,2) (nullable = true)
 |-- bed_rooms: string (nullable = true)
 |-- date_posted: string (nullable = true)
 |-- url: string (nullable = true)
 |-- scraped_on: date (nullable = true)



In [0]:
# Collect the distinct location values so the variants that need merging
# (e.g. "city of toronto" vs "toronto") can be spotted.
# unique_locations is built here so the cell works on a fresh kernel; it was
# previously used without being defined anywhere in the notebook.
unique_locations = apt_df.select("location").distinct().collect()
for location in unique_locations:
    print(location)

Row(location='kitchener')
Row(location='town of caledon')
Row(location='concord')
Row(location='acton')
Row(location='oshawa / durham region')
Row(location='orangeville')
Row(location='markham / york region')
Row(location='meadowvale')
Row(location='roseneath')
Row(location='ashburn')
Row(location='bradford')
Row(location='caledon')
Row(location='toronto')
Row(location='burlington')
Row(location='sutton')
Row(location='schomberg')
Row(location='erin')
Row(location='milton')
Row(location='cobourg')
Row(location='nobleton')
Row(location='city of toronto')
Row(location='brampton')
Row(location='township of guelph/eramosa')
Row(location='mississauga')
Row(location='churchville')
Row(location='east york')
Row(location='kettleby')
Row(location='vaughan')
Row(location='stouffville')
Row(location='mineola')
Row(location='north yorks')
Row(location='toronto m9m 0b5')
Row(location='etobicoke')
Row(location='whitchurch-stouffville')
Row(location='eden mills')
Row(location='erin mills')
Row(locati

In [0]:
# Sample a few rows still labelled "city of toronto" before the location clean-up.
apt_df.filter(col("location") == "city of toronto").show(5)

+----------+--------------------+---------------+-------+---------+--------------------+--------------------+----------+
|        id|               title|       location|   rent|bed_rooms|         date_posted|                 url|scraped_on|
+----------+--------------------+---------------+-------+---------+--------------------+--------------------+----------+
|1649856271|      1 bdrm and den|city of toronto|2615.00|  1 + den|2023-04-03 00:00:...|https://www.kijij...|2023-04-03|
|1640353318|1 bdrm and den- k...|city of toronto|2850.00|  1 + den|2023-04-03 00:00:...|https://www.kijij...|2023-04-03|
|1649220565|1 bdrm + den +din...|city of toronto|3220.00|        1|2023-04-03 00:00:...|https://www.kijij...|2023-04-03|
|1655388109|beautiful 3 bedro...|city of toronto|3400.00|        3|2023-04-03 00:00:...|https://www.kijij...|2023-04-03|
|1526047287|spacious 1 bed + ...|city of toronto|2495.00|  1 + den|2023-04-03 00:00:...|https://www.kijij...|2023-04-03|
+----------+--------------------

In [0]:
@udf(returnType=StringType())
def clean_location(location):
    """Reduce a raw location string to a single canonical place name.

    Keeps only the text before the first "/" (for example
    "mississauga / peel region" -> "mississauga"), trims surrounding
    whitespace, and then maps known aliases onto one spelling
    ("city of toronto" -> "toronto").

    Args:
        location: Raw location text, or None.

    Returns:
        The cleaned location string, or "N/A" when the input is None
        (or the literal text "None").
    """
    location_dict = {"city of toronto": "toronto"}
    if location is None or str(location) == "None":
        return "N/A"
    # Strip BEFORE the alias lookup: splitting "city of toronto / ..." leaves a
    # trailing space that would otherwise make the lookup miss.
    primary = str(location).split("/")[0].strip()
    return location_dict.get(primary, primary)

In [0]:
# Overwrite location with the value returned by the clean_location UDF.
apt_df = apt_df.withColumn("location", clean_location(apt_df["location"]))

In [0]:
# Check the cleaned location values on a few rows.
display(apt_df.limit(5))

id,title,location,rent,bed_rooms,date_posted,url,scraped_on
1653225832,1 bedroom apartment for rent! dufferin st./lawrence ave.!,toronto,1750.0,1,2023-03-13 00:00:00.000,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/1-bedroom-apartment-for-rent-dufferin-st-lawrence-ave/1653225832,2023-04-03
1652711690,greenrock davisville - renovated 1 bedroom suite available,toronto,2500.0,1,2023-04-01 00:00:00.000,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/greenrock-davisville-renovated-1-bedroom-suite-available/1652711690,2023-04-03
1623036589,brand new luxury two bedroom apartment rental in oakville!,oakville,3295.0,2,2023-03-23 00:00:00.000,https://www.kijiji.ca/v-apartments-condos/oakville-halton-region/brand-new-luxury-two-bedroom-apartment-rental-in-oakville/1623036589,2023-04-03
1560281969,buy house with zero down or rent to own program,oshawa,0.0,4,2023-03-30 00:00:00.000,https://www.kijiji.ca/v-apartments-condos/oshawa-durham-region/buy-house-with-zero-down-or-rent-to-own-program/1560281969,2023-04-03
1653804711,modern townhouse,markham,2800.0,2 + den,2023-03-19 00:00:00.000,https://www.kijiji.ca/v-apartments-condos/markham-york-region/modern-townhouse/1653804711,2023-04-03


In [0]:
# Inspect the distinct bed_rooms values (capped at 100) to see which text
# formats the cleaning step has to handle.
unique_beds = apt_df.select("bed_rooms").distinct().limit(100).collect()
for bed_row in unique_beds:
    print(bed_row.bed_rooms)

2 + den
1 + den
3
4 + den
0
3 + den
5+
bachelor/studio
1
4
2


In [0]:
@udf(returnType=IntegerType())
def clean_beds(beds):
    """Convert a free-text bedroom description into an integer count.

    "bachelor" / "studio" listings count as 0 bedrooms. Otherwise the text
    before the first "+" is used, so "2 + den" -> 2 and "5+" -> 5.

    Args:
        beds: Bedroom text such as "1", "2 + den", "5+" or "bachelor/studio".
            Non-string values are converted with str() first, so an already
            numeric value passes through unchanged.

    Returns:
        The bedroom count as an int, or None when the input is None or does
        not contain a parseable number (instead of failing the Spark job).
    """
    if beds is None:
        return None

    text = str(beds).lower()
    if 'bachelor' in text or 'studio' in text:
        return 0

    # Keep only the leading count: "2 + den" -> "2", "5+" -> "5".
    leading_count = text.split("+")[0].strip()
    try:
        return int(leading_count)
    except ValueError:
        return None

In [0]:
# Overwrite bed_rooms with the integer returned by the clean_beds UDF.
apt_df = apt_df.withColumn("bed_rooms", clean_beds(apt_df["bed_rooms"]))

In [0]:
# Check the cleaned bed_rooms values on a few rows.
display(apt_df.limit(5))

id,title,location,rent,bed_rooms,date_posted,url,scraped_on
1653225832,1 bedroom apartment for rent! dufferin st./lawrence ave.!,toronto,1750.0,1,2023-03-13 00:00:00.000,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/1-bedroom-apartment-for-rent-dufferin-st-lawrence-ave/1653225832,2023-04-03
1652711690,greenrock davisville - renovated 1 bedroom suite available,toronto,2500.0,1,2023-04-01 00:00:00.000,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/greenrock-davisville-renovated-1-bedroom-suite-available/1652711690,2023-04-03
1623036589,brand new luxury two bedroom apartment rental in oakville!,oakville,3295.0,2,2023-03-23 00:00:00.000,https://www.kijiji.ca/v-apartments-condos/oakville-halton-region/brand-new-luxury-two-bedroom-apartment-rental-in-oakville/1623036589,2023-04-03
1560281969,buy house with zero down or rent to own program,oshawa,0.0,4,2023-03-30 00:00:00.000,https://www.kijiji.ca/v-apartments-condos/oshawa-durham-region/buy-house-with-zero-down-or-rent-to-own-program/1560281969,2023-04-03
1653804711,modern townhouse,markham,2800.0,2,2023-03-19 00:00:00.000,https://www.kijiji.ca/v-apartments-condos/markham-york-region/modern-townhouse/1653804711,2023-04-03


In [0]:
# Re-check the distinct bed_rooms values (capped at 10) after cleaning.
unique_beds = apt_df.select("bed_rooms").distinct().limit(10).collect()
for bed_row in unique_beds:
    print(bed_row.bed_rooms)

1
3
5
4
2
0


In [0]:
from pyspark.sql.functions import year, month, to_date, datediff
from dateutil.relativedelta import relativedelta
from datetime import datetime, date, timedelta

In [0]:
# Inspect every distinct date_posted value to see which formats occur
# (timestamps, dd/mm/yyyy dates, and relative "< N minutes/hours ago" text).
unique_dates = apt_df.select("date_posted").distinct().collect()
for date_row in unique_dates:
    print(date_row.date_posted)

< 48 minutes ago
2023-04-03 21:24:15.437
2023-03-30 00:00:00.000
2023-03-24 00:00:00.000
08/03/2023
2023-04-01 17:05:53.262
2023-03-28 00:00:00.000
< 19 minutes ago
< 9 hours ago
2023-04-01 17:20:48.979
< 30 minutes ago
2023-04-01 17:06:48.979
2023-04-01 17:21:48.980
< 9 minutes ago
< 14 minutes ago
2023-03-04 00:00:00.000
2023-03-02 00:00:00.000
2023-03-20 00:00:00.000
31/03/2023
2023-04-03 21:17:08.548
2023-04-03 21:16:08.548
2023-04-03 21:04:15.434
2023-04-03 21:18:15.436
< 6 minutes ago
2023-04-01 17:11:53.262
2023-02-22 00:00:00.000
2023-04-03 21:13:08.549
< 7 hours ago
29/03/2023
< 34 minutes ago
2023-04-01 17:16:48.980
2023-04-01 17:11:48.981
2023-04-01 17:04:48.980
< 28 minutes ago
< 41 minutes ago
2023-04-01 17:20:48.980
< 40 minutes ago
< 58 minutes ago
2023-03-07 00:00:00.000
13/03/2023
2023-04-01 17:23:53.262
< 23 minutes ago
2023-02-27 00:00:00.000
2023-04-03 21:19:08.547
2023-04-01 17:04:48.979
2023-04-01 17:01:48.980
< 59 minutes ago
2023-04-01 17:22:48.979
< 31 minutes 

In [0]:
from dateutil.parser import parse
@udf(returnType=DateType())
def parse_date_string(date_string, date_=None):
    """Convert a scraped ``date_posted`` value into a calendar date.

    Supported inputs:
      * absolute dates: "dd/mm/YYYY", "YYYY-mm-dd" and
        "YYYY-mm-dd HH:MM:SS.ffffff"
      * "yesterday"
      * relative ages such as "< 15 hours ago" or "< 48 minutes ago"

    Relative values are resolved against ``date_`` (the scrape date) rather
    than the time the notebook runs, so re-running the job later gives the
    same result. Only the day of the scrape is known, not its time of day, so
    minute/hour ages move the date back only by the whole days they contain
    (e.g. "< 15 hours ago" resolves to the scrape date itself).

    Args:
        date_string: Raw date_posted text, or None.
        date_: Reference day for relative values; a date/datetime or a
            "YYYY-mm-dd" string. Defaults to today's date when None.

    Returns:
        A ``datetime.date``; ``date_`` unchanged when ``date_string`` is
        None; or None when the text matches no supported format.
    """
    if date_string is None or str(date_string) == "None":
        return date_

    text = str(date_string).strip().lower()

    # Resolve the reference day once, always as a plain date.
    if date_ is None:
        reference = datetime.now().date()
    elif isinstance(date_, datetime):
        reference = date_.date()
    elif isinstance(date_, date):
        reference = date_
    else:
        reference = datetime.strptime(str(date_), '%Y-%m-%d').date()

    # Absolute dates: try each known layout in turn.
    for layout in ('%d/%m/%Y', '%Y-%m-%d %H:%M:%S.%f', '%Y-%m-%d'):
        try:
            return datetime.strptime(text, layout).date()
        except ValueError:
            continue

    if text == 'yesterday':
        return reference - timedelta(days=1)

    # Relative ages: "< 15 hours ago" -> ["15", "hours", "ago"].
    tokens = text.lstrip('<').split()
    if len(tokens) >= 2 and tokens[0].isdigit():
        amount = int(tokens[0])
        unit = tokens[1]
        if unit.startswith('minute'):
            return reference - timedelta(days=amount // (24 * 60))
        if unit.startswith('hour'):
            return reference - timedelta(days=amount // 24)
        if unit.startswith('day'):
            return reference - timedelta(days=amount)

    # Any other text mentioning minutes is treated as posted on the reference day.
    if 'minute' in text:
        return reference

    # Unrecognised format: yield null rather than failing the whole Spark job.
    return None

In [0]:
# Add date_posted_calculated by running parse_date_string on date_posted;
# scraped_on is passed as the second argument (date_) of the UDF.
apt_df = apt_df.withColumn("date_posted_calculated", parse_date_string(apt_df["date_posted"], apt_df["scraped_on"]))

In [0]:
# Derive year and month columns from the calculated posting date.
posted_on = col("date_posted_calculated")
apt_df = (
    apt_df
    .withColumn("year", year(posted_on))
    .withColumn("month", month(posted_on))
)

In [0]:
# Re-check the row count; the steps since the previous count only add or replace columns.
apt_df.count()

Out[121]: 9765

In [0]:
# Rename columns to their final output names (old name -> new name).
column_renames = {
    'id': 'listing_id',
    'bed_rooms': 'bedrooms',
    'date_posted': 'post_date',
    'scraped_on': 'scrape_date',
    "date_posted_calculated": "calculated_date",
}
for old_name, new_name in column_renames.items():
    apt_df = apt_df.withColumnRenamed(old_name, new_name)


In [0]:
# Final check of the transformed rows before writing them out.
display(apt_df.limit(5))

listing_id,title,location,rent,bedrooms,post_date,url,scrape_date,calculated_date,year,month
1653225832,1 bedroom apartment for rent! dufferin st./lawrence ave.!,toronto,1750.0,1,2023-03-13 00:00:00.000,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/1-bedroom-apartment-for-rent-dufferin-st-lawrence-ave/1653225832,2023-04-03,2023-03-13,2023,3
1652711690,greenrock davisville - renovated 1 bedroom suite available,toronto,2500.0,1,2023-04-01 00:00:00.000,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/greenrock-davisville-renovated-1-bedroom-suite-available/1652711690,2023-04-03,2023-04-01,2023,4
1623036589,brand new luxury two bedroom apartment rental in oakville!,oakville,3295.0,2,2023-03-23 00:00:00.000,https://www.kijiji.ca/v-apartments-condos/oakville-halton-region/brand-new-luxury-two-bedroom-apartment-rental-in-oakville/1623036589,2023-04-03,2023-03-23,2023,3
1560281969,buy house with zero down or rent to own program,oshawa,0.0,4,2023-03-30 00:00:00.000,https://www.kijiji.ca/v-apartments-condos/oshawa-durham-region/buy-house-with-zero-down-or-rent-to-own-program/1560281969,2023-04-03,2023-03-30,2023,3
1653804711,modern townhouse,markham,2800.0,2,2023-03-19 00:00:00.000,https://www.kijiji.ca/v-apartments-condos/markham-york-region/modern-townhouse/1653804711,2023-04-03,2023-03-19,2023,3


###### write the processed data back to ADLS

In [0]:
# Append the processed listings to the processed container as parquet,
# partitioned by location. Because the mode is "append", running this cell
# again adds the same rows a second time.
(
    apt_df.write
    .format("parquet")
    .mode("append")
    .partitionBy("location")
    .save("abfs://processed@kijijidata.dfs.core.windows.net/apt/")
)


##### move processed data from raw to raw-processed

In [0]:
# Move the whole apt/ folder (recursively) from the raw container to raw-processed.
# NOTE(review): after this move raw/apt/ no longer holds the input files, so the
# load cell earlier in the notebook presumably cannot be re-run until new data lands.
dbutils.fs.mv("abfs://raw@kijijidata.dfs.core.windows.net/apt/", "abfs://raw-processed@kijijidata.dfs.core.windows.net/apt/", recurse=True)

Out[125]: True