##### Access Azure data lake and read the data using
1. Databricks secretScope
2. Azure key-vault

In [0]:
# Pull the storage-account key from the Databricks secret scope instead of
# writing it into the notebook.
account_key = dbutils.secrets.get(
    scope="kijiji-data-scope",
    key="account-key",
)

In [0]:
# Register the key for the `kijijidata` storage account in the Spark session config.
spark.conf.set("fs.azure.account.key.kijijidata.dfs.core.windows.net", account_key)

##### check the access to ADLS

In [0]:
# Access check: list the top level of the `processed` container.
display(dbutils.fs.ls("abfs://processed@kijijidata.dfs.core.windows.net"))

path,name,size,modificationTime
abfs://processed@kijijidata.dfs.core.windows.net/apt/,apt/,0,1681313073000
abfs://processed@kijijidata.dfs.core.windows.net/house/,house/,0,1680723313000


In [0]:
# List the contents of the `apt/` folder inside the `processed` container.
display(dbutils.fs.ls("abfs://processed@kijijidata.dfs.core.windows.net/apt/"))

path,name,size,modificationTime
abfs://processed@kijijidata.dfs.core.windows.net/apt/_SUCCESS,_SUCCESS,0,1681313085000
abfs://processed@kijijidata.dfs.core.windows.net/apt/location=Not Available/,location=Not Available/,0,1681313073000
abfs://processed@kijijidata.dfs.core.windows.net/apt/location=acton/,location=acton/,0,1681313073000
abfs://processed@kijijidata.dfs.core.windows.net/apt/location=ajax/,location=ajax/,0,1681313074000
abfs://processed@kijijidata.dfs.core.windows.net/apt/location=ashburn/,location=ashburn/,0,1681313074000
abfs://processed@kijijidata.dfs.core.windows.net/apt/location=aurora/,location=aurora/,0,1681313074000
abfs://processed@kijijidata.dfs.core.windows.net/apt/location=barrie/,location=barrie/,0,1681313074000
abfs://processed@kijijidata.dfs.core.windows.net/apt/location=berczy village/,location=berczy village/,0,1681313074000
abfs://processed@kijijidata.dfs.core.windows.net/apt/location=blackstock/,location=blackstock/,0,1681313074000
abfs://processed@kijijidata.dfs.core.windows.net/apt/location=bolton/,location=bolton/,0,1681313074000


### Analyze the data using Spark

1. Identify the cities with the highest number of listings?
2. Which are the top 5 cities with the highest average rent?
3. Average rent for each city
4. Which city has the most affordable 1 BHK apartment for rent?
5. Identify the most affordable rental option?

In [0]:
from pyspark.sql.functions import lit, col

In [0]:
# Root of the `processed` container; the apartment and house datasets are read
# from sub-folders of this path and later combined into a single DataFrame.
path = "abfs://processed@kijijidata.dfs.core.windows.net/"

# Load the processed apartment listings (parquet).
apt_df = spark.read.parquet(path + "apt/")

In [0]:
# Load the processed house listings (parquet) from the same container root.
house_df = spark.read.parquet(path + "house/")

In [0]:
# Add a constant `type` column to each DataFrame labelling which dataset a row came from.
apt_df = apt_df.withColumn("type", lit("apartment/condo"))
house_df = house_df.withColumn("type", lit("house"))

In [0]:
# Row counts of the two source DataFrames, shown as an (apartment, house) tuple.
apt_df.count(), house_df.count()

Out[37]: (9765, 11975)

In [0]:
# Stack the apartment and house listings into one DataFrame.
# unionByName aligns columns by name. Plain union() aligns them by position,
# which would silently mix up values if the two parquet datasets ever stored
# their columns in a different order.
listing_df = apt_df.unionByName(house_df)

In [0]:
# Total number of rows in the combined listings DataFrame.
listing_df.count()

Out[39]: 21740

In [0]:
# Preview five rows of the combined listings.
display(listing_df.limit(5))

listing_id,title,rent,bedrooms,post_date,url,scrape_date,calculated_date,year,month,location,type
1653225832,1 bedroom apartment for rent! dufferin st./lawrence ave.!,1750,1,2023-03-13 00:00:00.000,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/1-bedroom-apartment-for-rent-dufferin-st-lawrence-ave/1653225832,2023-04-03,2023-03-13,2023,3,toronto,apartment/condo
1652711690,greenrock davisville - renovated 1 bedroom suite available,2500,1,2023-04-01 00:00:00.000,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/greenrock-davisville-renovated-1-bedroom-suite-available/1652711690,2023-04-03,2023-04-01,2023,4,toronto,apartment/condo
1655389794,"one bedroom plus den, direct access to subway, north york",2500,1,2023-04-03 00:00:00.000,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/one-bedroom-plus-den-direct-access-to-subway-north-york/1655389794,2023-04-03,2023-04-03,2023,4,toronto,apartment/condo
1655389475,"one bedroom, walk to subway , north york",2500,1,2023-04-03 00:00:00.000,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/one-bedroom-walk-to-subway-north-york/1655389475,2023-04-03,2023-04-03,2023,4,toronto,apartment/condo
1655389447,prime annex ! live in the best part of the city !,2600,1,2023-04-03 00:00:00.000,https://www.kijiji.ca/v-apartments-condos/city-of-toronto/prime-annex-live-in-the-best-part-of-the-city/1655389447,2023-04-03,2023-04-03,2023,4,toronto,apartment/condo


#### Filter outliers from the dataset
* Remove listings with rent of 5000 or more: some house/apt/condo listings came with a lease/buy option.
* Remove listings with rent 0 : listings without rent mentioned

In [0]:
# Keep only rows whose rent is strictly between 0 and 5000.
# Both bounds are exclusive, so rent == 0 and rent == 5000 are dropped.
listing_df = listing_df.where(
    (listing_df.rent > 0) & (listing_df.rent < 5000)
)

In [0]:
# Number of rows remaining in listing_df at this point.
listing_df.count()

Out[42]: 18995

### Identify the cities with the highest number of listings for the scraped period

In [0]:
from pyspark.sql.functions import desc, asc

In [0]:
# Count listings per (location, type) pair, largest groups first.
# Ties on the count are ordered by location, descending.
q1df = (
    listing_df
    .groupBy("location", "type")
    .count()
    .orderBy(col("count").desc(), col("location").desc())
)

In [0]:
# Show the five (location, type) groups with the most listings.
display(q1df.limit(5))

location,type,count
toronto,apartment/condo,4132
toronto,house,3071
mississauga,house,1449
mississauga,apartment/condo,1317
brampton,house,1014


In [0]:
# From here on `path` points at the `presentation` container.
path = "abfs://presentation@kijijidata.dfs.core.windows.net/"

# Save the per-city listing counts as parquet, overwriting any earlier output.
q1df.write.parquet(path + "count_per_city", mode="overwrite")

#### Top cities with the highest average rent?
* Remove listings with 0 bedrooms: some listings do not include the number of bedrooms

In [0]:
from pyspark.sql.functions import round, avg

In [0]:
# Average rent (rounded to 2 decimals) per (location, type, bedrooms) group,
# ignoring rows with no bedroom count, sorted from most to least expensive.
# `round` and `avg` here are the pyspark.sql.functions versions.
q2df = (
    listing_df
    .where(listing_df.bedrooms > 0)
    .groupBy("location", "type", "bedrooms")
    .agg(round(avg("rent"), 2).alias("avg_rent"))
    .orderBy(col("avg_rent").desc())
)

In [0]:
# Show the five groups with the highest average rent (q2df is sorted by avg_rent descending).
display(q2df.limit(5))

location,type,bedrooms,avg_rent
vaughan,apartment/condo,5,4800.0
old toronto,house,3,4600.0
vaughan,house,5,4560.0
woodbridge,apartment/condo,4,4500.0
woodbridge,house,4,4500.0


In [0]:
# Restrict the average-rent table to Toronto and show its first five rows.
display(
    q2df
    .filter(q2df.location == "toronto")
    .limit(5)
)

location,type,bedrooms,avg_rent
toronto,house,5,4283.13
toronto,apartment/condo,5,3791.67
toronto,house,4,3372.93
toronto,apartment/condo,4,3142.53
toronto,apartment/condo,3,3106.23


In [0]:
# Output location: the `presentation` container.
path = "abfs://presentation@kijijidata.dfs.core.windows.net/"

# Save the average-rent table as parquet, overwriting any earlier output.
q2df.write.parquet(path + "avg_rent_per_city", mode="overwrite")

#### Which city has the most affordable 1 BHK rental (apartment/condo or house)?

In [0]:
# Five one-bedroom groups with the lowest average rent.
# No filter on `type` is applied, so houses and apartments/condos both appear.
display(
    q2df
    .where(q2df.bedrooms == 1)
    .orderBy(col("avg_rent").asc())
    .limit(5)
)

location,type,bedrooms,avg_rent
bramalea,apartment/condo,1,475.0
courtice,house,1,775.0
caledon east,house,1,800.0
samac,house,1,850.0
old malton village,house,1,900.0


Given how low its average rent is, the cheapest result (Bramalea) is probably a room-sharing listing.