# Import Libraries

In [1]:
import datetime
import logging
import time
import azure.functions as func
import pandas as pd

from datetime import date, timedelta
from shared_code import utils

# Notebook-wide display configuration: a value of None lifts pandas' limit on
# how many rows / columns are rendered when a frame is displayed.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

# Initialize Storage Account

In [2]:
# All Azure access in this notebook goes through the project's AzureUtils helper.
azure_utils = utils.AzureUtils()
secret_client = azure_utils.initialize_key_vault()

# Read the two Key Vault entries, 'sa-booli' first and then 'sa-name'.
# NOTE(review): presumably the storage-account credential and the account
# name, judging by how they are passed on below — confirm against shared_code.utils.
sa_secret, sa_name = (
    azure_utils.get_key_vault_secret(secret_client, secret_id)
    for secret_id in ('sa-booli', 'sa-name')
)

# Only the `.value` attribute of each fetched secret is handed to the initializer.
storage_account = azure_utils.initialize_storage_account_ad(
    sa_secret.value,
    sa_name.value,
)

# Scraping & Storing Data

In [None]:
# Query parameters forwarded unchanged to `run_query_sold` and reused in the
# uploaded file name below.
object_type = "Lägenhet"
minSoldDate = "2012-01-01"
maxSoldDate = "2012-12-31"
rooms = ""  # NOTE(review): presumably "no room filter" — confirm; it also leaves an empty slot ("__") in the file name
area_id = 143

booli_utils = utils.Booli()

# Fetch page 1 once: the response carries the paging metadata (page count and
# total object count) as well as the first batch of results.
data = booli_utils.run_query_sold(object_type, minSoldDate, maxSoldDate, rooms, area_id, 1)
first_page = data
total_number_of_pages = data["data"]["search"]["pages"]
total_number_of_objects = data["data"]["search"]["totalCount"]

# Collect the raw result records of every page into one flat list.
res = []
for page in range(1, total_number_of_pages + 1):
    print(f'Scraping page: {page} / {total_number_of_pages}')

    # Reuse the response fetched above rather than requesting page 1 a second time.
    if page == 1:
        data = first_page
    else:
        data = booli_utils.run_query_sold(object_type, minSoldDate, maxSoldDate, rooms, area_id, page)

    # `extend` adds each record of the page; this also avoids a loop variable
    # named `object`, which would shadow the builtin for the rest of the kernel session.
    res.extend(data["data"]["search"]["result"])

    print(f"Total number of objects stored {len(res)} / {total_number_of_objects}")
    print("")
    print("--------------------------------------------")
    print("")

# Flatten the collected records into a tabular frame (nested keys become dotted column names).
df = pd.json_normalize(res)

# Persist the scraped frame as CSV under "raw/sold"; the file name encodes the query parameters.
azure_utils.upload_csv_to_datalake(df, "raw/sold", f"Sold_{object_type}_{rooms}_{area_id}_{minSoldDate}_{maxSoldDate}.csv")

# Data Cleaning

In [4]:
# Obtain the cleaning helper from the project's formatter factory.
utils_DataFactory = utils.DataFactory()
data_cleaning = utils_DataFactory.get_formatter("Cleaning")

# Columns handed to `drop_dataframe_columns`.
columns_to_drop = ["floor", "soldPriceAbsoluteDiff", "soldPricePercentageDiff", "listPrice", "rooms", "soldSqmPrice", "livingArea", "Unnamed: 0"]
# Columns handed to `set_dtype_to_numeric`.
# NOTE(review): presumably these are the columns excluded from numeric conversion — confirm in shared_code.utils.
non_numeric_columns = ["streetAddress", "objectType", "descriptiveAreaName", "soldPriceType", "soldDate", "url", "__typename"]

# Bind the ingested frame to its own name so that cells which refer to
# `df_raw` work on a fresh kernel instead of raising NameError.
# NOTE(review): the scraping cell uploads to "raw/sold" while this call reads
# from "Sold/", and the saved output of this cell is an InvalidUri error —
# confirm that the two paths point at the same location.
# NOTE(review): assumes the cleaning helpers return new frames rather than
# mutating their input; otherwise `df_raw` will not stay raw — TODO confirm.
df_raw = azure_utils.ingest_raw_data("Sold/", "Sold_")

# Cleaning stages, applied in order: drop columns, convert dtypes, reformat "soldDate".
df_cleaning = (
    df_raw
    .pipe(data_cleaning.drop_dataframe_columns, columns_to_drop)
    .pipe(data_cleaning.set_dtype_to_numeric, non_numeric_columns)
    .pipe(data_cleaning.change_timestamp_format, "soldDate")
)
df_cleaning.head()

HttpResponseError: The requested URI does not represent any resource on the server.
RequestId:4432306b-d01e-0046-0904-fad6d6000000
Time:2022-11-16T21:44:48.8094249Z
ErrorCode:InvalidUri
Content: <?xml version="1.0" encoding="utf-8"?>
<Error><Code>InvalidUri</Code><Message>The requested URI does not represent any resource on the server.
RequestId:4432306b-d01e-0046-0904-fad6d6000000
Time:2022-11-16T21:44:48.8094249Z</Message></Error>

In [None]:
# pd.DataFrame(df_cleaning.descriptiveAreaName.unique())

## Uploading Cleaned Data to Data Lake

In [None]:
# Serialise the cleaned frame to Parquet without its index. No path is passed,
# so (assuming `df_cleaning` is a pandas DataFrame) the bytes are returned.
parquet_file = df_cleaning.to_parquet(index=False)

# Upload under "silver/sold_clean"; the blob name carries today's date.
azure_utils.upload_blob(
    parquet_file,
    "silver/sold_clean",
    f"Sold_{date.today()}.parquet",
)

# Feature Engineering

In [None]:
# Pass the cleaned frame to the cleaning helper's `get_data_types`; as the only
# expression in the cell, any non-None return value is displayed.
# NOTE(review): presumably reports the dtype of each column — confirm in shared_code.utils.
data_cleaning.get_data_types(df_cleaning)

In [None]:
# Pass `df_raw` to the cleaning helper's `get_missing_values_percent`; this cell
# requires `df_raw` to exist in the kernel before it runs.
# NOTE(review): presumably returns the share of missing values per column — confirm in shared_code.utils.
data_cleaning.get_missing_values_percent(df_raw)