# In [None]:
# ETL >> Extract Data 
import os
import requests
from zipfile import ZipFile
import csv
import tempfile

# Base location of the data tree. While this is the empty string, the
# paths built below start with "/data/...".
base_path = ""
# URL of the zipped dataset on the external website
source_url = 'https://assets.datacamp.com/production/repositories/5899/datasets/66691278303f789ca4acd3c6406baa5fc6adaf28/PPR-ALL.zip'
# Where the .zip file downloaded from the website is stored
source_path = base_path + "/data/source/downloaded_at=2021-01-01/ppr-all.zip"

# Where the .csv data extracted from the archive is written
raw_path = base_path + "/data/raw/downloaded_at=2021-01-01/ppr-all.csv"

# END - Paths for the snapshot stored under downloaded_at=2021-01-01

# Create the target folder with the os module so the files can be saved

def create_directory_if_not_exists(path):
    """
    Create the parent directory of ``path`` if it doesn't exist yet.

    Args:
        path: Path to a file. Only its directory part is created
            (including missing intermediate directories); the file
            itself is not created or touched.
    """
    # os.path.dirname() drops the final path component, e.g.
    # "data/source/downloaded_at=2021-01-01/ppr-all.zip" becomes
    # "data/source/downloaded_at=2021-01-01".
    directory = os.path.dirname(path)
    # A bare filename has no directory part, and os.makedirs("") raises
    # FileNotFoundError, so there is nothing to create in that case.
    if directory:
        os.makedirs(directory, exist_ok=True)

# Download function: fetch the archive and write it to disk using `with open(...) as fp:`

def download_snapshot(*, timeout=60, verify=True):
    """
    Download the zipped dataset from ``source_url`` and save it to ``source_path``.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.
        verify: Passed through to ``requests.get``. TLS certificate
            verification is on by default; pass ``False`` only if the
            environment really cannot validate the certificate.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    # Fetch before touching the filesystem so a failed request never leaves
    # an empty or truncated archive behind at source_path.
    response = requests.get(source_url, timeout=timeout, verify=verify)
    # Without this check an HTTP error page would be saved as if it were
    # the zip file and only fail later, during extraction.
    response.raise_for_status()

    create_directory_if_not_exists(source_path)
    with open(source_path, "wb") as source_ppr:
        source_ppr.write(response.content)


#  Save new raw data from the source
def save_new_raw_data():
    """
    Extract the CSV from the downloaded archive and save it with renamed headers.

    The first member of the zip file at ``source_path`` is extracted into a
    temporary directory, read as windows-1252 CSV, and written to
    ``raw_path`` with the header names replaced by the names used in the
    next step. Every data row of the source file is written, including the
    first one, which is also printed as an example.

    Raises:
        IndexError: If the archive contains no members.
        ValueError: Raised by ``csv.DictWriter`` if a row contains a column
            that is not listed in the rename mapping.
    """
    # Source header -> header name expected by the next step.
    renamed_fields = {
        "Date of Sale (dd/mm/yyyy)": "date_of_sale",
        "Address": "address",
        "Postal Code": "postal_code",
        "County": "county",
        "Price (€)": "price",
        "Description of Property": "description",
    }

    create_directory_if_not_exists(raw_path)
    # The extracted file only needs to live until it has been copied.
    with tempfile.TemporaryDirectory() as dirpath:
        # 1- Extract the first file of the archive into the temporary folder.
        with ZipFile(source_path, "r") as archive:
            names_list = archive.namelist()
            csv_file_path = archive.extract(names_list[0], path=dirpath)

        # 2- Read the extracted CSV. newline="" lets the csv module handle
        #    line endings itself, as its documentation requires.
        with open(
            csv_file_path, mode="r", encoding="windows-1252", newline=""
        ) as extracted_file:
            reader = csv.DictReader(extracted_file)

            # 3- Write the raw CSV with the renamed header.
            with open(
                raw_path, mode="w", encoding="windows-1252", newline=""
            ) as raw_file:
                # Rows are keyed by the source header names, so those are
                # the writer's field names.
                writer = csv.DictWriter(raw_file, fieldnames=list(renamed_fields))
                # Writing the mapping itself as a row emits the new names
                # as the header line.
                writer.writerow(renamed_fields)
                for index, row in enumerate(reader):
                    if index == 0:
                        # Show the first row, but still write it below so
                        # no record is lost.
                        print("[Extract] First row example:", row)
                    writer.writerow(row)
                        

# Main function called inside the execute.py script
def main():
    """Run the extract step: download the snapshot, then save the raw CSV."""
    print("[Extract] Start")

    # Step 1: fetch the zipped snapshot.
    print("[Extract] Downloading snapshot")
    download_snapshot()

    # Step 2: unpack the archive and write the raw CSV.
    saving_message = f"[Extract] Saving data from '{source_path}' to '{raw_path}'"
    print(saving_message)
    save_new_raw_data()

    print("[Extract] End")


# main()