# Overview

Before ingesting into Azure Data Explorer (ADX), large source files should be split into parts of 100 MB–1 GB of uncompressed data each.

This notebook will show how to:

* Create ADX tables
* Download most recent billing file for a given month
* Split the file into parts
* Upload and Import into ADX

# Setup

Configure the environment variables. Copy `example.env` to `.env` and update the values
```bash
cp example.env .env

# load .env vars (optional)
# Export every line of .env that has an `=` before any `#`.
if [ -f .env ]; then
    while IFS= read -r line; do
        if [[ $line =~ ^[^#]*= ]]; then
            eval "export $line"
        fi
    done < .env
fi
```


Create ADX Tables
```bash
# Create ADX table
# The command contains only single quotes and brackets, so it is safe inside a double-quoted string.
adx_cmd=".create table AmortizedCost (['InvoiceSectionName']:string, ['AccountName']:string, ['AccountOwnerId']:string, ['SubscriptionId']:guid, ['SubscriptionName']:string, ['ResourceGroup']:string, ['ResourceLocation']:string, ['Date']:datetime, ['ProductName']:string, ['MeterCategory']:string, ['MeterSubCategory']:string, ['MeterId']:guid, ['MeterName']:string, ['MeterRegion']:string, ['UnitOfMeasure']:string, ['Quantity']:real, ['EffectivePrice']:real, ['CostInBillingCurrency']:real, ['CostCenter']:string, ['ConsumedService']:string, ['ResourceId']:string, ['Tags']:string, ['OfferId']:string, ['AdditionalInfo']:dynamic, ['ServiceInfo1']:string, ['ServiceInfo2']:string, ['ResourceName']:string, ['ReservationId']:string, ['ReservationName']:string, ['UnitPrice']:real, ['ProductOrderId']:string, ['ProductOrderName']:string, ['Term']:string, ['PublisherType']:string, ['PublisherName']:string, ['ChargeType']:string, ['Frequency']:string, ['PricingModel']:string, ['AvailabilityZone']:string, ['BillingAccountId']:string, ['BillingAccountName']:string, ['BillingCurrencyCode']:string, ['BillingPeriodStartDate']:datetime, ['BillingPeriodEndDate']:datetime, ['BillingProfileId']:string, ['BillingProfileName']:string, ['InvoiceSectionId']:string, ['IsAzureCreditEligible']:string, ['PartNumber']:string, ['PayGPrice']:real, ['PlanName']:string, ['ServiceFamily']:string, ['CostAllocationRuleName']:string, ['benefitId']:string, ['benefitName']:string)"
az kusto query -c "$adx_cluster_name" -d "$adx_database_name" --query "$adx_cmd"

# Create ADX table mapping
# The mapping JSON contains double quotes, so it is read from a quoted heredoc
# to keep the shell from stripping them. The mapping name must match the
# `AmortizedCost_mapping` reference passed to LightIngest in the ingest step.
adx_cmd=$(cat <<'EOF'
.create table AmortizedCost ingestion csv mapping 'AmortizedCost_mapping' '[{"column":"InvoiceSectionName", "Properties":{"Ordinal":"0"}},{"column":"AccountName", "Properties":{"Ordinal":"1"}},{"column":"AccountOwnerId", "Properties":{"Ordinal":"2"}},{"column":"SubscriptionId", "Properties":{"Ordinal":"3"}},{"column":"SubscriptionName", "Properties":{"Ordinal":"4"}},{"column":"ResourceGroup", "Properties":{"Ordinal":"5"}},{"column":"ResourceLocation", "Properties":{"Ordinal":"6"}},{"column":"Date", "Properties":{"Ordinal":"7"}},{"column":"ProductName", "Properties":{"Ordinal":"8"}},{"column":"MeterCategory", "Properties":{"Ordinal":"9"}},{"column":"MeterSubCategory", "Properties":{"Ordinal":"10"}},{"column":"MeterId", "Properties":{"Ordinal":"11"}},{"column":"MeterName", "Properties":{"Ordinal":"12"}},{"column":"MeterRegion", "Properties":{"Ordinal":"13"}},{"column":"UnitOfMeasure", "Properties":{"Ordinal":"14"}},{"column":"Quantity", "Properties":{"Ordinal":"15"}},{"column":"EffectivePrice", "Properties":{"Ordinal":"16"}},{"column":"CostInBillingCurrency", "Properties":{"Ordinal":"17"}},{"column":"CostCenter", "Properties":{"Ordinal":"18"}},{"column":"ConsumedService", "Properties":{"Ordinal":"19"}},{"column":"ResourceId", "Properties":{"Ordinal":"20"}},{"column":"Tags", "Properties":{"Ordinal":"21"}},{"column":"OfferId", "Properties":{"Ordinal":"22"}},{"column":"AdditionalInfo", "Properties":{"Ordinal":"23"}},{"column":"ServiceInfo1", "Properties":{"Ordinal":"24"}},{"column":"ServiceInfo2", "Properties":{"Ordinal":"25"}},{"column":"ResourceName", "Properties":{"Ordinal":"26"}},{"column":"ReservationId", "Properties":{"Ordinal":"27"}},{"column":"ReservationName", "Properties":{"Ordinal":"28"}},{"column":"UnitPrice", "Properties":{"Ordinal":"29"}},{"column":"ProductOrderId", "Properties":{"Ordinal":"30"}},{"column":"ProductOrderName", "Properties":{"Ordinal":"31"}},{"column":"Term", "Properties":{"Ordinal":"32"}},{"column":"PublisherType", "Properties":{"Ordinal":"33"}},{"column":"PublisherName", "Properties":{"Ordinal":"34"}},{"column":"ChargeType", "Properties":{"Ordinal":"35"}},{"column":"Frequency", "Properties":{"Ordinal":"36"}},{"column":"PricingModel", "Properties":{"Ordinal":"37"}},{"column":"AvailabilityZone", "Properties":{"Ordinal":"38"}},{"column":"BillingAccountId", "Properties":{"Ordinal":"39"}},{"column":"BillingAccountName", "Properties":{"Ordinal":"40"}},{"column":"BillingCurrencyCode", "Properties":{"Ordinal":"41"}},{"column":"BillingPeriodStartDate", "Properties":{"Ordinal":"42"}},{"column":"BillingPeriodEndDate", "Properties":{"Ordinal":"43"}},{"column":"BillingProfileId", "Properties":{"Ordinal":"44"}},{"column":"BillingProfileName", "Properties":{"Ordinal":"45"}},{"column":"InvoiceSectionId", "Properties":{"Ordinal":"46"}},{"column":"IsAzureCreditEligible", "Properties":{"Ordinal":"47"}},{"column":"PartNumber", "Properties":{"Ordinal":"48"}},{"column":"PayGPrice", "Properties":{"Ordinal":"49"}},{"column":"PlanName", "Properties":{"Ordinal":"50"}},{"column":"ServiceFamily", "Properties":{"Ordinal":"51"}},{"column":"CostAllocationRuleName", "Properties":{"Ordinal":"52"}},{"column":"benefitId", "Properties":{"Ordinal":"53"}},{"column":"benefitName", "Properties":{"Ordinal":"54"}}]'
EOF
)
az kusto query -c "$adx_cluster_name" -d "$adx_database_name" --query "$adx_cmd"
```


In [None]:
# Install the package located in the parent directory in editable mode.
%pip install --editable ../.

In [None]:
from dotenv import load_dotenv
# Load environment variables via python-dotenv; this is expected to pick up
# the `.env` file created in Setup, which the later `os.getenv` calls read.
load_dotenv()

## Get Latest Files

Billing files are exported daily into a path like `Azure/Amortized/20240501-20240531/csv/`. The following code will get the latest file for a given month.


In [None]:
# Get most recent files for a given directory
import os
import sys
# NOTE(review): every other cell imports from `billing.*`; confirm that
# `prep.exports` is the correct module path for this helper.
from prep.exports import get_most_recent_file

# Storage settings are read from the environment (see Setup).
connection_string = os.getenv("STORAGE_CONNECTION_STRING")
container_name = os.getenv("CONTAINER_NAME")
prefix = os.getenv("PREFIX")

# Get the most recent files for a given directory
# The download cell indexes the result and unpacks an entry as
# (file_name, file_size, file_url).
billing_file_infos = get_most_recent_file(connection_string, container_name, prefix)

## Download file

Download the latest file for the month so that it can be split into parts.

In [None]:
# Download file
# Downloads one export file into <project_root>/temp using azcopy.
# Requires `billing_file_infos` from the "Get Latest Files" cell.
from billing.blob_storage import copy_blob_as_azcopy
from billing import util
import logging
import os  # imported here so the cell does not depend on an earlier cell's import

project_root = os.path.dirname(os.getcwd())

util.setup_logging(default_path=project_root + "/logging.yaml")
_LOGGER = logging.getLogger(__name__)
_LOGGER.info("Starting script")

# NOTE(review): index 1 selects the second entry of `billing_file_infos`;
# confirm this is the intended "latest" file rather than index 0 or -1.
file_name, file_size, file_url = billing_file_infos[1]
sas_key = os.getenv("EXPORT_SAS")
destination_dir = os.path.join(project_root, "temp")
# Make sure the download directory exists before azcopy writes into it.
os.makedirs(destination_dir, exist_ok=True)
# Flatten the blob path into a single local file name.
destination_file = os.path.join(destination_dir, file_name.replace('/', '_'))

copy_blob_as_azcopy(f"{file_url}?{sas_key}", destination_file)

## Split file into Parts

Split the file into parts of 100MB-1GB. Running the split will also get `stats` for each file. This includes the total number of rows and the total cost for the file. This is helpful for validating data.

In [None]:
# Split file into parts
# Splits a downloaded export into part files and prints the row count and
# total cost reported for it, then deletes the large source file.
from billing.blob_storage import split_local_csv_file
from billing import util
import logging
import os

project_root = os.path.dirname(os.getcwd())

util.setup_logging(default_path=project_root + "/logging.yaml")
_LOGGER = logging.getLogger(__name__)
_LOGGER.info("Starting script")

# Example path; replace it with the file downloaded in the previous cell.
destination_file = "../temp/Azure_Actual_20230501-20230531_csv_Azure_ActualCost_v20230605T004903Z_574afa1e-0724-4291-a5f9-e82743b07f23.csv"

# Split the file
stats = split_local_csv_file(destination_file, skip_header=True)
# The last stats entry is printed as the row count and cost for the file.
print(f"stats: rows: {stats[-1][1]}, cost: {stats[-1][2]}")

# Remove large file
# os.remove avoids spawning a shell, works on Windows, and is safe for paths
# that contain spaces or shell metacharacters.
os.remove(destination_file)
# os.system(f"az storage blob download --connection-string {connection_string} --container-name {container_name} --name {file} --file {file}")


## Upload Parts to Blob Storage

ADX will import the files as parts

In [None]:
# Upload the split part files to the "latest parts" container with azcopy.
import re
from billing.blob_storage import copy_blob_as_azcopy
from billing import util
import logging
import os

project_root = os.path.dirname(os.getcwd())

util.setup_logging(default_path=project_root + "/logging.yaml")
_LOGGER = logging.getLogger(__name__)
_LOGGER.info("Upload split files")

# Upload Split files to Storage
# Example path; replace it with the file that was split in the previous cell.
destination_file="../temp/Azure_Actual_20230501-20230531_csv_Azure_ActualCost_v20230605T004903Z_574afa1e-0724-4291-a5f9-e82743b07f23.csv"

storage_account_name = os.getenv("STORAGE_ACCOUNT_NAME")
dest_container_name = os.getenv("EXPORT_LATEST_PARTS_CONTAINER")
prefix = "Azure/Actual/"
dest_sas = os.getenv("EXPORT_LATEST_PARTS_SAS")

# The part files are read from the directory named after the file, minus its extension.
source_path = os.path.splitext(destination_file)[0]
# First six digits (YYYYMM) of the start date in the "YYYYMMDD-YYYYMMDD" range.
yearmonth = re.search(r'(\d{8})-\d{8}', destination_file).group(1)[:6]
# Strip slashes from the prefix so the blob path does not contain "//".
blob_prefix = prefix.strip("/")
destination_url = f"https://{storage_account_name}.blob.core.windows.net/{dest_container_name}/{blob_prefix}/{yearmonth}"
destination_path = f"{destination_url}?{dest_sas}"

# Log the URL without the SAS token so the secret does not end up in the logs.
_LOGGER.info("Upload %s/*.csv to %s", source_path, destination_url)
copy_blob_as_azcopy(f"{source_path}/*.csv", destination_path, recursive=True)
# echo "Upload ${source_path}/*.csv to /${dest_container_name}/${prefix}/${yearmonth}"
# tools/azcopy copy "${source_path}/*.csv" "https://${storage_account_name}.blob.core.windows.net/${dest_container_name}/${prefix}/${yearmonth}?${dest_sas}" --recursive


# Ingest into ADX

The `LightIngest` tool can import files from a storage account into ADX. The tool will also validate the data before importing.

Bash
```bash
# Queue ingestion of the uploaded part files into ADX with LightIngest.
table_name=AmortizedCost
mapping=AmortizedCost_mapping
container_name="${EXPORT_LATEST_PARTS_CONTAINER}"
prefix="Azure/Actual/20221101-20221130"
source_sas="${EXPORT_LATEST_PARTS_SAS}"
# Shell variable names are case-sensitive: prefer STORAGE_ACCOUNT_NAME (the name
# the Python cells read) and fall back to the lowercase spelling if only that is set.
storage_account="${STORAGE_ACCOUNT_NAME:-${storage_account_name}}"
tools/LightIngest "https://ingest-${adx_cluster_name}.${adx_region}.kusto.windows.net;Fed=True" \
    -database:"${adx_database_name}" \
    -table:"$table_name" \
    -source:"https://${storage_account}.blob.core.windows.net/${container_name}?${source_sas}" \
    -format:csv \
    -prefix:"${prefix}" \
    -pattern:"*.csv" \
    -ingestionMappingRef:"$mapping" \
    -creationTimePattern:"'${prefix}/'yyyyMMdd'/'" \
    -dontWait:true
```

PowerShell
```powershell
# Load env variables (optional)
# Reads .env line by line and sets each NAME=VALUE pair as a process environment variable.
Get-Content .env | ForEach-Object {
    # Split on the first '=' only, so values may themselves contain '='.
    $name, $value = $_.split('=', 2)
    # Skip blank lines and any line whose name part contains '#'.
    if (-not [string]::IsNullOrWhiteSpace($name) -and -not $name.Contains('#')) {
        # Remove every double quote from the value before storing it.
        $cleanedValue = $value -replace '"', ''
        Set-Content env:$name $cleanedValue
    }
}

# Queue ingestion of the uploaded part files into ADX with LightIngest.
$table_name="AmortizedCost"
$mapping="AmortizedCost_mapping"
$container_name="${env:EXPORT_LATEST_PARTS_CONTAINER}"
$prefix="Azure/Actual/20221101-20221130"
$source_sas="${env:EXPORT_LATEST_PARTS_SAS}"
tools/LightIngest "https://ingest-${env:adx_cluster_name}.${env:adx_region}.kusto.windows.net;Fed=True" `
    -database:"${env:adx_database_name}" `
    -table:"$table_name" `
    -source:"https://${env:storage_account_name}.blob.core.windows.net/${container_name}?${source_sas}" `
    -format:csv `
    -prefix:"${prefix}" `
    -pattern:"*.csv" `
    -ingestionMappingRef:"$mapping" `
    -creationTimePattern:"'${prefix}/'yyyyMMdd'/'" `
    -dontWait:true
```


Helpful ADX queries

```kusto
// Data Quality Checks: row count per usage date
AmortizedCost
| summarize count() by ['Date']

// Min Max Date
['AmortizedCost']
| summarize min(['Date']), max(['Date'])

// Cost by month
AmortizedCost
| summarize cost=round(sum(CostInBillingCurrency), 2) by startofmonth = startofmonth(['Date'])

// Cost by month, with the month formatted as yyyy-MM
AmortizedCost
| summarize cost=round(sum(CostInBillingCurrency),2) by MeterCategory, month = startofmonth(['Date'])
| project year_month=format_datetime(month, 'yyyy-MM'), cost
| summarize cost=sum(cost) by year_month
| order by year_month asc

// Cost by month and category
AmortizedCost
| summarize Costs = round(sum(CostInBillingCurrency),2) by MeterCategory, Month = startofmonth(['Date'])

// Cost by month (unrounded), with the month formatted as yyyy-MM
AmortizedCost
| extend year_month=format_datetime(['Date'], 'yyyy-MM')
| summarize cost=sum(CostInBillingCurrency) by year_month
| order by year_month asc

// CostGrowth over time: compares two date ranges per MeterCategory
let PreviousCosts = AmortizedCost
| where ['Date'] between (datetime(2023-05-01) .. datetime(2023-05-31))
| summarize PreviousCost = sum(CostInBillingCurrency) by MeterCategory;
let CurrentCosts = AmortizedCost
| where ['Date'] between (datetime(2024-05-01) .. datetime(2024-05-31))
| summarize CurrentCost = sum(CostInBillingCurrency) by MeterCategory;
CurrentCosts
| join kind = inner (
    PreviousCosts
    ) on MeterCategory
| project MeterCategory, round(CurrentCost,2), round(PreviousCost,2), CostGrowth= round((CurrentCost - PreviousCost) ,2)
| sort by CostGrowth

// List the ingestion mappings
.show ingestion mappings

// Show tag key names and counts
// Tags is wrapped in braces and lower-cased before being parsed as JSON.
AmortizedCost
| where ['Date'] between (datetime(2024-05-01) .. datetime(2024-05-31))
| extend TagJson = strcat("{", tolower(tostring(Tags)), "}")
| extend ParsedTags = parse_json(TagJson)
| mv-expand Tag = ParsedTags
| extend TagKey = tostring(bag_keys(Tag)[0])
| extend TagValue = tostring(Tag[TagKey])
| summarize UniqueValuesCount = dcount(TagValue) by TagKey
| sort by UniqueValuesCount
```

## Clean up temp Files

Remove the temp files

```bash
# `source_path` is only defined in the Python upload cell; set it in this shell first.
# The `:?` guard aborts when it is unset or empty, so the glob can never expand to /*.csv.
rm "${source_path:?set source_path to the directory that holds the split part files}"/*.csv
``` 