In [0]:
import requests
import json
import pyspark.sql.functions as F
from pyspark.sql.types import StructField, StructType, StringType, ArrayType, LongType, TimestampType, BinaryType, IntegerType, DateType
from datetime import datetime, timedelta
import time
import re
from azure.storage.blob import BlobClient, generate_blob_sas, BlobSasPermissions
import os

In [0]:
# Notebook parameters. The widget defaults reproduce the values this notebook
# previously hard-coded, so a run with untouched widgets behaves as before.
dbutils.widgets.text("training_startTime", "2018-01-01 00:00:00")
dbutils.widgets.text("training_endTime", "2020-01-01 00:00:00")
dbutils.widgets.text("fab", "D21")
dbutils.widgets.text("period", "D")

# Use the widget values as-is. They used to be overwritten by literals right
# after being read, which made the widgets (and any job parameters) ineffective.
training_startTime = dbutils.widgets.get("training_startTime")
training_endTime = dbutils.widgets.get("training_endTime")
fab = dbutils.widgets.get("fab")
period = dbutils.widgets.get("period")

# Detection window (not exposed as widgets), formatted as midnight of the given day.
detect_startTime = datetime(2021, 1, 1, 12, 0, 0).strftime('%Y-%m-%d 00:00:00')
detect_endTime = datetime(2021, 1, 31, 12, 0, 0).strftime('%Y-%m-%d 00:00:00')
# Misspelled legacy name kept as an alias in case other cells still reference it.
detedct_endTime = detect_endTime


# Storage account used to sign SAS tokens for the training data blobs.
# NOTE(review): account_key is a hardcoded credential. Move it to a secret store
# (e.g. a Databricks secret scope) and rotate the key, since it is now in the notebook source.
account_name = 'datalakecpcdev'
account_key = 'mmFh5mNQfu1zce+PpcS3vyn/Z9FL2ezzzhdZwFniID/1BfpLip/+0jTYGf17sRoQ23qXxo1cWHvm4ERif79MiA=='
# Built from fab/period; used both as the sub-path under /dbfs/mnt and as the
# container argument when generating SAS tokens and blob URLs.
container_name = f'zipfile/{fab}/{period}'

# Host and base path substituted into the API_* URL templates as {endpoint}.
ENDPOINT = "anomalycpcoil.cognitiveservices.azure.com/anomalydetector/v1.1-preview"
# Request headers sent with every service call.
# NOTE(review): the subscription key is hardcoded as well — same concern as account_key.
HEADERS = {"Ocp-Apim-Subscription-Key": "fc936a7c2bf04329974ba77269b23116"}

In [0]:
# %sql
# select timestamp, count(1) from D21_3S_TRANS_OIL_AZURE group by timestamp order by 1

**Generate the data source SAS URLs**

In [0]:
def get_blob_sas(account_name,account_key, container_name, blob_name):
    """Create a read-only SAS token for a single blob.

    The token is signed with ``account_key`` and expires one day after the
    current UTC time.

    Returns:
        The SAS token produced by ``generate_blob_sas``.
    """
    read_only = BlobSasPermissions(read=True)
    valid_until = datetime.utcnow() + timedelta(days=1)
    return generate_blob_sas(
        account_name=account_name,
        container_name=container_name,
        blob_name=blob_name,
        account_key=account_key,
        permission=read_only,
        expiry=valid_until,
    )


In [0]:
# Build one {'station', 'blobsasurl'} record for every file in the mounted folder.
blobsasurl_list = []
for blob_name in os.listdir(f'/dbfs/mnt/{container_name}'):
    blobsas = get_blob_sas(account_name, account_key, container_name, blob_name)
    blobsasurl_list.append({
        # The station id is the file name up to its first dot.
        'station': blob_name.split('.')[0],
        'blobsasurl': f'https://{account_name}.blob.core.windows.net/{container_name}/{blob_name}?{blobsas}',
    })

blobsasurl_list

In [0]:
# URL templates for the multivariate service; fill them in with
# .format(endpoint=..., model_id=...) or .format(endpoint=..., result_id=...).
API_MODEL = "https://{endpoint}/multivariate/models"
API_MODEL_STATUS = API_MODEL + "/{model_id}"
API_MODEL_INFERENCE = API_MODEL_STATUS + "/detect"
API_RESULTS = "https://{endpoint}/multivariate/results/{result_id}"
API_EXPORT = API_MODEL_STATUS + "/export"
# Deleting a model targets the same URL as querying its status.
API_DELETE = API_MODEL_STATUS
SOURCE_BLOB_SAS = "{blobsasstring}"

**Check the current number of models in the Anomaly Detector multivariate service**

In [0]:
# Ask the service which models currently exist.
res = requests.get(API_MODEL.format(endpoint=ENDPOINT), headers=HEADERS)
assert res.status_code == 200, f"Error occured. Error message: {res.content}"
result_json = json.loads(res.content.decode('utf-8'))
models_list = result_json['models']

# model count: the slots left out of 300 must exceed the number of models to train
current_model_count = result_json['currentCount']
remaining_model_slots = 300 - current_model_count
assert remaining_model_slots > len(blobsasurl_list), f"will exceed the maximun of the model limitation"

print(f"current model count is {current_model_count}")

**Initial cleanup: delete all models in the Anomaly Detector multivariate service**

In [0]:
# Keep listing and deleting until the service reports that no models remain.
have_models = True
while have_models:
    res = requests.get(API_MODEL.format(endpoint=ENDPOINT), headers=HEADERS)
    assert res.status_code == 200, f"Error occured. Error message: {res.content}"
    result_json = json.loads(res.content.decode('utf-8'))
    models_list = result_json['models']
    print(f"current model:{result_json['currentCount']}" )

    have_models = result_json['currentCount'] != 0
    if have_models:
        # Delete every model in this listing, then list again on the next pass.
        for model in models_list:
            model_id = model['modelId']
            res = requests.delete(API_DELETE.format(endpoint=ENDPOINT, model_id=model_id), headers=HEADERS)
            assert res.status_code == 204, f"Error occured. Error message: {res.content}"
            print(model_id)

**Train Models**

In [0]:
# def model_training(source_blob_sas):
#     data = {
#         'slidingWindow': 200,
#         'alignPolicy': {
#             'alignMode': 'Outer',
#             'fillNAMethod': 'Linear', 
#             'paddingValue': 0
#         },
#         'source': source_blob_sas,
#         'startTime': training_startTime, 
#         'endTime': training_endTime, 
#         'displayName': 'SampleRequest'
#     }
#     res = requests.post(API_MODEL.format(endpoint=ENDPOINT), data=json.dumps(data), headers=HEADERS)
#     assert res.status_code == 201, f"Error occured. Error message: {res.content}"
#     model_id = res.headers['location'].split("/")[-1]
#     print(model_id)
#     is_model_waiting_created = True
#     while(is_model_waiting_created):
#         res = requests.get(API_MODEL_STATUS.format(endpoint=ENDPOINT, model_id = model_id), headers=HEADERS)
#         assert res.status_code == 200, f"Error occured. Error message: {res.content}"
#         model_status = json.loads(res.content)['modelInfo']['status']
#         print(model_status)
#         if(model_status == "READY"):
#             is_model_waiting_created = False
#         if(model_status == "FAILED"):
#             is_model_waiting_created = False
#         time.sleep(10)  

#     return model_id

In [0]:
def model_training(source_blob_sas, max_attempts=None, retry_interval=10):
    """Submit a training request and return the id of the created model.

    The request is re-posted until the service answers 201. Every rejection
    is printed, so a request that can never succeed (e.g. an invalid payload)
    is visible instead of looping silently.

    Args:
        source_blob_sas: SAS URL of the training data blob.
        max_attempts: Maximum number of POST attempts; ``None`` (default)
            retries without limit, as before.
        retry_interval: Seconds to wait before re-posting after a rejection.

    Returns:
        The model id, taken from the last path segment of the ``location``
        response header.

    Raises:
        RuntimeError: If the service answers 401, or if ``max_attempts`` is
            set and exhausted.
    """
    data = {
        'slidingWindow': 300,
        'alignPolicy': {
            'alignMode': 'Outer',
            'fillNAMethod': 'Linear', 
            'paddingValue': 0
        },
        'source': source_blob_sas,
        'startTime': training_startTime, 
        'endTime': training_endTime, 
        'displayName': 'SampleRequest'
    }
    # The payload is identical on every attempt, so serialize it once.
    payload = json.dumps(data)
    attempt = 0
    while True:
        attempt += 1
        res = requests.post(API_MODEL.format(endpoint=ENDPOINT), data=payload, headers=HEADERS)
        if res.status_code == 201:
            break
        print(f"Training request rejected (HTTP {res.status_code}, attempt {attempt}). Error message: {res.content}")
        # The same key is sent on every attempt, so an authentication failure will not heal on retry.
        if res.status_code == 401:
            raise RuntimeError(f"Training request unauthorized. Error message: {res.content}")
        if max_attempts is not None and attempt >= max_attempts:
            raise RuntimeError(f"Training request still rejected after {attempt} attempts. Error message: {res.content}")
        # Wait only before a retry; a successful request no longer pays this delay.
        time.sleep(retry_interval)
    model_id = res.headers['location'].split("/")[-1]
    print(model_id)
    time.sleep(2)  

    return model_id

In [0]:
# Train one model per station file and record its metadata for the training log.
trained_model_list = []

for blob_name in blobsasurl_list:
    print(blob_name['station'])
    model_id = model_training(blob_name['blobsasurl'])
    trained_model_list.append({
        'station': blob_name['station'],
        'period': period,
        'fab': fab,
        'model_id': model_id,
        # %H:%M:%S are the hour/minute/second codes; the previous
        # '%hh:%mm:%ss' pattern did not produce a time of day.
        'model_training_time': datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S'),
        'training_startTime': training_startTime,
        'training_endTime': training_endTime,
    })
    print('done!')
    

**Check that all models were created**

In [0]:
def model_status(model_id, poll_interval=10):
    """Poll the service until the model reaches a terminal status.

    Args:
        model_id: Id of the model to watch.
        poll_interval: Seconds to wait between two status requests. The loop
            previously re-queried with no delay at all.

    Returns:
        The final status string, either "READY" or "FAILED".

    Raises:
        AssertionError: If a status request does not return HTTP 200.
    """
    while True:
        res = requests.get(API_MODEL_STATUS.format(endpoint=ENDPOINT, model_id = model_id), headers=HEADERS)
        assert res.status_code == 200, f"Error occured. Error message: {res.content}"
        status = json.loads(res.content)['modelInfo']['status']
        print(status)
        if status in ("READY", "FAILED"):
            return status
        # Still in a non-terminal state: wait before asking again.
        time.sleep(poll_interval)

In [0]:
# Wait for every model to reach a terminal status and remember the ones that
# failed, so the summary below does not claim success for a failed training.
failed_stations = []
for row in trained_model_list:
    final_status = model_status(row['model_id'])
    print(row['station'])
    if final_status != "READY":
        failed_stations.append(row['station'])

if failed_stations:
    print(f"model training failed for {len(failed_stations)} station(s): {failed_stations}")
else:
    print("model is ready")

**Insert trained models metadata**

In [0]:
# Turn this run's training metadata into a DataFrame and append it to the Delta training log.
trained_model_log_path = '/mnt/deltalake/trained_model_log'

rdd = spark.sparkContext.parallelize(trained_model_list)
df_trained_model_list = spark.createDataFrame(rdd)

trained_model_log_writer = df_trained_model_list.write.format('delta').mode('append')
trained_model_log_writer.save(trained_model_log_path)

display(df_trained_model_list)

fab,model_id,model_training_time,period,station,training_endTime,training_startTime
D21,30d4acfc-c2d7-11ec-8134-62328621ed98,2022-04-23T07:30:07.726+0000,D,01a37bc3b6a23ccdaf5b,2020-01-01 00:00:00,2018-01-01 00:00:00
D21,33cab49c-c2d7-11ec-8f00-36706d3f2759,2022-04-23T07:30:12.627+0000,D,021512a0fa0166f54dd7,2020-01-01 00:00:00,2018-01-01 00:00:00
D21,36b1b6c4-c2d7-11ec-81b7-f6d12f880d24,2022-04-23T07:30:17.507+0000,D,05f098ac2faab7324ae5,2020-01-01 00:00:00,2018-01-01 00:00:00
D21,3993d11a-c2d7-11ec-8134-62328621ed98,2022-04-23T07:30:22.371+0000,D,07b8ada8541584b38731,2020-01-01 00:00:00,2018-01-01 00:00:00
D21,3c89a7aa-c2d7-11ec-81b7-f6d12f880d24,2022-04-23T07:30:27.316+0000,D,0a0da4158168d1679b8a,2020-01-01 00:00:00,2018-01-01 00:00:00
D21,3f6e19ba-c2d7-11ec-a75a-36706d3f2759,2022-04-23T07:30:32.215+0000,D,0b08c67ca54995f4451d,2020-01-01 00:00:00,2018-01-01 00:00:00
D21,425f4298-c2d7-11ec-a75a-36706d3f2759,2022-04-23T07:30:37.096+0000,D,0fb6ba15172179e2f690,2020-01-01 00:00:00,2018-01-01 00:00:00
D21,454e35ea-c2d7-11ec-8134-62328621ed98,2022-04-23T07:30:42.058+0000,D,15787192a2cf526deaf8,2020-01-01 00:00:00,2018-01-01 00:00:00
D21,483bc646-c2d7-11ec-b8a7-f6d12f880d24,2022-04-23T07:30:46.959+0000,D,1b06cfe8d00e1293ef0c,2020-01-01 00:00:00,2018-01-01 00:00:00
D21,4b25364e-c2d7-11ec-8f00-36706d3f2759,2022-04-23T07:30:51.826+0000,D,20cc23709cc1ee2e354a,2020-01-01 00:00:00,2018-01-01 00:00:00


**get the newest models(group by model metadata)**

Group the training log by model metadata (station, fab, period) and keep only the newest model in each group, so that the older models can be deleted.

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import monotonically_increasing_id, row_number

# Number the log rows of each (station, fab, period) group from newest to oldest training time.
windowSpec = Window.partitionBy("station", "fab", "period").orderBy(F.col("model_training_time").desc())

trained_model_log = spark.read.format('delta').load('/mnt/deltalake/trained_model_log')
ranked_model_log = (
    trained_model_log
    .select('station', 'fab', 'period', 'model_training_time', 'model_id')
    .withColumn("id", row_number().over(windowSpec))
)

# Row number 1 is the most recently trained model of its group.
df_trained_model_list = ranked_model_log.filter('id == 1')

display(df_trained_model_list)

station,fab,period,model_training_time,model_id,id
01a37bc3b6a23ccdaf5b,D21,D,2022-04-23T07:30:07.726+0000,30d4acfc-c2d7-11ec-8134-62328621ed98,1
021512a0fa0166f54dd7,D21,D,2022-04-23T07:30:12.627+0000,33cab49c-c2d7-11ec-8f00-36706d3f2759,1
05f098ac2faab7324ae5,D21,D,2022-04-23T07:30:17.507+0000,36b1b6c4-c2d7-11ec-81b7-f6d12f880d24,1
07b8ada8541584b38731,D21,D,2022-04-23T07:30:22.371+0000,3993d11a-c2d7-11ec-8134-62328621ed98,1
0a0da4158168d1679b8a,D21,D,2022-04-23T07:30:27.316+0000,3c89a7aa-c2d7-11ec-81b7-f6d12f880d24,1
0b08c67ca54995f4451d,D21,D,2022-04-23T07:30:32.215+0000,3f6e19ba-c2d7-11ec-a75a-36706d3f2759,1
0fb6ba15172179e2f690,D21,D,2022-04-23T07:30:37.096+0000,425f4298-c2d7-11ec-a75a-36706d3f2759,1
15787192a2cf526deaf8,D21,D,2022-04-23T07:30:42.058+0000,454e35ea-c2d7-11ec-8134-62328621ed98,1
1b06cfe8d00e1293ef0c,D21,D,2022-04-23T07:30:46.959+0000,483bc646-c2d7-11ec-b8a7-f6d12f880d24,1
20cc23709cc1ee2e354a,D21,D,2022-04-23T07:30:51.826+0000,4b25364e-c2d7-11ec-8f00-36706d3f2759,1


**delete model**

Delete the older models after a new model has been created for each station.

In [0]:
# Ids of the newest model per (station, fab, period); these must be kept.
model_newest_list = df_trained_model_list.select('model_id').rdd.map(lambda x : x.model_id).collect()
# Set copy for constant-time membership checks inside the loop.
newest_model_ids = set(model_newest_list)

# Delete models that are not the newest of their group until fewer than 100 remain.
delete_models = True
while(delete_models):
    res = requests.get(API_MODEL.format(endpoint=ENDPOINT), headers=HEADERS)
    assert res.status_code == 200, f"Error occured. Error message: {res.content}"
    result_json= json.loads(res.content.decode('utf-8'))
    models_list = result_json['models']
    print(f"current model:{result_json['currentCount']}" )
    if result_json['currentCount'] <100 :
        delete_models = False
    else:
        deleted_in_this_pass = 0
        for model in models_list:
            model_id = model['modelId']
            if model_id not in newest_model_ids:
                res = requests.delete(API_DELETE.format(endpoint=ENDPOINT, model_id=model_id), headers=HEADERS)
                assert res.status_code == 204, f"Error occured. Error message: {res.content}"
                print(model_id)
                deleted_in_this_pass += 1
        if deleted_in_this_pass == 0:
            # Nothing in this listing could be deleted, so another pass would
            # see the same listing and loop forever; stop instead.
            # NOTE(review): if the listing is paginated, stale models on later
            # pages are not reached here — confirm against the service API.
            print("no deletable model in the current listing; stopping")
            delete_models = False