In [0]:
import requests
import json
import pyspark.sql.functions as F
from pyspark.sql.types import StructField, StructType, StringType, ArrayType, LongType, TimestampType, BinaryType, IntegerType, DateType
from datetime import datetime, timedelta
import time
import re
from azure.storage.blob import BlobClient, generate_blob_sas, BlobSasPermissions
import os

In [0]:
# Notebook parameters, exposed as Databricks widgets.
# The timestamp defaults carry a trailing "Z" so that they parse with
# `format_datatime` below; without it a default run fails in strptime().
dbutils.widgets.text("training_startTime", "2018-01-01T00:00:00Z")
dbutils.widgets.text("training_endTime", "2020-01-01T00:00:00Z")
dbutils.widgets.text("fab", "D21")
dbutils.widgets.text("period", "D")
dbutils.widgets.text("slidingWindow", "40")

training_startTime = dbutils.widgets.get("training_startTime")
training_endTime = dbutils.widgets.get("training_endTime")
fab = dbutils.widgets.get("fab")
period = dbutils.widgets.get("period")
slidingWindow = int(dbutils.widgets.get("slidingWindow"))

# Date-only views of the training window (also validates the timestamp format).
format_datatime = "%Y-%m-%dT%H:%M:%SZ"
format_date = "%Y-%m-%d"
training_startDate = datetime.strptime(training_startTime, format_datatime).strftime(format_date)
training_endDate = datetime.strptime(training_endTime, format_datatime).strftime(format_date)

# Commented-out manual parameter values kept from the original notebook:
# training_startTime = datetime(2018, 1, 1, 0, 0, 0).strftime('%Y-%m-%d 00:00:00')
# training_endTime = datetime(2022, 4, 1, 0, 0, 0).strftime('%Y-%m-%d 00:00:00')
# fab = "D21"
# period = "M"

# Storage account holding the training files, laid out per fab and period.
account_name = 'datalakecpcdev'
account_key = dbutils.secrets.get(scope = 'cpc-keyvault-dev', key = 'datalakegne2-datalakecpcdev-key')
container_name = f'zipfile/{fab}/{period}'

# Added 2022-05-05 (original author note: 聖元): each period uses its own
# Anomaly Detector resource. Maps period -> (resource host prefix, secret name).
_ANOMALY_SERVICE_BY_PERIOD = {
    'D': ("anomalycpcoil", 'cognitiveservice-anomalycpcoil-secret'),
    'W': ("w-anomalyoil", 'cognitiveservice-wanomalyoil-secret'),
    'M': ("m-anomalyoil", 'cognitiveservice-manomalyoil-secret'),
}

# Fail here with a clear message instead of a NameError on ENDPOINT later on.
if period not in _ANOMALY_SERVICE_BY_PERIOD:
    raise ValueError(
        f"Unsupported period {period!r}; expected one of {sorted(_ANOMALY_SERVICE_BY_PERIOD)}"
    )

_resource_name, _secret_name = _ANOMALY_SERVICE_BY_PERIOD[period]
ENDPOINT = f"{_resource_name}.cognitiveservices.azure.com/anomalydetector/v1.1-preview"
HEADERS = {"Ocp-Apim-Subscription-Key": dbutils.secrets.get(scope = 'cpc-keyvault-dev', key = _secret_name)}

In [0]:
# Echo the run parameters. HEADERS is deliberately not printed: it holds the
# Cognitive Services subscription key and must not end up in notebook output.
print(fab, period, training_startTime, training_endTime ,slidingWindow)
print(ENDPOINT)

In [0]:
# %sql
# select timestamp, count(1) from D21_3S_TRANS_OIL_AZURE group by timestamp order by 1

**Generate data source SAS URLs**

In [0]:
def get_blob_sas(account_name,account_key, container_name, blob_name):
    """Create a read-only SAS token for a single blob.

    Args:
        account_name: Storage account name.
        account_key: Storage account key used to sign the token.
        container_name: Container value passed through to ``generate_blob_sas``.
        blob_name: Name of the blob the token grants access to.

    Returns:
        Whatever ``generate_blob_sas`` returns for a token with read permission
        that expires one day after the current UTC time.
    """
    read_only = BlobSasPermissions(read=True)
    expires_at = datetime.utcnow() + timedelta(days=1)
    return generate_blob_sas(
        account_name=account_name,
        container_name=container_name,
        blob_name=blob_name,
        account_key=account_key,
        permission=read_only,
        expiry=expires_at,
    )


In [0]:
# Build one {'station', 'blobsasurl'} entry per file found under the mounted
# training folder. The station id is the file name up to its first dot.
blobsasurl_list = []
# sorted(): os.listdir() returns entries in arbitrary order; sorting makes the
# training order reproducible between runs.
for blob_name in sorted(os.listdir(f'/dbfs/mnt/{container_name}')):
    blobsas = get_blob_sas(account_name,account_key, container_name, blob_name)
    blobsasurl_list.append({
        'station': blob_name.split('.')[0],
        'blobsasurl': f'https://{account_name}.blob.core.windows.net/{container_name}/{blob_name}?{blobsas}',
    })

# Display only the station ids: each SAS URL embeds a signed read token and
# should not be stored in the notebook's cell output.
[entry['station'] for entry in blobsasurl_list]

In [0]:
# URL templates for the multivariate endpoints used in this notebook.
# Fill them in with str.format(endpoint=..., model_id=..., result_id=...).
_MULTIVARIATE_ROOT = "https://{endpoint}/multivariate"
_SINGLE_MODEL = _MULTIVARIATE_ROOT + "/models/{model_id}"

API_MODEL = _MULTIVARIATE_ROOT + "/models?$top=300"
API_MODEL_STATUS = _SINGLE_MODEL
API_MODEL_INFERENCE = _SINGLE_MODEL + "/detect"
API_RESULTS = _MULTIVARIATE_ROOT + "/results/{result_id}"
API_EXPORT = _SINGLE_MODEL + "/export"
API_DELETE = _SINGLE_MODEL
SOURCE_BLOB_SAS = "{blobsasstring}"

**Check the current number of models in the Anomaly Detector multivariate service**

In [0]:
# List the models currently held by the service and make sure there is room
# for one new model per station before any training request is sent.
res = requests.get(API_MODEL.format(endpoint=ENDPOINT), headers=HEADERS, timeout=60)
if res.status_code != 200:
    raise RuntimeError(f"Error occurred. Error message: {res.content}")
result_json = json.loads(res.content.decode('utf-8'))
models_list = result_json['models']

# model count
# Use the limit reported in the response when present; otherwise fall back to
# the 300 that this notebook has always assumed.
max_model_count = result_json.get('maxCount', 300)
remaining_slots = max_model_count - result_json['currentCount']
# ">=" semantics: filling the quota exactly is still allowed.
if remaining_slots < len(blobsasurl_list):
    raise RuntimeError(
        f"will exceed the maximum model limit: {remaining_slots} slot(s) left, "
        f"{len(blobsasurl_list)} model(s) to train"
    )

print(f"current model count is {result_json['currentCount']}")
#df_raw.select('modelId').collect()[0].modelId

# models_list = result_json['models']
# rdd = spark.sparkContext.parallelize(models_list)
# df_models = spark.createDataFrame(rdd)
# display(df_models)

**(Removed) Initial deletion of all models in the Anomaly Detector multivariate service**

In [0]:
# res = requests.get(API_MODEL.format(endpoint=ENDPOINT), headers=HEADERS)
# assert res.status_code == 200, f"Error occured. Error message: {res.content}"
# result_json= json.loads(res.content.decode('utf-8'))
# models_list = result_json['models']
# print(f"current model:{result_json['currentCount']}" )
# if result_json['currentCount'] > 0 :
#     for model in models_list:
#         model_id = model['modelId']
#         print(model_id)
#         #res = requests.delete(API_DELETE.format(endpoint=ENDPOINT, model_id=model_id), headers=HEADERS)
#         #assert res.status_code == 204, f"Error occured. Error message: {res.content}"
#         print(model_id)

**Train Models**

In [0]:
def model_training(source_blob_sas ,station, max_attempts=60, retry_delay=10):
    """Submit a multivariate training request and return the new model id.

    The request body is built from the notebook-level ``slidingWindow``,
    ``training_startTime``, ``training_endTime`` and ``period`` values, and is
    posted to ``API_MODEL`` using ``ENDPOINT`` and ``HEADERS``.

    Args:
        source_blob_sas: SAS URL of the training data, sent as ``source``.
        station: Station id; used in the model display name ``{period}_{station}``.
        max_attempts: Maximum number of POST attempts before giving up.
        retry_delay: Seconds to wait between failed attempts.

    Returns:
        The model id, taken from the last path segment of the response's
        ``location`` header.

    Raises:
        ValueError: If ``max_attempts`` is smaller than 1.
        RuntimeError: If the service rejects the credentials (401/403) or no
            attempt returned HTTP 201 within ``max_attempts`` tries.
    """
    if max_attempts < 1:
        raise ValueError("max_attempts must be at least 1")

    data = {
        'slidingWindow': slidingWindow,
        'alignPolicy': {
            'alignMode': 'Outer',
            'fillNAMethod': 'Linear',
            'paddingValue': 0
        },
        'source': source_blob_sas,
        'startTime': training_startTime,
        'endTime': training_endTime,
        'displayName': f'{period}_{station}'
    }
    url = API_MODEL.format(endpoint=ENDPOINT)
    payload = json.dumps(data)

    for attempt in range(1, max_attempts + 1):
        res = requests.post(url, data=payload, headers=HEADERS, timeout=60)
        if res.status_code == 201:
            break
        # Authentication/authorisation failures will not fix themselves.
        if res.status_code in (401, 403):
            raise RuntimeError(
                f"Training request for station {station} was rejected "
                f"(HTTP {res.status_code}): {res.content}"
            )
        # Surface the reason instead of retrying silently.
        print(f"Training request for station {station} failed "
              f"(attempt {attempt}/{max_attempts}, HTTP {res.status_code}): {res.content}")
        if attempt < max_attempts:
            time.sleep(retry_delay)
    else:
        # Loop finished without a 201: stop instead of retrying forever.
        raise RuntimeError(
            f"Training request for station {station} did not succeed after "
            f"{max_attempts} attempts. Last response (HTTP {res.status_code}): {res.content}"
        )

    model_id = res.headers['location'].split("/")[-1]
    print(model_id)
    # Short pause kept from the original to space out consecutive requests.
    time.sleep(2)

    return model_id

In [0]:
trained_model_list = []

# Train one model per station and keep the metadata that is later appended to
# the trained-model log.
for source in blobsasurl_list:
    station = source['station']
    print(station)
    model_id = model_training(source['blobsasurl'], station)
    trained_model_list.append({
        'station': station,
        'period': period,
        'fab': fab,
        'model_id': model_id,
        # UTC time taken right after the training request returned.
        'model_training_time': datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S'),
        'training_startTime': training_startTime,
        'training_endTime': training_endTime,
    })
    print('done!')
    

**Check that all models were created**

In [0]:
def model_status(model_id, poll_interval=10, timeout=None):
    """Poll the service until the model is READY or FAILED.

    Args:
        model_id: Id of the model to check.
        poll_interval: Seconds to wait between two status requests, so the
            service is not queried in a tight loop.
        timeout: Optional overall limit in seconds; ``None`` waits indefinitely.

    Returns:
        The final status string, ``"READY"`` or ``"FAILED"``.

    Raises:
        RuntimeError: If a status request does not return HTTP 200.
        TimeoutError: If ``timeout`` is set and elapses before a final status.
    """
    url = API_MODEL_STATUS.format(endpoint=ENDPOINT, model_id = model_id)
    deadline = None if timeout is None else time.monotonic() + timeout

    while True:
        res = requests.get(url, headers=HEADERS, timeout=60)
        if res.status_code != 200:
            raise RuntimeError(f"Error occurred. Error message: {res.content}")
        # Local name differs from the function name to avoid shadowing it.
        status = json.loads(res.content)['modelInfo']['status']

        if status == "READY":
            return status

        if status == "FAILED":
            print(status)
            return status

        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                f"Model {model_id} still has status {status!r} after {timeout} seconds"
            )
        time.sleep(poll_interval)

In [0]:
# Wait for every newly trained model and collect the stations whose model
# ended up FAILED instead of discarding the returned status.
failed_stations = []
for row in trained_model_list:
    print(row['station'])
    if model_status(row['model_id']) == "FAILED":
        failed_stations.append(row['station'])

# Stop here on failure: the following cells log the new models and delete the
# previous ones, which would leave these stations without a usable model.
if failed_stations:
    raise RuntimeError(f"Model training FAILED for station(s): {failed_stations}")

**Insert trained models metadata**

In [0]:
# Explicit all-string schema for the log rows. This avoids inferring the schema
# from dicts and also works when trained_model_list is empty. The field order
# matches the column order shown in the output below.
trained_model_schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        'fab',
        'model_id',
        'model_training_time',
        'period',
        'station',
        'training_endTime',
        'training_startTime',
    )
])
df_trained_model_list = spark.createDataFrame(trained_model_list, schema=trained_model_schema)

# Append this run's models to the Delta log of trained models.
(df_trained_model_list.write
         .format('delta')
         .mode('append')
         .save('/mnt/deltalake/trained_model_log')
)

display(df_trained_model_list)

fab,model_id,model_training_time,period,station,training_endTime,training_startTime
D21,e15cf72a-d5b8-11ec-bf9c-aaf07dc5582f,2022-05-17 08:11:09,W,01a37bc3b6a23ccdaf5b,2022-05-16T00:00:00Z,2018-01-01T00:00:00Z
D21,e922584c-d5b8-11ec-8634-aaf07dc5582f,2022-05-17 08:11:22,W,021512a0fa0166f54dd7,2022-05-16T00:00:00Z,2018-01-01T00:00:00Z
D21,f0e47ede-d5b8-11ec-bf9c-aaf07dc5582f,2022-05-17 08:11:35,W,05f098ac2faab7324ae5,2022-05-16T00:00:00Z,2018-01-01T00:00:00Z
D21,f894f01e-d5b8-11ec-bf9c-aaf07dc5582f,2022-05-17 08:11:48,W,07b8ada8541584b38731,2022-05-16T00:00:00Z,2018-01-01T00:00:00Z
D21,00482006-d5b9-11ec-b5eb-22fd69dbc8e5,2022-05-17 08:12:01,W,0a0da4158168d1679b8a,2022-05-16T00:00:00Z,2018-01-01T00:00:00Z
D21,081a448a-d5b9-11ec-84ae-a60db5108cf6,2022-05-17 08:12:14,W,0b08c67ca54995f4451d,2022-05-16T00:00:00Z,2018-01-01T00:00:00Z
D21,0fd1b2e4-d5b9-11ec-bf9c-aaf07dc5582f,2022-05-17 08:12:27,W,0fb6ba15172179e2f690,2022-05-16T00:00:00Z,2018-01-01T00:00:00Z
D21,178f78ea-d5b9-11ec-bf9c-aaf07dc5582f,2022-05-17 08:12:40,W,15787192a2cf526deaf8,2022-05-16T00:00:00Z,2018-01-01T00:00:00Z
D21,1f449c0a-d5b9-11ec-b5eb-22fd69dbc8e5,2022-05-17 08:12:53,W,1b06cfe8d00e1293ef0c,2022-05-16T00:00:00Z,2018-01-01T00:00:00Z
D21,270f6c6c-d5b9-11ec-b5eb-22fd69dbc8e5,2022-05-17 08:13:06,W,20cc23709cc1ee2e354a,2022-05-16T00:00:00Z,2018-01-01T00:00:00Z


**Query the previous models (grouped by model metadata)**

Rank the logged models within each (station, fab, period) group by training time, newest first, and select the second-newest one so that it can be deleted.

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import monotonically_increasing_id, row_number

# Rank the logged models of each (station, fab, period) group, newest first.
windowSpec = Window.partitionBy("station","fab","period").orderBy(F.desc("model_training_time"))

trained_model_log = spark.read.format('delta').load('/mnt/deltalake/trained_model_log')

# Keep only the current fab/period and pick rank 2 in each group, i.e. the
# model trained immediately before the newest one.
df_trained_model_list = (
    trained_model_log
    .select('station','fab','period','model_training_time','model_id')
    .filter((F.col('fab') == fab) & (F.col('period') == period))
    .withColumn("id", F.row_number().over(windowSpec))
    .filter(F.col('id') == 2)
)

display(df_trained_model_list)

station,fab,period,model_training_time,model_id,id
01a37bc3b6a23ccdaf5b,D21,W,2022-05-17 07:33:13,948a48d0-d5b3-11ec-ac92-a60db5108cf6,2
021512a0fa0166f54dd7,D21,W,2022-05-17 07:33:26,9c39ffda-d5b3-11ec-9ef3-22fd69dbc8e5,2
05f098ac2faab7324ae5,D21,W,2022-05-17 07:33:39,a402f2da-d5b3-11ec-9ef3-22fd69dbc8e5,2
07b8ada8541584b38731,D21,W,2022-05-17 07:33:52,abc73a80-d5b3-11ec-b5eb-22fd69dbc8e5,2
0a0da4158168d1679b8a,D21,W,2022-05-17 07:34:05,b3874684-d5b3-11ec-ac92-a60db5108cf6,2
0b08c67ca54995f4451d,D21,W,2022-05-17 07:34:18,bb3d512a-d5b3-11ec-bf9c-aaf07dc5582f,2
0fb6ba15172179e2f690,D21,W,2022-05-17 07:34:31,c301e560-d5b3-11ec-9ef3-22fd69dbc8e5,2
15787192a2cf526deaf8,D21,W,2022-05-17 07:34:44,cac10632-d5b3-11ec-8634-aaf07dc5582f,2
1b06cfe8d00e1293ef0c,D21,W,2022-05-17 07:34:57,d273957a-d5b3-11ec-bf9c-aaf07dc5582f,2
20cc23709cc1ee2e354a,D21,W,2022-05-17 07:35:10,da3ccd8a-d5b3-11ec-9ef3-22fd69dbc8e5,2


**Delete model**

Delete the previous model after the new model has been created for each station.

In [0]:
# Ids of the previous models selected by the query above.
model_last_list = [row.model_id for row in df_trained_model_list.select('model_id').collect()]

# Attempt every deletion before failing, so one bad id does not block the rest.
failed_deletes = []
for model_id in model_last_list:
    res = requests.delete(API_DELETE.format(endpoint=ENDPOINT, model_id=model_id), headers=HEADERS, timeout=60)
    # 204 is the success code the original checked for. A 404 is treated as
    # "already deleted" so that this cell can be re-run after a partial failure.
    if res.status_code in (204, 404):
        print(model_id)
    else:
        failed_deletes.append((model_id, res.status_code, res.content))

if failed_deletes:
    raise RuntimeError(f"Error occurred. Failed to delete model(s): {failed_deletes}")