In [2]:
import sagemaker
import pandas as pd
import boto3

# Common name fragment used in the S3 keys and the job name built further down.
prefix = "telco-anomaly-demo"

# SageMaker session and the values read from it: default bucket and region.
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()
region = sagemaker_session.boto_region_name

# Execution role that is later passed to the processing job.
sagemaker_role = sagemaker.get_execution_role()

# boto3 S3 client bound to the session's region; used for the file upload below.
s3_client = boto3.client("s3", region_name=region)


In [3]:
from sagemaker.feature_store.feature_group import FeatureGroup

# Handle to the feature group named "5gcell-anomaly-features".
anomaly_features = FeatureGroup(
    name="5gcell-anomaly-features",
    sagemaker_session=sagemaker_session,
)

# Athena query helper for that feature group; it also exposes the table name.
query = anomaly_features.athena_query()
table_name = query.table_name

# Select every column and every row of the table.
query_string = f"""
SELECT * FROM "{table_name}"
"""

# S3 location handed to Athena as the output location of the query.
query_results_location = f"s3://{bucket}/{prefix}/data/query_results"
query.run(query_string=query_string, output_location=query_results_location)
query.wait()

# Materialise the query result locally and display it.
dataset = query.as_dataframe()
dataset

Unnamed: 0,health,accessibility,5g_users,contention_rate,utilization,downlink_throughput,uplink_throughput,anomaly,location_id,eventtime,write_time,api_invocation_time,is_deleted
0,0.98,0.98,0.094781,0.004762,0.379834,0.012964,6.626886e-02,1,BAGONGLIPUNANQCNCRN-403_4RFS_None,1.676152e+09,2023-02-11 21:54:19.976,2023-02-11 21:49:22.000,False
1,0.98,0.97,0.010247,0.000454,0.303867,0.000349,4.750786e-03,0,BALUL2N_402,1.676152e+09,2023-02-11 21:54:19.976,2023-02-11 21:49:22.000,False
2,0.98,1.00,0.003522,0.000227,0.279006,0.000009,3.479770e-03,0,BIASONGTLSAYN_402,1.676152e+09,2023-02-11 21:54:19.976,2023-02-11 21:49:22.000,False
3,1.00,1.00,0.095101,0.020635,0.378453,0.064496,2.750947e-02,1,BALUL3M_351,1.676152e+09,2023-02-11 21:54:19.976,2023-02-11 21:49:22.000,False
4,1.00,0.99,0.059878,0.006576,0.309392,0.011944,1.541491e-02,0,BARANGKAN_403_4RFS,1.676152e+09,2023-02-11 21:54:19.976,2023-02-11 21:49:22.000,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
512951,0.98,0.97,0.014089,0.000227,0.158840,0.003965,1.721933e-02,0,11AVER_402_4RFS,1.676148e+09,2023-02-11 20:40:41.618,2023-02-11 20:40:36.000,False
512952,0.94,0.89,0.002562,0.000000,0.139503,0.000354,6.687759e-04,0,BFCLASSICR_402_4RFS,1.676148e+09,2023-02-11 20:40:41.618,2023-02-11 20:40:36.000,False
512953,1.00,0.99,0.052834,0.001134,0.197514,0.006590,2.140727e-02,0,BFCLASSICR_402_4RFS,1.676148e+09,2023-02-11 20:40:41.618,2023-02-11 20:40:36.000,False
512954,1.00,1.00,0.003202,0.000000,0.146409,0.000470,6.154671e-04,0,BATTUNGBDGMALNCRR-401_4RFS_None,1.676148e+09,2023-02-11 20:40:41.618,2023-02-11 20:40:36.000,False


In [4]:
# Columns that are not used as model inputs: the label itself plus the
# identifier / timestamp / bookkeeping columns.
non_feature_columns = [
    "location_id",
    "anomaly",
    "eventtime",
    "write_time",
    "api_invocation_time",
    'is_deleted',
]
feature_columns = dataset.drop(columns=non_feature_columns).columns.tolist()

# Label first, then the features.
# NOTE(review): presumably the evaluation script expects the label in column 0 - confirm.
col_order = ["anomaly"] + feature_columns

# Seeded 80/20 split: sample the training rows, the remaining rows form the test set.
train_rows = dataset.sample(frac=0.80, random_state=0)
train = train_rows[col_order]
test = dataset.drop(index=train.index)[col_order]

In [5]:
import os

# Stage the test split locally before uploading it.
# to_csv does not create missing directories, so make sure "data/" exists
# (otherwise this cell fails on a fresh checkout).
os.makedirs("data", exist_ok=True)
test.to_csv("data/test.csv", index=False)

# Object key of the test set inside the session's default bucket.
key = f"{prefix}/data/xgboost/test.csv"

s3_client.upload_file(
    Filename="data/test.csv",
    Bucket=bucket,
    Key=key,
)

# Full S3 URI of the uploaded file; consumed by the evaluation job below.
test_s3_path = f"s3://{bucket}/{key}"
# This is the held-out test split, not the training data.
print(f"test data is uploaded to {test_s3_path}")

training data is uploaded to s3://sagemaker-us-west-2-987720697751/telco-anomaly-demo/data/xgboost/test.csv


In [6]:
from sagemaker.xgboost import XGBoostProcessor

# Processor used to run the evaluation script with the XGBoost 1.5-1 framework
# on a single ml.m5.xlarge instance.
# (An SKLearnProcessor with framework_version "0.23-1" was tried here before.)
xgb = XGBoostProcessor(
    sagemaker_session=sagemaker_session,
    role=sagemaker_role,
    base_job_name=f'{prefix}-eval',
    framework_version='1.5-1',
    instance_count=1,
    instance_type='ml.m5.xlarge',
)

In [10]:
from sagemaker.processing import ProcessingInput, ProcessingOutput

# Model archive produced by the pipeline's training job, placed under
# /opt/ml/processing/model inside the processing container.
model_input = ProcessingInput(
    source="s3://sagemaker-us-west-2-987720697751/telco-anomaly-pipeline/training_jobs/pipelines-242tyzqnmfdk-XgboostTrain-8XZgfg5DqD/output/model.tar.gz",
    destination="/opt/ml/processing/model",
)

# Test CSV uploaded earlier in this notebook.
test_input = ProcessingInput(
    source=test_s3_path,
    destination="/opt/ml/processing/input/test",
)

# Directory in the container whose contents are collected as the "evaluation" output.
evaluation_output = ProcessingOutput(
    output_name="evaluation",
    source="/opt/ml/processing/evaluation",
)

# Run the evaluation script with the model file name and cutoff threshold as arguments.
xgb.run(
    code="pipeline/model_eval.py",
    inputs=[model_input, test_input],
    outputs=[evaluation_output],
    arguments=["--model-file", "model.tar.gz", "--cutoff-threshold", "0.4"],
)

# Description of the most recent job started through this processor.
preprocessing_job_description = xgb.jobs[-1].describe()

INFO:sagemaker.processing:Uploaded None to s3://sagemaker-us-west-2-987720697751/telco-anomaly-demo-eval-2023-02-12-21-25-37-236/source/sourcedir.tar.gz
INFO:sagemaker.processing:runproc.sh uploaded to s3://sagemaker-us-west-2-987720697751/telco-anomaly-demo-eval-2023-02-12-21-25-37-236/source/runproc.sh
INFO:sagemaker:Creating processing-job with name telco-anomaly-demo-eval-2023-02-12-21-25-37-236


........................
Extracting the model
Load xgboost model....
Load input data......
Make predictions......
Accuracy: 0.9019212211597508
[[82191  4899]
 [ 5163 10338]]


In [9]:
# Show the description of the most recent job launched through the processor.
latest_job = xgb.jobs[-1]
latest_job.describe()

{'ProcessingInputs': [{'InputName': 'input-1',
   'AppManaged': False,
   'S3Input': {'S3Uri': 's3://sagemaker-us-west-2-987720697751/telco-anomaly-pipeline/training_jobs/pipelines-242tyzqnmfdk-XgboostTrain-8XZgfg5DqD/output/model.tar.gz',
    'LocalPath': '/opt/ml/processing/model',
    'S3DataType': 'S3Prefix',
    'S3InputMode': 'File',
    'S3DataDistributionType': 'FullyReplicated',
    'S3CompressionType': 'None'}},
  {'InputName': 'input-2',
   'AppManaged': False,
   'S3Input': {'S3Uri': 's3://sagemaker-us-west-2-987720697751/telco-anomaly-demo/data/xgboost/test.csv',
    'LocalPath': '/opt/ml/processing/input/test',
    'S3DataType': 'S3Prefix',
    'S3InputMode': 'File',
    'S3DataDistributionType': 'FullyReplicated',
    'S3CompressionType': 'None'}},
  {'InputName': 'code',
   'AppManaged': False,
   'S3Input': {'S3Uri': 's3://sagemaker-us-west-2-987720697751/telco-anomaly-demo-eval-2023-02-12-21-15-51-931/source/sourcedir.tar.gz',
    'LocalPath': '/opt/ml/processing/inpu

In [42]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-1.6.2-py3-none-manylinux2014_x86_64.whl (255.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.9/255.9 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: xgboost
Successfully installed xgboost-1.6.2
[0m

In [57]:
# Import under the full module name: `xgb` is already bound to the
# XGBoostProcessor created earlier in this notebook, and rebinding it to the
# library would break `xgb.run(...)` / `xgb.jobs[-1]` when those cells are re-run.
import xgboost

# Re-read the test split that was written to disk earlier (label + features).
data = pd.read_csv("data/test.csv")

# Keep the features under their own name instead of overwriting `test`:
# `test` is the frame that gets written to data/test.csv, and replacing it with
# a label-free frame would corrupt that file if the upload cell is re-run.
test_features = data.drop("anomaly", axis=1)

# Ground-truth labels as a single-column DataFrame.
actual = pd.DataFrame(data["anomaly"])

# Feature matrix in XGBoost's DMatrix format, used for prediction below.
dtest = xgboost.DMatrix(test_features)

In [46]:
# Copy the model archive (the same S3 object that is passed to the evaluation job above)
# into the current working directory.
!aws s3 cp s3://sagemaker-us-west-2-987720697751/telco-anomaly-pipeline/training_jobs/pipelines-242tyzqnmfdk-XgboostTrain-8XZgfg5DqD/output/model.tar.gz .

download: s3://sagemaker-us-west-2-987720697751/telco-anomaly-pipeline/training_jobs/pipelines-242tyzqnmfdk-XgboostTrain-8XZgfg5DqD/output/model.tar.gz to ./model.tar.gz


In [47]:
# Unpack the downloaded archive into the current working directory.
# NOTE(review): presumably this yields the "xgboost-model" file opened in the next cell - confirm.
!tar -xf model.tar.gz

In [49]:
import pickle

# NOTE(review): pickle.load can execute arbitrary code contained in the file,
# so only unpickle model artifacts that come from a trusted source.
with open("xgboost-model", "rb") as model_file:
    booster = pickle.load(model_file)

  If you are loading a serialized model (like pickle in Python, RDS in R) generated by
  older XGBoost, please export the model by calling `Booster.save_model` from that version
  first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html

  for more details about differences between saving model and serializing.

  If you are loading a serialized model (like pickle in Python, RDS in R) generated by
  older XGBoost, please export the model by calling `Booster.save_model` from that version
  first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html

  for more details about differences between saving model and serializing.



In [58]:
# Run the loaded booster on the test matrix and display the raw prediction array.
predictions = booster.predict(dtest)
predictions

array([9.6791786e-01, 9.8887736e-01, 4.1426792e-05, ..., 2.3674622e-06,
       1.9229356e-03, 2.5797275e-01], dtype=float32)