## 本期的SDK的主要功能包括：
- 封装了目前PaiFlow的后端接口，用户可以通过Session对象暴露的接口进行操作。
- 用户使用Pipeline/Run class，在SDK端完成Pipeline的拼接、任务提交和运行查看。

- 对于常用的算法组件的封装: 进行中



In [None]:
import sys
# Put a directory from the author's machine first on sys.path (presumably a
# local checkout of the SDK, so that `import pai` picks it up — confirm).
# The path is machine-specific and has to be adjusted on any other host.
sys.path.insert(0, "/Users/liangquan/code/pypai")

In [None]:
import pai
print(pai.__file__)

In [None]:



import time
import yaml
from pai import Session, Pipeline, RunInstance, ProviderAlibabaPAI
from pai.pipeline.parameter import ParameterType

# Default environment parameters: if `env` is None at run time, the
# session's default_env is used.
default_env = {
    "resource": {
        "compute": {
            "max_compute": {
                "accessKey": "3AccessKeySecret",
                "accessId": "AccessKeyId",
                "endpoint": "http://service.cn-shanghai.maxcompute.aliyun.com/api",
                "logViewHost": "http://logview.odps.aliyun.com",
                "odpsProject": "wyl_test",
            },
        },
    },
}

# Alibaba Cloud account access-key (AK) information.
config = {
    "access_key_id": "AccessKeyId",
    "access_key_secret": "3AccessKeySecret",
    "region_id": "cn-shanghai",
    "odps_project": "wyl_test",
}

In [None]:
# Create the SDK session from the account and region settings in `config`.
session = Session(**config)

用户通过search_pipeline搜索服务端的Pipeline
- 模糊搜索PAI提供的，identifier包含`ODPS`的Pipeline

In [None]:
# Fuzzy search (fuzzy=True) for ProviderAlibabaPAI pipelines matching the
# identifier "ODPS", with page_size=100. The result unpacks into the returned
# pipelines and a total count.
pipelines, total_count = session.search_pipeline(identifier="ODPS", fuzzy=True, provider=ProviderAlibabaPAI, page_size=100)

In [None]:
for pipeline in pipelines:
    print(pipeline["Identifier"], pipeline["Version"], pipeline["Provider"], pipeline["PipelineId"])

In [None]:
print(total_count)

In [None]:
# Search again without an identifier filter (provider only), then print the
# identifier, version, provider and pipeline id of every returned entry.
pipelines, total_count = session.search_pipeline(provider=ProviderAlibabaPAI, page_size=100)
for pipeline in pipelines:
    print(pipeline["Identifier"], pipeline["Version"], pipeline["Provider"], pipeline["PipelineId"])

- session可以通过具体的PipelineID拉取Pipeline的信息
- 也可以通过(identifier, provider, version)的3元组信息拉取

In [None]:
session.get_pipeline_by_id(pipelines[0]["PipelineId"])

In [None]:
session.get_pipeline("evaluate-xflow-maxCompute", version="v1", provider=ProviderAlibabaPAI)

- 用户直接运行从服务端拉取的Pipeline


In [None]:
# Look up the data-source pipeline by its (identifier, provider, version)
# triple and keep its "PipelineId" field for submitting a run.
pipeline_info = session.get_pipeline(identifier="dataSource-xflow-maxCompute", provider=ProviderAlibabaPAI, version="v1")
pipeline_id = pipeline_info["PipelineId"]

In [None]:
# Run arguments: an "execution" map plus the name of the table to read.
arguments = {
    "parameters": [
        {
            "name": "execution",
            "value": {
                "odpsInfoFile": "/share/base/odpsInfo.ini",
                "endpoint": "http://service.cn-shanghai.maxcompute.aliyun.com/api",
                "logViewHost": "http://logview.odps.aliyun.com",
                "odpsProject": "wyl_test",
            },
        },
        {"name": "tableName", "value": "pai_online_project.wumai_data"},
    ],
}

In [None]:
# Submit a run named "demo_paiflow" for the pipeline identified by
# pipeline_id, passing the arguments built earlier; the call's result is
# stored as run_id.
# NOTE(review): no_confirm_required=True presumably skips a confirmation
# step on submission — confirm against the SDK.
run_id = session.create_pipeline_run("demo_paiflow",
                                      pipeline_id=pipeline_id,
                                      arguments=arguments,
                                      no_confirm_required=True)

In [None]:
# Wrap the submitted run's id in a RunInstance bound to the same session.
run = RunInstance(run_id=run_id, session=session)

### 用户构造复合的Pipeline，推送到服务端保存，或是直接运行

In [None]:
def create_simple_composite_pipeline(session):
    """Build a two-step pipeline: a data source feeding a type transform.

    The pipeline is created under the identifier
    "demo-data-source-type-transform" with a version string of the form
    ``v<current epoch time in milliseconds>``. It declares three required
    inputs (``execution``, ``cols_to_double``, ``table_name``), wires a
    "dataSource" step into a "typeTransform" step, and exposes the transform
    step's ``outputArtifact`` as the pipeline's ``outputArtifact``.

    Args:
        session: Session handed to ``Pipeline.new_pipeline``. According to
            the original author's note it is used later to fetch templates
            from the server and carries the Alibaba Cloud account info.

    Returns:
        The assembled pipeline object.
    """
    pipeline_version = "v%d" % int(time.time() * 1000)

    # Start from an empty pipeline template with its name and version.
    pipeline = Pipeline.new_pipeline(
        identifier="demo-data-source-type-transform",
        version=pipeline_version,
        session=session,
    )

    # Pipeline-level inputs.
    execution = pipeline.create_input_parameter("execution", "map", required=True)
    cols_to_double = pipeline.create_input_parameter("cols_to_double", str, required=True)
    table_name = pipeline.create_input_parameter("table_name", str, required=True)

    # Step 1: data source, fed directly from the pipeline inputs.
    source_step = pipeline.create_step(
        "dataSource-xflow-maxCompute",
        provider=ProviderAlibabaPAI,
        version="v1",
        name="dataSource",
    )
    source_step.set_arguments(execution=execution, tableName=table_name)

    # Step 2: type transform, consuming the data source's output artifact.
    transform_step = pipeline.create_step(
        "type-transform-xflow-maxCompute",
        provider=ProviderAlibabaPAI,
        version="v1",
        name="typeTransform",
    )
    transform_step.set_arguments(
        inputArtifact=source_step.outputs["outputArtifact"],
        execution=execution,
        outputTable="pai_temp_123455677_18188283",
        cols_to_double=cols_to_double,
    )

    # Pipeline output and the step output it is sourced from.
    pipeline.create_output_artifact(
        "outputArtifact", transform_step.outputs["outputArtifact"]
    )
    return pipeline

In [None]:
p = create_simple_composite_pipeline(session)

In [None]:
p.dot()

In [None]:
p.to_dict()

In [None]:
def args_for_composite_pipeline_1():
    """Return the run arguments and environment for the composite pipeline.

    Returns:
        A ``(arguments, env)`` tuple. ``arguments`` maps the input names
        ``execution``, ``cols_to_double`` and ``table_name`` to their values;
        ``env`` is always ``None``.
    """
    execution = {
        "accessKey": "3AccessKeySecret",
        "accessId": "AccessKeyId",
        "endpoint": "http://service.cn-shanghai.maxcompute.aliyun.com/api",
        "logViewHost": "http://logview.odps.aliyun.com",
        "odpsProject": "wyl_test",
    }
    run_arguments = {
        "execution": execution,
        "cols_to_double": "time,hour,pm2,pm10,so2,co,no2",
        "table_name": "pai_online_project.wumai_data",
    }
    return run_arguments, None

提交Pipeline运行有两种方案：

- 提交Pipeline的Manifest直接创建一个Run运行任务
- 上传Pipeline Manifest到服务端，获得PipelineID之后，通过指定PipelineID运行

In [None]:
arguments, env = args_for_composite_pipeline_1()
# With wait=False, run() returns right away.
run_instance = p.run("demo_temp_pipeline_run", arguments=arguments, wait=False, env=env)

In [None]:
run_instance.get_status()

In [None]:
run_instance.wait(log_outputs=True)

In [None]:
run_instance.get_outputs()

In [None]:
session.list_pipeline_run(page_size=2)

## 一个复杂一点的复合Pipeline的构造

In [None]:
def create_air_quality_prediction(session):
    """Build the "ut-air-quality" composite pipeline (version "v1.0.0").

    The pipeline declares 29 required input parameters and wires these steps:

    * dataSource -> typeTransform -> histogram
    * typeTransform -> sql -> feMetaRunner
    * sql -> normalize -> split
    * split (output 1) -> randomforests -> prediction1 -> evaluate1
    * split (output 1) -> logisticregression -> prediction2 -> evaluate2

    Both prediction steps read their data set from the split step's second
    output. The pipeline output "predictionResult" is taken from the
    ``outputDetailArtifact`` output of the evaluate2 step.

    Args:
        session: Session handed to ``Pipeline.new_pipeline``.

    Returns:
        The assembled pipeline object.
    """
    pipeline = Pipeline.new_pipeline("ut-air-quality", version="v1.0.0", session=session)

    # (name, type) of every required input, in declaration order.
    input_specs = (
        ("execution", "map"),
        ("cols_to_double", str),
        ("histogram_selected_col_names", str),
        ("sql", str),
        ("normalize_selected_col_names", str),
        ("fraction", float),
        ("randomforest_feature_col_names", str),
        ("randomforest_label_col_names", str),
        ("prediction1_feature_col_names", str),
        ("prediction1_append_col_names", str),
        ("prediction1_result_col_names", str),
        ("prediction1_score_col_names", str),
        ("prediction1_detail_col_names", str),
        ("evaluate1_label_col_name", str),
        ("evaluate1_score_col_name", str),
        ("evaluate1_positive_label", int),
        ("evaluate1_bin_count", int),
        ("logisticregression_feature_col_names", str),
        ("logisticregression_label_col_names", str),
        ("logisticregression_good_value", int),
        ("prediction2_feature_col_names", str),
        ("prediction2_append_col_names", str),
        ("prediction2_result_col_names", str),
        ("prediction2_score_col_names", str),
        ("prediction2_detail_col_names", str),
        ("evaluate2_label_col_name", str),
        ("evaluate2_score_col_name", str),
        ("evaluate2_positive_label", int),
        ("evaluate2_bin_count", int),
    )
    inputs = {}
    for param_name, param_type in input_specs:
        inputs[param_name] = pipeline.create_input_parameter(
            param_name, param_type, required=True
        )
    execution = inputs["execution"]

    # --- Data loading and preparation -------------------------------------
    source = pipeline.create_step(
        "dataSource-xflow-maxCompute", provider=ProviderAlibabaPAI, name="dataSource"
    )
    source.set_arguments(
        execution=execution,
        tableName="pai_online_project.wumai_data",
    )

    transform = pipeline.create_step(
        "type-transform-xflow-maxCompute", provider=ProviderAlibabaPAI, name="typeTransform"
    )
    transform.set_arguments(
        inputArtifact=source.outputs["outputArtifact"],
        execution=execution,
        # NOTE(review): MaxCompute table names normally allow only letters,
        # digits and underscores — confirm this hyphenated value is accepted.
        outputTable="type-transform-xflow-maxCompute",
        cols_to_double=inputs["cols_to_double"],
    )

    histogram = pipeline.create_step(
        "histogram-xflow-maxCompute", provider=ProviderAlibabaPAI, name="histogram"
    )
    histogram.set_arguments(
        inputArtifact=transform.outputs["outputArtifact"],
        execution=execution,
        outputTableName="pai_temp_172808_1779985_1",
        selectedColNames=inputs["histogram_selected_col_names"],
    )

    sql = pipeline.create_step(
        "sql-xflow-maxCompute", provider=ProviderAlibabaPAI, name="sql"
    )
    sql.set_arguments(
        inputArtifact1=transform.outputs["outputArtifact"],
        execution=execution,
        outputTable="pai_temp_83935_1099579_1",
        sql=inputs["sql"],
    )

    fe_meta_runner = pipeline.create_step(
        "fe-meta-runner-xflow-maxCompute", provider=ProviderAlibabaPAI, name="feMetaRunner"
    )
    fe_meta_runner.set_arguments(
        inputArtifact=sql.outputs["outputArtifact"],
        execution=execution,
        outputTable="pai_temp_83935_1099581_1",
        mapTable="pai_temp_83935_1099581_2",
        selectedCols="pm10,so2,co,no2",
        labelCol="_c2",
    )

    normalize = pipeline.create_step(
        "normalize-xflow-maxCompute", provider=ProviderAlibabaPAI, name="normalize"
    )
    normalize.set_arguments(
        inputArtifact=sql.outputs["outputArtifact"],
        execution=execution,
        outputTableName="pai_temp_83935_1099582_1",
        outputParaTableName="pai_temp_83935_1099582_2",
        selectedColNames=inputs["normalize_selected_col_names"],
    )

    split = pipeline.create_step(
        "split-xflow-maxCompute", provider=ProviderAlibabaPAI, name="split"
    )
    split.set_arguments(
        inputArtifact=normalize.outputs["outputArtifact"],
        execution=execution,
        output1TableName="pai_temp_83935_1099583_1",
        fraction=inputs["fraction"],
        output2TableName="pai_temp_83935_1199583_1",
    )

    # --- Branch 1: random forests -> prediction -> evaluation --------------
    random_forests = pipeline.create_step(
        "randomforests-xflow-maxCompute", provider=ProviderAlibabaPAI, name="randomforests"
    )
    random_forests.set_arguments(
        inputArtifact=split.outputs["outputArtifact1"],
        execution=execution,
        featureColNames=inputs["randomforest_feature_col_names"],
        labelColName=inputs["randomforest_label_col_names"],
        treeNum=100,
        modelName="xlab_m_random_forests_1099584_v0",
    )

    prediction1 = pipeline.create_step(
        "prediction-xflow-maxCompute", provider=ProviderAlibabaPAI, name="prediction1"
    )
    prediction1.set_arguments(
        inputModelArtifact=random_forests.outputs["outputArtifact"],
        inputDataSetArtifact=split.outputs["outputArtifact2"],
        execution=execution,
        outputTableName="pai_temp_83935_1029583_1",
        featureColNames=inputs["prediction1_feature_col_names"],
        appendColNames=inputs["prediction1_append_col_names"],
        resultColName=inputs["prediction1_result_col_names"],
        scoreColName=inputs["prediction1_score_col_names"],
        detailColName=inputs["prediction1_detail_col_names"],
    )

    evaluate1 = pipeline.create_step(
        "evaluate-xflow-maxCompute", provider=ProviderAlibabaPAI, name="evaluate1"
    )
    evaluate1.set_arguments(
        inputArtifact=prediction1.outputs["outputArtifact"],
        execution=execution,
        outputDetailTableName="pai_temp_83935_1099586_1",
        outputMetricTableName="pai_temp_83935_1228529_1",
        outputELDetailTableName="pai_temp_83935_1299589_1",
        labelColName=inputs["evaluate1_label_col_name"],
        scoreColName=inputs["evaluate1_score_col_name"],
        positiveLabel=inputs["evaluate1_positive_label"],
        binCount=inputs["evaluate1_bin_count"],
    )

    # --- Branch 2: logistic regression -> prediction -> evaluation ---------
    logistic = pipeline.create_step(
        "logisticregression-binary-xflow-maxCompute",
        provider=ProviderAlibabaPAI,
        name="logisticregression",
    )
    logistic.set_arguments(
        inputArtifact=split.outputs["outputArtifact1"],
        execution=execution,
        modelName="xlab_m_logisticregres_1099587_v0",
        featureColNames=inputs["logisticregression_feature_col_names"],
        labelColName=inputs["logisticregression_label_col_names"],
        goodValue=inputs["logisticregression_good_value"],
    )

    prediction2 = pipeline.create_step(
        "prediction-xflow-maxCompute", provider=ProviderAlibabaPAI, name="prediction2"
    )
    prediction2.set_arguments(
        inputModelArtifact=logistic.outputs["outputArtifact"],
        inputDataSetArtifact=split.outputs["outputArtifact2"],
        execution=execution,
        outputTableName="pai_temp_83935_1099588_1",
        featureColNames=inputs["prediction2_feature_col_names"],
        appendColNames=inputs["prediction2_append_col_names"],
        resultColName=inputs["prediction2_result_col_names"],
        scoreColName=inputs["prediction2_score_col_names"],
        detailColName=inputs["prediction2_detail_col_names"],
    )

    evaluate2 = pipeline.create_step(
        "evaluate-xflow-maxCompute", provider=ProviderAlibabaPAI, name="evaluate2"
    )
    evaluate2.set_arguments(
        inputArtifact=prediction2.outputs["outputArtifact"],
        execution=execution,
        outputDetailTableName="pai_temp_83935_1099589_1",
        outputMetricTableName="pai_temp_83935_1428529_1",
        outputELDetailTableName="pai_temp_83935_1199589_1",
        labelColName=inputs["evaluate2_label_col_name"],
        scoreColName=inputs["evaluate2_score_col_name"],
        positiveLabel=inputs["evaluate2_positive_label"],
        binCount=inputs["evaluate2_bin_count"],
    )

    # Pipeline output, sourced from the second evaluation step.
    pipeline.create_output_artifact(
        "predictionResult", from_=evaluate2.outputs["outputDetailArtifact"]
    )
    return pipeline


In [None]:
air_quality_pl = create_air_quality_prediction(session)

In [None]:
air_quality_pl.dot()

In [None]:
Pipeline