In [1]:
from pyalink.alink import *
# Drop any previously configured PyAlink environment so this cell can be
# re-run safely, then start a local environment with parallelism 1.
resetEnv()
useLocalEnv(parallelism=1, config=None)


Use one of the following commands to start using PyAlink:
 - useLocalEnv(parallelism, flinkHome=None, config=None)
 - useRemoteEnv(host, port, parallelism, flinkHome=None, localIp="localhost", config=None)
Call resetEnv() to reset environment and switch to another.

JVM listening on 127.0.0.1:64158


MLEnv(benv=<pyflink.dataset.execution_environment.ExecutionEnvironment object at 0x120796f60>, btenv=<pyflink.table.table_environment.BatchTableEnvironment object at 0x111e94dd8>, senv=<pyflink.datastream.stream_execution_environment.StreamExecutionEnvironment object at 0x120796d68>, stenv=<pyflink.table.table_environment.StreamTableEnvironment object at 0x1208069b0>)

# 准备数据

In [2]:
# Column layout shared by the train and test CSV files ("name type" pairs).
# The literal is kept exactly as written: the backslash continuations are
# inside the string, so the indentation of the following lines is part of it.
schema = "age bigint, workclass string, fnlwgt bigint, education string, \
          education_num bigint, marital_status string, occupation string, \
          relationship string, race string, sex string, capital_gain bigint, \
          capital_loss bigint, hours_per_week bigint, native_country string, label string"

# Batch source reading the training CSV.
adult_batch = (
    CsvSourceBatchOp()
    .setFilePath("https://alink-release.oss-cn-beijing.aliyuncs.com/data-files/adult_train.csv")
    .setSchemaStr(schema)
)

# Stream source reading the test CSV, parsed with the same schema.
adult_stream = (
    CsvSourceStreamOp()
    .setFilePath("https://alink-release.oss-cn-beijing.aliyuncs.com/data-files/adult_test.csv")
    .setSchemaStr(schema)
)

# 特征建模

In [3]:
# String-typed columns that are fed to the one-hot encoder.
categoricalColNames = [
    "workclass", "education", "marital_status", "occupation",
    "relationship", "race", "sex", "native_country",
]
# bigint-typed columns that are passed through unchanged.
numerialColNames = [
    "age", "fnlwgt", "education_num",
    "capital_gain", "capital_loss", "hours_per_week",
]

# Encode the categorical columns into the single column "output"; the numeric
# columns and the label are reserved so the next stage can still read them.
onehot = (
    OneHotEncoder()
    .setSelectedCols(categoricalColNames)
    .setOutputCols(["output"])
    .setReservedCols([*numerialColNames, "label"])
)

# Assemble the encoded column and the numeric columns into the column "vec",
# keeping only the label alongside it.
assembler = (
    VectorAssembler()
    .setSelectedCols(["output", *numerialColNames])
    .setOutputCol("vec")
    .setReservedCols(["label"])
)

# Feature-engineering pipeline: encoding first, then vector assembly.
pipeline = Pipeline().add(onehot).add(assembler)

# 训练+预测+评估

In [4]:
# Logistic regression over the assembled feature vector "vec"; predictions go
# to "pred" and the per-class detail to "detail".
logistic = (
    LogisticRegression()
    .setVectorCol("vec")
    .setLabelCol("label")
    .setPredictionCol("pred")
    .setPredictionDetailCol("detail")
)

# Build a fresh pipeline from the individual stages instead of calling
# pipeline.add(logistic): add() appends to the shared `pipeline` object, so
# re-running this cell would stack one more classifier stage on every run.
model = Pipeline().add(onehot).add(assembler).add(logistic).fit(adult_batch)

# NOTE: predictions are made on adult_batch, the same data the model was
# fitted on, so the metrics collected below are training-set metrics.
predictBatch = model.transform(adult_batch)

# Binary-classification evaluation driven by the label and detail columns.
metrics = (
    EvalBinaryClassBatchOp()
    .setLabelCol("label")
    .setPredictionDetailCol("detail")
    .linkFrom(predictBatch)
    .collectMetrics()
)

# 输出评估结果

In [5]:
# (printed label, getter name on `metrics`) in the order they are reported.
metric_getters = [
    ("AUC:", "getAuc"),
    ("KS:", "getKs"),
    ("PRC:", "getPrc"),
    ("Precision:", "getPrecision"),
    ("Recall:", "getRecall"),
    ("F1:", "getF1"),
    ("ConfusionMatrix:", "getConfusionMatrix"),
    ("LabelArray:", "getLabelArray"),
    ("LogLoss:", "getLogLoss"),
    ("TotalSamples:", "getTotalSamples"),
    ("ActualLabelProportion:", "getActualLabelProportion"),
    ("ActualLabelFrequency:", "getActualLabelFrequency"),
    ("Accuracy:", "getAccuracy"),
    ("Kappa:", "getKappa"),
]

# Each getter is looked up and called only when its line is printed, so the
# sequence of calls and output matches one print statement per metric.
for label, getter_name in metric_getters:
    print(label, getattr(metrics, getter_name)())

AUC: 0.9066240193960077
KS: 0.6495268264606959
PRC: 0.7662328278289783
Precision: 0.733230531996916
Recall: 0.6064277515623008
F1: 0.6638280050258272
ConfusionMatrix: [[4755, 1730], [3086, 22990]]
LabelArray: ['>50K', '<=50K']
LogLoss: 0.3192012545654014
TotalSamples: 32561
ActualLabelProportion: [0.2408095574460244, 0.7591904425539756]
ActualLabelFrequency: [7841, 24720]
Accuracy: 0.8520929946868954
Kappa: 0.5701036372627706
