In [1]:
from pyalink.alink import *
# Reset any previously selected pyalink environment before choosing a new one,
# so this cell can be executed again without restarting the kernel.
resetEnv()
# Start a local environment with parallelism 1; flinkHome and config keep
# their default value of None.
useLocalEnv(parallelism=1)


Use one of the following command to start using pyalink:
使用以下一条命令来开始使用 pyalink：
 - useLocalEnv(parallelism, flinkHome=None, config=None)
 - useRemoteEnv(host, port, parallelism, flinkHome=None, localIp="localhost", config=None)
Call resetEnv() to reset environment and switch to another.
使用 resetEnv() 来重置运行环境，并切换到另一个。

JVM listening on 127.0.0.1:57247


JavaObject id=o6

# 准备数据

In [2]:
# Column names and types used for both the adult_train.csv and adult_test.csv
# sources below. The literal is kept exactly as written (including the
# continuation-line whitespace) because it is passed verbatim to setSchemaStr.
schema = "age bigint, workclass string, fnlwgt bigint, education string, \
          education_num bigint, marital_status string, occupation string, \
          relationship string, race string, sex string, capital_gain bigint, \
          capital_loss bigint, hours_per_week bigint, native_country string, label string"

# Batch source over the training CSV.
adult_batch = (
    CsvSourceBatchOp()
    .setFilePath("https://alink-release.oss-cn-beijing.aliyuncs.com/data-files/adult_train.csv")
    .setSchemaStr(schema)
)

# Stream source over the test CSV, parsed with the same schema.
adult_stream = (
    CsvSourceStreamOp()
    .setFilePath("https://alink-release.oss-cn-beijing.aliyuncs.com/data-files/adult_test.csv")
    .setSchemaStr(schema)
)

# 特征建模

In [3]:
# String-typed columns that are fed to the one-hot encoder.
categoricalColNames = [
    "workclass",
    "education",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native_country",
]
# Numeric (bigint) columns; they are carried through the encoder as reserved
# columns and then assembled into the feature vector.
numerialColNames = [
    "age",
    "fnlwgt",
    "education_num",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
]

# Stage 1: one-hot encode the categorical columns into the "output" column,
# reserving the numeric columns and the label.
onehot = (
    OneHotEncoder()
    .setSelectedCols(categoricalColNames)
    .setOutputCol("output")
    .setReservedCols(numerialColNames + ["label"])
)

# Stage 2: assemble the encoded column and the numeric columns into "vec",
# reserving only the label.
assembler = (
    VectorAssembler()
    .setSelectedCols(["output"] + numerialColNames)
    .setOutputCol("vec")
    .setReservedCols(["label"])
)

# Feature-engineering pipeline: encoder followed by assembler.
pipeline = Pipeline().add(onehot).add(assembler)

# 训练+预测+评估

In [4]:
# Logistic regression over the assembled "vec" column with "label" as the
# target; the prediction goes to "pred" and the prediction detail to "detail".
logistic = (
    LogisticRegression()
    .setVectorCol("vec")
    .setLabelCol("label")
    .setPredictionCol("pred")
    .setPredictionDetailCol("detail")
)

# Build a fresh pipeline here instead of calling pipeline.add(logistic):
# adding to the shared `pipeline` object would append one more classifier
# stage every time this cell is re-run, making the cell non-idempotent.
training_pipeline = Pipeline().add(onehot).add(assembler).add(logistic)
model = training_pipeline.fit(adult_batch)

# EvalBinaryClassBatchOp is a batch operator, so it has to be linked from a
# batch result; transforming the stream source (adult_stream) would hand it a
# stream result instead.
# NOTE(review): this scores the training data, which appears to be what the
# recorded metrics below were computed on (TotalSamples matches) -- confirm.
# For a held-out evaluation, load adult_test.csv with CsvSourceBatchOp and
# transform that instead.
predictBatch = model.transform(adult_batch)

# Collect binary-classification metrics from the label and detail columns.
metrics = (
    EvalBinaryClassBatchOp()
    .setLabelCol("label")
    .setPredictionDetailCol("detail")
    .linkFrom(predictBatch)
    .collectMetrics()
)

# 输出评估结果

In [5]:
# (caption, getter name) pairs, in the order the metrics are reported.
# Each getter is looked up and called right before its line is printed, so the
# output and the call order are the same as one print statement per metric.
for caption, getter_name in (
    ("AUC:", "getAuc"),
    ("KS:", "getKs"),
    ("PRC:", "getPrc"),
    ("Precision:", "getPrecision"),
    ("Recall:", "getRecall"),
    ("F1:", "getF1"),
    ("ConfusionMatrix:", "getConfusionMatrix"),
    ("LabelArray:", "getLabelArray"),
    ("LogLoss:", "getLogLoss"),
    ("TotalSamples:", "getTotalSamples"),
    ("ActualLabelProportion:", "getActualLabelProportion"),
    ("ActualLabelFrequency:", "getActualLabelFrequency"),
    ("Accuracy:", "getAccuracy"),
    ("Kappa:", "getKappa"),
):
    print(caption, getattr(metrics, getter_name)())

AUC: 0.9071346253140332
KS: 0.6508855101121852
PRC: 0.7654668375809972
Precision: 0.7311696264543784
Recall: 0.609105981379926
F1: 0.6645794197453558
ConfusionMatrix: [[4776, 1756], [3065, 22964]]
LabelArray: ['>50K', '<=50K']
LogLoss: 0.31880016560096547
TotalSamples: 32561
ActualLabelProportion: [0.2408095574460244, 0.7591904425539756]
ActualLabelFrequency: [7841, 24720]
Accuracy: 0.8519394367494856
Kappa: 0.5705912048680206
