In [None]:
from pyalink.alink import *
# Start the local Alink environment before any operator is created.
# NOTE(review): the argument 1 is presumably the parallelism -- confirm against the PyAlink docs.
useLocalEnv(1)

from utils import *
import os
import pandas as pd

# Keep long cell values (up to 1000 characters) from being truncated when pandas renders them.
pd.set_option("display.max_colwidth", 1000)

# --- Data locations -------------------------------------------------------
# ROOT_DIR is not defined in this notebook; it is expected to arrive through
# one of the star imports above. It is concatenated as-is, so it must already
# end with a path separator.
DATA_DIR = ROOT_DIR + "iris" + os.sep

ORIGIN_FILE = "iris.data"
TRAIN_FILE = "train.ak"
TEST_FILE = "test.ak"

# --- Column layout of the raw file ---------------------------------------
SCHEMA_STRING = (
    "sepal_length double, sepal_width double, "
    "petal_length double, petal_width double, "
    "category string"
)

FEATURE_COL_NAMES = [
    "sepal_length",
    "sepal_width",
    "petal_length",
    "petal_width",
]
LABEL_COL_NAME = "category"

# --- Output columns written by the predictors -----------------------------
PREDICTION_COL_NAME = "pred"
PRED_DETAIL_COL_NAME = "pred_info"


In [None]:
#c_1
# Read the raw CSV file using the explicit column schema.
source = (
    CsvSourceBatchOp()
    .setFilePath(DATA_DIR + ORIGIN_FILE)
    .setSchemaStr(SCHEMA_STRING)
)

# Register a 5-row preview and summary statistics on the source, then feed
# the same data into a correlation operator over the four feature columns.
previewed_source = (
    source
    .lazyPrint(5, "origin file")
    .lazyPrintStatistics("stat of origin file")
)
correlation_op = (
    CorrelationBatchOp()
    .setSelectedCols(FEATURE_COL_NAMES)
    .lazyPrintCorrelation()
)
previewed_source.link(correlation_op)

# Number of rows per label value.
# NOTE(review): -1 presumably means "print every row" -- confirm against the PyAlink docs.
label_counts = source.groupBy(LABEL_COL_NAME, LABEL_COL_NAME + ", COUNT(*) AS cnt")
label_counts.lazyPrint(-1)

# Run the job so the lazy prints registered above are produced.
BatchOperator.execute()

# Helper not defined in this notebook (comes from a star import).
# NOTE(review): judging by its name it writes the train/test files only when
# they are missing, with 0.9 as the train fraction -- verify in its definition.
splitTrainTestIfNotExist(source, DATA_DIR + TRAIN_FILE, DATA_DIR + TEST_FILE, 0.9);


In [None]:
#c_2
# Load the train/test splits stored in .ak files.
train_data = AkSourceBatchOp().setFilePath(DATA_DIR + TRAIN_FILE)
test_data = AkSourceBatchOp().setFilePath(DATA_DIR + TEST_FILE)

# Naive Bayes training operator over the four feature columns.
trainer = (
    NaiveBayesTrainBatchOp()
    .setFeatureCols(FEATURE_COL_NAMES)
    .setLabelCol(LABEL_COL_NAME)
)

# Prediction operator; writes both the predicted label and the detail column.
predictor = (
    NaiveBayesPredictBatchOp()
    .setPredictionCol(PREDICTION_COL_NAME)
    .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
)

# Wire the graph: train data -> trainer, then (model, test data) -> predictor.
train_data.link(trainer)
predictor.linkFrom(trainer, test_data)

# Register the lazy outputs: model summary first, then one sample prediction row.
trainer.lazyPrintModelInfo()
predictor.lazyPrint(1, "< Prediction >")

# Multi-class evaluation of the predictions, reported under the title "NaiveBayes".
evaluator = (
    EvalMultiClassBatchOp()
    .setLabelCol(LABEL_COL_NAME)
    .setPredictionCol(PREDICTION_COL_NAME)
    .setPredictionDetailCol(PRED_DETAIL_COL_NAME)
    .lazyPrintMetrics("NaiveBayes")
)
predictor.link(evaluator)

BatchOperator.execute();

In [None]:
#c_3
# Load the train/test splits stored in .ak files.
train_data = AkSourceBatchOp().setFilePath(DATA_DIR + TRAIN_FILE)
test_data = AkSourceBatchOp().setFilePath(DATA_DIR + TEST_FILE)

# Each base classifier is wrapped in OneVsRest and evaluated the same way;
# only the classifier class and the metrics title differ. The pipelines are
# built in the listed order.
for classifier_cls, metrics_title in (
    (LogisticRegression, "OneVsRest_LogisticRegression"),
    (GbdtClassifier, "OneVsRest_GBDT"),
    (LinearSvm, "OneVsRest_LinearSvm"),
):
    # Fit the one-vs-rest model on the train split and score the test split.
    ovr_predictions = (
        OneVsRest()
        .setClassifier(
            classifier_cls()
            .setFeatureCols(FEATURE_COL_NAMES)
            .setLabelCol(LABEL_COL_NAME)
            .setPredictionCol(PREDICTION_COL_NAME)
        )
        .setNumClass(3)
        .fit(train_data)
        .transform(test_data)
    )

    # Multi-class metrics for this classifier, printed under its own title.
    ovr_evaluator = (
        EvalMultiClassBatchOp()
        .setLabelCol(LABEL_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME)
        .lazyPrintMetrics(metrics_title)
    )
    ovr_predictions.link(ovr_evaluator)

BatchOperator.execute()

In [None]:
#c_4
# Load the train/test splits stored in .ak files.
train_data = AkSourceBatchOp().setFilePath(DATA_DIR + TRAIN_FILE)
test_data = AkSourceBatchOp().setFilePath(DATA_DIR + TEST_FILE)

# Softmax classifier with lazy printing of training info and model info enabled.
softmax_classifier = (
    Softmax()
    .setFeatureCols(FEATURE_COL_NAMES)
    .setLabelCol(LABEL_COL_NAME)
    .setPredictionCol(PREDICTION_COL_NAME)
    .enableLazyPrintTrainInfo()
    .enableLazyPrintModelInfo()
)

# Fit on the train split, then score the test split.
softmax_predictions = softmax_classifier.fit(train_data).transform(test_data)

# Multi-class metrics, reported under the title "Softmax".
softmax_evaluator = (
    EvalMultiClassBatchOp()
    .setLabelCol(LABEL_COL_NAME)
    .setPredictionCol(PREDICTION_COL_NAME)
    .lazyPrintMetrics("Softmax")
)
softmax_predictions.link(softmax_evaluator)

BatchOperator.execute();


In [None]:
#c_5
# Load the train/test splits stored in .ak files.
train_data = AkSourceBatchOp().setFilePath(DATA_DIR + TRAIN_FILE)
test_data = AkSourceBatchOp().setFilePath(DATA_DIR + TEST_FILE)

# Two network layouts are compared: one passing [4, 12, 3] to setLayers and
# one passing [4, 3]. Everything else in the pipeline is identical.
for layer_sizes, metrics_title in (
    ([4, 12, 3], "MultilayerPerceptronClassifier [4, 12, 3]"),
    ([4, 3], "MultilayerPerceptronClassifier [4, 3]"),
):
    # Fit on the train split, then score the test split.
    mlp_predictions = (
        MultilayerPerceptronClassifier()
        .setLayers(layer_sizes)
        .setFeatureCols(FEATURE_COL_NAMES)
        .setLabelCol(LABEL_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME)
        .fit(train_data)
        .transform(test_data)
    )

    # Multi-class metrics for this layout, printed under its own title.
    mlp_evaluator = (
        EvalMultiClassBatchOp()
        .setLabelCol(LABEL_COL_NAME)
        .setPredictionCol(PREDICTION_COL_NAME)
        .lazyPrintMetrics(metrics_title)
    )
    mlp_predictions.link(mlp_evaluator)

BatchOperator.execute();
