In [None]:
from pyalink.alink import *
useLocalEnv(1)

from utils import *
import os
import pandas as pd

# --- Locations -------------------------------------------------------------
# ROOT_DIR is not defined in this notebook; presumably it is provided by one of
# the star imports above (TODO confirm). It is concatenated directly, so it
# must already end with a path separator.
DATA_DIR = f"{ROOT_DIR}dog_cat{os.sep}"

# Folder with the raw images, relative to DATA_DIR.
IMAGE_DIR = f"{DATA_DIR}train{os.sep}"

# --- Tensor datasets (file names relative to DATA_DIR) ----------------------
# Images read at 96x96.
TRAIN_96_FILE = "train_96.ak"
TEST_96_FILE = "test_96.ak"

# Images read at 32x32.
TRAIN_32_FILE = "train_32.ak"
TEST_32_FILE = "test_32.ak"

# --- Saved models (file names relative to DATA_DIR) --------------------------
MODEL_CNN_FILE = "model_cnn.ak"
MODEL_EFNET_FILE = "model_efnet.ak"
MODEL_EFNET_OFFLINE_FILE = "model_efnet_offline.ak"

# --- Output column names used by every predictor in this notebook -----------
PREDICTION_COL = "pred"
PREDICTION_DETAIL_COL = "pred_info"

In [None]:
#c_1

# One row per entry found in the raw image folder.
df = pd.DataFrame(os.listdir(IMAGE_DIR))

# Destination for the labelled file list; overwritten on every run.
list_all_sink = AkSinkBatchOp()\
    .setFilePath(DATA_DIR + "list_all.ak")\
    .setOverwriteSink(True)

# The label is whichever of "dog" / "cat" the regular expression finds in the
# file name. Ten rows are previewed before the list is written out.
(
    BatchOperator
    .fromDataframe(df, schemaStr='relative_path string')
    .select("relative_path, REGEXP_EXTRACT(relative_path, '(dog|cat)') AS label")
    .lazyPrint(10)
    .link(list_all_sink)
)
BatchOperator.execute()

# NOTE(review): splitTrainTestIfNotExist is not defined in this notebook;
# presumably it writes the train/test lists only when they are missing and
# treats 0.9 as the training fraction - confirm where it is defined.
splitTrainTestIfNotExist(
    AkSourceBatchOp().setFilePath(DATA_DIR + "list_all.ak"),
    DATA_DIR + "list_train.ak",
    DATA_DIR + "list_test.ak",
    0.9,
)

def _export_image_tensors(list_file, image_size, output_file):
    """Run one stream job that turns a list of image files into a tensor dataset.

    Args:
        list_file: Name (relative to DATA_DIR) of the .ak file whose
            "relative_path" column names the images to read.
        image_size: Value passed to both setImageWidth and setImageHeight.
        output_file: Name (relative to DATA_DIR) of the .ak file to write;
            an existing file is overwritten.
    """
    reader = (
        ReadImageToTensorStreamOp()
        .setRelativeFilePathCol("relative_path")
        .setRootFilePath(IMAGE_DIR)
        .setImageWidth(image_size)
        .setImageHeight(image_size)
        .setOutputCol("tensor")
    )
    sink = (
        AkSinkStreamOp()
        .setFilePath(DATA_DIR + output_file)
        .setOverwriteSink(True)
    )
    AkSourceStreamOp().setFilePath(DATA_DIR + list_file).link(reader).link(sink)
    # Each dataset is produced by its own job, one after the other.
    StreamOperator.execute()


# (list file, image size, output file) - same order as the jobs were run before
# they were folded into a helper.
_TENSOR_EXPORT_JOBS = [
    ("list_train.ak", 32, TRAIN_32_FILE),
    ("list_train.ak", 96, TRAIN_96_FILE),
    ("list_test.ak", 32, TEST_32_FILE),
    ("list_test.ak", 96, TEST_96_FILE),
]

for _list_file, _image_size, _output_file in _TENSOR_EXPORT_JOBS:
    _export_image_tensors(_list_file, _image_size, _output_file)

In [None]:
#c_2
def lr(train_set, test_set):
    """Fit a logistic-regression pipeline and print binary-classification metrics.

    The pipeline first applies TensorToVector to the "tensor" column (keeping
    only "label" as a reserved column) and then trains LogisticRegression on
    that column. The fitted pipeline is applied to ``test_set`` and the result
    is evaluated against the "label" column using the prediction-detail column.

    Args:
        train_set: Batch operator with "tensor" and "label" columns.
        test_set: Batch operator with the same columns, used for evaluation.
    """
    to_vector = (
        TensorToVector()
        .setSelectedCol("tensor")
        .setReservedCols(["label"])
    )
    classifier = (
        LogisticRegression()
        .setVectorCol("tensor")
        .setLabelCol("label")
        .setPredictionCol(PREDICTION_COL)
        .setPredictionDetailCol(PREDICTION_DETAIL_COL)
    )

    fitted = Pipeline().add(to_vector).add(classifier).fit(train_set)
    scored = fitted.transform(test_set)

    evaluator = (
        EvalBinaryClassBatchOp()
        .setLabelCol("label")
        .setPredictionDetailCol(PREDICTION_DETAIL_COL)
        .lazyPrintMetrics()
    )
    scored.link(evaluator)

    # The metrics print is lazy; it is produced when the batch job executes.
    BatchOperator.execute()

In [None]:
def cnn(train_set, test_set):
    """Train (if needed) a small Keras CNN classifier and evaluate it.

    Training is skipped when the model file DATA_DIR + MODEL_CNN_FILE already
    exists. The network is two Conv2D/MaxPooling2D stages followed by Flatten
    and Dropout(0.5). After training, the saved model is loaded, applied to
    ``test_set``, and a 10-row sample, column statistics and binary metrics
    are printed.

    Args:
        train_set: Batch operator with "tensor" and "label" columns.
        test_set: Batch operator with "relative_path", "label" and "tensor"
            columns (the first two are carried through to the prediction).
    """
    model_path = DATA_DIR + MODEL_CNN_FILE

    if not os.path.exists(model_path):
        layers = [
            "Conv2D(32, kernel_size=(3, 3), activation='relu')",
            "MaxPooling2D(pool_size=(2, 2))",
            "Conv2D(64, kernel_size=(3, 3), activation='relu')",
            "MaxPooling2D(pool_size=(2, 2))",
            "Flatten()",
            "Dropout(0.5)",
        ]
        trainer = (
            KerasSequentialClassifierTrainBatchOp()
            .setTensorCol("tensor")
            .setLabelCol("label")
            .setLayers(layers)
            .setNumEpochs(50)
            .setSaveCheckpointsEpochs(2.0)
            .setValidationSplit(0.1)
            .setSaveBestOnly(True)
            .setBestMetric("auc")
        )
        train_set.link(trainer).link(AkSinkBatchOp().setFilePath(model_path))
        BatchOperator.execute()

    predictor = (
        KerasSequentialClassifierPredictBatchOp()
        .setPredictionCol(PREDICTION_COL)
        .setPredictionDetailCol(PREDICTION_DETAIL_COL)
        .setReservedCols(["relative_path", "label"])
    )
    model_source = AkSourceBatchOp().setFilePath(model_path)
    scored = (
        predictor
        .linkFrom(model_source, test_set)
        .lazyPrint(10)
        .lazyPrintStatistics()
    )

    evaluator = (
        EvalBinaryClassBatchOp()
        .setLabelCol("label")
        .setPredictionDetailCol(PREDICTION_DETAIL_COL)
        .lazyPrintMetrics()
    )
    scored.link(evaluator)
    BatchOperator.execute()

In [None]:
#c_2
# Driver for the 32x32 experiment: times the whole run with a Stopwatch and
# prints the elapsed time at the end.
sw = Stopwatch()
sw.start()

# NOTE(review): presumably turns on Alink's progress logging - confirm.
AlinkGlobalConfiguration.setPrintProcessInfo(True)

# Datasets stored under TRAIN_32_FILE / TEST_32_FILE in DATA_DIR.
train_set = AkSourceBatchOp().setFilePath(DATA_DIR + TRAIN_32_FILE)
test_set = AkSourceBatchOp().setFilePath(DATA_DIR + TEST_32_FILE)

# Logistic-regression run, currently disabled.
# lr(train_set, test_set)

cnn(train_set, test_set)

sw.stop()
print(sw.getElapsedTimeSpan())

In [None]:
def efficientnet(train_set, test_set):
    """Train (if needed) a classifier on top of a TF-Hub layer and evaluate it.

    Training is skipped when DATA_DIR + MODEL_EFNET_FILE already exists. The
    model is a ``hub.KerasLayer`` referenced by URL followed by ``Flatten()``,
    so training needs network access to that URL. The saved model is then
    applied to ``test_set``; a 10-row sample, column statistics and binary
    metrics are printed.

    Args:
        train_set: Batch operator with "tensor" and "label" columns.
        test_set: Batch operator with "relative_path", "label" and "tensor"
            columns (the first two are carried through to the prediction).
    """
    model_path = DATA_DIR + MODEL_EFNET_FILE

    if not os.path.exists(model_path):
        layers = [
            "hub.KerasLayer('https://hub.tensorflow.google.cn/google/efficientnet/b0/classification/1')",
            "Flatten()",
        ]
        trainer = (
            KerasSequentialClassifierTrainBatchOp()
            .setTensorCol("tensor")
            .setLabelCol("label")
            .setLayers(layers)
            .setNumEpochs(5)
            .setIntraOpParallelism(1)
            .setSaveCheckpointsEpochs(0.5)
            .setValidationSplit(0.1)
            .setSaveBestOnly(True)
            .setBestMetric("auc")
        )
        train_set.link(trainer).link(AkSinkBatchOp().setFilePath(model_path))
        BatchOperator.execute()

    predictor = (
        KerasSequentialClassifierPredictBatchOp()
        .setPredictionCol(PREDICTION_COL)
        .setPredictionDetailCol(PREDICTION_DETAIL_COL)
        .setReservedCols(["relative_path", "label"])
    )
    model_source = AkSourceBatchOp().setFilePath(model_path)
    scored = (
        predictor
        .linkFrom(model_source, test_set)
        .lazyPrint(10)
        .lazyPrintStatistics()
    )

    evaluator = (
        EvalBinaryClassBatchOp()
        .setLabelCol("label")
        .setPredictionDetailCol(PREDICTION_DETAIL_COL)
        .lazyPrintMetrics()
    )
    scored.link(evaluator)
    BatchOperator.execute()

In [None]:
def efficientnet_offline(train_set, test_set):
    """Same as the online EfficientNet run, but the hub layer is loaded from disk.

    Training is skipped when DATA_DIR + MODEL_EFNET_OFFLINE_FILE already
    exists. The model is a ``hub.KerasLayer`` pointing at the local path
    DATA_DIR + "1" (presumably a downloaded copy of the hub module - confirm)
    followed by ``Flatten()``. The saved model is then applied to
    ``test_set``; a 10-row sample, column statistics and binary metrics are
    printed.

    Args:
        train_set: Batch operator with "tensor" and "label" columns.
        test_set: Batch operator with "relative_path", "label" and "tensor"
            columns (the first two are carried through to the prediction).
    """
    model_path = DATA_DIR + MODEL_EFNET_OFFLINE_FILE

    if not os.path.exists(model_path):
        # Layer descriptions are Python expressions, so the path has to be a
        # valid Python string literal. repr() escapes backslashes and quotes;
        # splicing the raw path between quotes breaks on Windows, where
        # DATA_DIR ends in "\" and the literal would end in the escape "\1".
        # For plain POSIX paths the resulting text is unchanged.
        hub_layer = "hub.KerasLayer(" + repr(DATA_DIR + "1") + ")"

        trainer = (
            KerasSequentialClassifierTrainBatchOp()
            .setTensorCol("tensor")
            .setLabelCol("label")
            .setLayers([
                hub_layer,
                "Flatten()",
            ])
            .setNumEpochs(5)
            .setIntraOpParallelism(1)
            .setSaveCheckpointsEpochs(0.5)
            .setValidationSplit(0.1)
            .setSaveBestOnly(True)
            .setBestMetric("auc")
        )
        train_set.link(trainer).link(AkSinkBatchOp().setFilePath(model_path))
        BatchOperator.execute()

    predictor = (
        KerasSequentialClassifierPredictBatchOp()
        .setPredictionCol(PREDICTION_COL)
        .setPredictionDetailCol(PREDICTION_DETAIL_COL)
        .setReservedCols(["relative_path", "label"])
    )
    model_source = AkSourceBatchOp().setFilePath(model_path)
    scored = (
        predictor
        .linkFrom(model_source, test_set)
        .lazyPrint(10)
        .lazyPrintStatistics()
    )

    evaluator = (
        EvalBinaryClassBatchOp()
        .setLabelCol("label")
        .setPredictionDetailCol(PREDICTION_DETAIL_COL)
        .lazyPrintMetrics()
    )
    scored.link(evaluator)
    BatchOperator.execute()

In [None]:
#c_3
# Driver for the 96x96 experiment: times both EfficientNet runs together with
# a Stopwatch and prints the elapsed time at the end.
sw = Stopwatch()
sw.start()

# NOTE(review): presumably turns on Alink's progress logging - confirm.
AlinkGlobalConfiguration.setPrintProcessInfo(True)

# Datasets stored under TRAIN_96_FILE / TEST_96_FILE in DATA_DIR.
train_set = AkSourceBatchOp().setFilePath(DATA_DIR + TRAIN_96_FILE)
test_set = AkSourceBatchOp().setFilePath(DATA_DIR + TEST_96_FILE)

# Hub layer referenced by URL.
efficientnet(train_set, test_set)

# Hub layer referenced by a path under DATA_DIR.
efficientnet_offline(train_set, test_set)

sw.stop()
print(sw.getElapsedTimeSpan())

In [None]:
# Regression demo: train a 1-D convolutional Keras model on a CSV of
# string-encoded tensors and print ten predictions.

# Raw CSV: each row is a string-encoded tensor plus a numeric label.
csv_source = (
    CsvSourceBatchOp()
    .setFilePath("https://alink-release.oss-cn-beijing.aliyuncs.com/data-files/random_tensor.csv")
    .setSchemaStr("tensor string, label double")
)

# Parse the string column into DOUBLE tensors of shape [200, 3].
source = (
    ToTensorBatchOp()
    .setSelectedCol("tensor")
    .setTensorDataType("DOUBLE")
    .setTensorShape([200, 3])
    .linkFrom(csv_source)
)

# Layer descriptions are Python expressions, so their string arguments must use
# ASCII quotes; typographic quotes (‘ ’) are not valid Python string delimiters.
trainer = (
    KerasSequentialRegressor()
    .setTensorCol("tensor")
    .setLabelCol("label")
    .setLayers([
        "Conv1D(256, 5, padding='same', activation='relu')",
        "Conv1D(128, 5, padding='same', activation='relu')",
        "Dropout(0.1)",
        "MaxPooling1D(pool_size=8)",
        "Conv1D(128, 5, padding='same', activation='relu')",
        "Conv1D(128, 5, padding='same', activation='relu')",
        "Flatten()",
    ])
    .setOptimizer("Adam()")
    .setNumEpochs(1)
    .setPredictionCol("pred")
    .setReservedCols(["label"])
)

model = trainer.fit(source)
# Predictions are made on the same data the model was trained on.
prediction = model.transform(source)
prediction.lazyPrint(10)
BatchOperator.execute()

