In [1]:
!pip install autokeras


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting autokeras
  Downloading autokeras-1.0.20-py3-none-any.whl (162 kB)
[K     |████████████████████████████████| 162 kB 5.0 MB/s 
[?25hCollecting keras-tuner>=1.1.0
  Downloading keras_tuner-1.1.3-py3-none-any.whl (135 kB)
[K     |████████████████████████████████| 135 kB 40.9 MB/s 
Collecting kt-legacy
  Downloading kt_legacy-1.0.4-py3-none-any.whl (9.6 kB)
Collecting jedi>=0.10
  Downloading jedi-0.18.1-py2.py3-none-any.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 34.5 MB/s 
Installing collected packages: jedi, kt-legacy, keras-tuner, autokeras
Successfully installed autokeras-1.0.20 jedi-0.18.1 keras-tuner-1.1.3 kt-legacy-1.0.4


In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.datasets import fetch_california_housing

import autokeras as ak


## A Simple Example
The first step is to prepare your data. Here we use the [California housing
dataset](
https://scikit-learn.org/stable/datasets/real_world.html#california-housing-dataset)
as an example.


In [3]:

house_dataset = fetch_california_housing()
df = pd.DataFrame(
    np.concatenate(
        (house_dataset.data, house_dataset.target.reshape(-1, 1)), axis=1
    ),
    columns=house_dataset.feature_names + ["Price"],
)
train_size = int(df.shape[0] * 0.9)
df[:train_size].to_csv("train.csv", index=False)
df[train_size:].to_csv("eval.csv", index=False)
train_file_path = "train.csv"
test_file_path = "eval.csv"


The second step is to run the
[StructuredDataRegressor](/structured_data_regressor).
As a quick demo, we set epochs to 10.
You can also leave the epochs unspecified for an adaptive number of epochs.


In [4]:
# Initialize the structured data regressor.
reg = ak.StructuredDataRegressor(
    overwrite=True, max_trials=3
)  # It tries 3 different models.
# Feed the structured data regressor with training data.
reg.fit(
    # The path to the train.csv file.
    train_file_path,
    # The name of the label column.
    "Price",
    epochs=10,
)
# Predict with the best model.
predicted_y = reg.predict(test_file_path)
# Evaluate the best model with testing data.
print(reg.evaluate(test_file_path, "Price"))


Trial 3 Complete [00h 00m 32s]
val_loss: 0.8068180680274963

Best val_loss So Far: 0.8068180680274963
Total elapsed time: 00h 01m 35s
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[0.5304303169250488, 0.5304303169250488]


## Data Format
The AutoKeras StructuredDataRegressor is quite flexible for the data format.

The example above shows how to use the CSV files directly. Besides CSV files,
it also supports numpy.ndarray, pandas.DataFrame or [tf.data.Dataset](
https://www.tensorflow.org/api_docs/python/tf/data/Dataset?version=stable). The
data should be two-dimensional with numerical or categorical values.

For the regression targets, it should be a vector of numerical values.
AutoKeras accepts numpy.ndarray, pandas.DataFrame, or pandas.Series.

The following examples show how the data can be prepared with numpy.ndarray,
pandas.DataFrame, and tensorflow.data.Dataset.


In [5]:

# x_train as pandas.DataFrame, y_train as pandas.Series
x_train = pd.read_csv(train_file_path)
print(type(x_train))  # pandas.DataFrame
y_train = x_train.pop("Price")
print(type(y_train))  # pandas.Series

# You can also use pandas.DataFrame for y_train.
y_train = pd.DataFrame(y_train)
print(type(y_train))  # pandas.DataFrame

# You can also use numpy.ndarray for x_train and y_train.
x_train = x_train.to_numpy()
y_train = y_train.to_numpy()
print(type(x_train))  # numpy.ndarray
print(type(y_train))  # numpy.ndarray

# Preparing testing data.
x_test = pd.read_csv(test_file_path)
y_test = x_test.pop("Price")

# It tries 10 different models.
reg = ak.StructuredDataRegressor(max_trials=3, overwrite=True)
# Feed the structured data regressor with training data.
reg.fit(x_train, y_train, epochs=10)
# Predict with the best model.
predicted_y = reg.predict(x_test)
# Evaluate the best model with testing data.
print(reg.evaluate(x_test, y_test))


Trial 3 Complete [00h 00m 32s]
val_loss: 0.910956621170044

Best val_loss So Far: 0.910956621170044
Total elapsed time: 00h 01m 32s
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[0.6027035713195801, 0.6027035713195801]


The following code shows how to convert numpy.ndarray to tf.data.Dataset.


In [6]:
train_set = tf.data.Dataset.from_tensor_slices((x_train, y_train))
test_set = tf.data.Dataset.from_tensor_slices((x_test, y_test))

reg = ak.StructuredDataRegressor(max_trials=3, overwrite=True)
# Feed the tensorflow Dataset to the regressor.
reg.fit(train_set, epochs=10)
# Predict with the best model.
predicted_y = reg.predict(test_set)
# Evaluate the best model with testing data.
print(reg.evaluate(test_set))


Trial 3 Complete [00h 00m 53s]
val_loss: 1.4779330492019653

Best val_loss So Far: 0.886483907699585
Total elapsed time: 00h 02m 42s
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[0.6080502867698669, 0.6080502867698669]


You can also specify the column names and types for the data as follows.  The
`column_names` is optional if the training data already have the column names,
e.g.  pandas.DataFrame, CSV file.  Any column, whose type is not specified will
be inferred from the training data.


In [7]:
# Initialize the structured data regressor.
reg = ak.StructuredDataRegressor(
    column_names=[
        "MedInc",
        "HouseAge",
        "AveRooms",
        "AveBedrms",
        "Population",
        "AveOccup",
        "Latitude",
        "Longitude",
    ],
    column_types={"MedInc": "numerical", "Latitude": "numerical"},
    max_trials=10,  # It tries 10 different models.
    overwrite=True,
)



## Validation Data
By default, AutoKeras use the last 20% of training data as validation data.  As
shown in the example below, you can use `validation_split` to specify the
percentage.


In [8]:
reg.fit(
    x_train,
    y_train,
    # Split the training data and use the last 15% as validation data.
    validation_split=0.15,
    epochs=10,
)


Trial 10 Complete [00h 00m 32s]
val_loss: 1.7442930936813354

Best val_loss So Far: 0.738528311252594
Total elapsed time: 00h 05m 23s
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fafb3a9c210>

You can also use your own validation set
instead of splitting it from the training data with `validation_data`.


In [9]:
split = 500
x_val = x_train[split:]
y_val = y_train[split:]
x_train = x_train[:split]
y_train = y_train[:split]
reg.fit(
    x_train,
    y_train,
    # Use your own validation set.
    validation_data=(x_val, y_val),
    epochs=10,
)


## Customized Search Space
For advanced users, you may customize your search space by using
[AutoModel](/auto_model/#automodel-class) instead of
[StructuredDataRegressor](/structured_data_regressor). You can configure the
[StructuredDataBlock](/block/#structureddatablock-class) for some high-level
configurations, e.g., `categorical_encoding` for whether to use the
[CategoricalToNumerical](/block/#categoricaltonumerical-class). You can also do
not specify these arguments, which would leave the different choices to be
tuned automatically. See the following example for detail.


In [10]:

input_node = ak.StructuredDataInput()
output_node = ak.StructuredDataBlock(categorical_encoding=True)(input_node)
output_node = ak.RegressionHead()(output_node)
reg = ak.AutoModel(
    inputs=input_node, outputs=output_node, overwrite=True, max_trials=3
)
reg.fit(x_train, y_train, epochs=10)


Trial 3 Complete [00h 00m 04s]
val_loss: 2.899165153503418

Best val_loss So Far: 2.4014809131622314
Total elapsed time: 00h 00m 12s
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fafb9851f10>

The usage of [AutoModel](/auto_model/#automodel-class) is similar to the
[functional API](https://www.tensorflow.org/guide/keras/functional) of Keras.
Basically, you are building a graph, whose edges are blocks and the nodes are
intermediate outputs of blocks.  To add an edge from `input_node` to
`output_node` with `output_node = ak.[some_block]([block_args])(input_node)`.

You can even also use more fine grained blocks to customize the search space
even further. See the following example.


In [11]:

input_node = ak.StructuredDataInput()
output_node = ak.CategoricalToNumerical()(input_node)
output_node = ak.DenseBlock()(output_node)
output_node = ak.RegressionHead()(output_node)
reg = ak.AutoModel(
    inputs=input_node, outputs=output_node, max_trials=3, overwrite=True
)
reg.fit(x_train, y_train, epochs=10)


Trial 3 Complete [00h 00m 03s]
val_loss: 0.8380818963050842

Best val_loss So Far: 0.8380818963050842
Total elapsed time: 00h 00m 09s
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fafb3a74590>

You can also export the best model found by AutoKeras as a Keras Model.


In [12]:
model = reg.export_model()
model.summary()
# numpy array in object (mixed type) is not supported.
# you need convert it to unicode or float first.
model.predict(x_train)



Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 8)]               0         
                                                                 
 multi_category_encoding (Mu  (None, 8)                0         
 ltiCategoryEncoding)                                            
                                                                 
 dense (Dense)               (None, 512)               4608      
                                                                 
 re_lu (ReLU)                (None, 512)               0         
                                                                 
 dense_1 (Dense)             (None, 32)                16416     
                                                                 
 re_lu_1 (ReLU)              (None, 32)                0         
                                                             

array([[2.9577947 ],
       [5.435803  ],
       [3.3932576 ],
       [3.1929822 ],
       [3.0267415 ],
       [2.8155713 ],
       [3.5776715 ],
       [3.598866  ],
       [3.1975522 ],
       [4.166531  ],
       [3.251759  ],
       [3.9233441 ],
       [3.4339185 ],
       [2.549912  ],
       [3.3767972 ],
       [2.7489743 ],
       [3.0922856 ],
       [2.7400098 ],
       [3.1609235 ],
       [2.9552999 ],
       [2.0433583 ],
       [2.7406507 ],
       [3.2082562 ],
       [3.0693326 ],
       [3.3001523 ],
       [2.100657  ],
       [2.6746259 ],
       [3.2536511 ],
       [3.2520566 ],
       [2.509419  ],
       [2.967175  ],
       [3.4083982 ],
       [3.1400952 ],
       [2.7518468 ],
       [3.5252118 ],
       [2.5184407 ],
       [2.9313016 ],
       [2.9366002 ],
       [2.9891667 ],
       [3.621182  ],
       [3.0916867 ],
       [2.4456143 ],
       [2.3391957 ],
       [2.8491254 ],
       [2.9076886 ],
       [2.9181638 ],
       [2.620129  ],
       [2.362

## Reference
[StructuredDataRegressor](/structured_data_regressor),
[AutoModel](/auto_model/#automodel-class),
[StructuredDataBlock](/block/#structureddatablock-class),
[DenseBlock](/block/#denseblock-class),
[StructuredDataInput](/node/#structureddatainput-class),
[RegressionHead](/block/#regressionhead-class),
[CategoricalToNumerical](/block/#categoricaltonumerical-class).
