In [1]:
!pip install autokeras
!pip install keras-tuner

Collecting autokeras
  Downloading autokeras-1.1.0-py3-none-any.whl (148 kB)
     |████████████████████████████████| 148 kB 6.7 MB/s            
Collecting keras-tuner>=1.1.0
  Downloading keras_tuner-1.3.5-py3-none-any.whl (176 kB)
     |████████████████████████████████| 176 kB 53.4 MB/s            
[?25hCollecting keras-nlp>=0.4.0
  Downloading keras_nlp-0.5.2-py3-none-any.whl (527 kB)
     |████████████████████████████████| 527 kB 64.6 MB/s            
[?25hCollecting tensorflow>=2.8.0
  Downloading tensorflow-2.13.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (524.1 MB)
     |████████████████████████████████| 524.1 MB 13 kB/s            
Collecting tensorflow-text
  Downloading tensorflow_text-2.13.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.5 MB)
     |████████████████████████████████| 6.5 MB 67.4 MB/s            
Collecting kt-legacy
  Downloading kt_l

Installing collected packages: google-auth, tensorboard-data-server, protobuf, numpy, grpcio, google-auth-oauthlib, absl-py, tensorflow-io-gcs-filesystem, tensorflow-estimator, tensorboard, libclang, keras, flatbuffers, tensorflow, tensorflow-text, kt-legacy, keras-tuner, keras-nlp, autokeras
  Attempting uninstall: google-auth
    Found existing installation: google-auth 2.3.3
    Uninstalling google-auth-2.3.3:
      Successfully uninstalled google-auth-2.3.3
  Attempting uninstall: tensorboard-data-server
    Found existing installation: tensorboard-data-server 0.6.1
    Uninstalling tensorboard-data-server-0.6.1:
      Successfully uninstalled tensorboard-data-server-0.6.1
  Attempting uninstall: protobuf
    Found existing installation: protobuf 3.19.1
    Uninstalling protobuf-3.19.1:
      Successfully uninstalled protobuf-3.19.1
  Attempting uninstall: numpy
    Found existing installation: numpy 1.21.4
    Uninstalling numpy-1.21.4:
      Successfully uninstalled numpy-1.21.4


In [18]:
!pip install protobuf==3.20.*



In [8]:
from konlpy.tag import Mecab
from collections import Counter

import pandas as pd
import numpy as np
import re

import tensorflow as tf

# Shared KoNLPy Mecab tagger instance; tokenize() calls its .morphs() method
# on every sentence, so it is created once at module level.
tokenizer = Mecab()
# Tokens that tokenize() discards from the output of the morphological analysis.
stopwords = [
    '의', '가', '이', '은', '들', '는', '좀', '잘', '걍',
    '과', '도', '를', '으로', '자', '에', '와', '한', '하다',
]

def data_load(data_dir='../../data', random_state=42):
    """Load the CSV datasets and split them into train / validation / test frames.

    ``preprocessed_train.csv``, ``train_rs.csv`` and ``train_rd.csv`` are read
    from ``data_dir`` and each is cut by row position into the leading 60 %,
    the next 20 % and the final 20 %. Only ``preprocessed_train`` contributes
    validation and test rows. From the other two files, only a random fraction
    of the leading 60 % is added to the training set.

    Args:
        data_dir: Directory that contains the three CSV files.
        random_state: Seed for the random subsampling of ``train_rs`` and
            ``train_rd``. A fixed default makes repeated runs return the same
            training set; pass ``None`` for an unseeded draw.

    Returns:
        tuple: ``(train, val, test)`` DataFrames with duplicate rows removed.
    """
    # 'frac' is the share of the 60 % training slice that is kept. It is not
    # applied to 'preprocessed_train', whose training slice is used in full.
    datasets = [
        { 'name': 'preprocessed_train', 'frac': 1 },
        { 'name': 'train_rs', 'frac': 0.4 },
        { 'name': 'train_rd', 'frac': 0.35 }
    ]

    # A single generator is shared by all sample() calls so the whole split is
    # reproducible from one seed while each file still gets its own draw.
    rng = np.random.RandomState(random_state)

    trains = []
    vals = []
    tests = []

    for spec in datasets:
        frame = pd.read_csv(f'{data_dir}/{spec["name"]}.csv')

        # Positional split boundaries, computed once per file.
        train_end = int(len(frame) * 0.6)
        val_end = int(len(frame) * 0.8)

        train_part = frame[:train_end]

        if spec['name'] == 'preprocessed_train':
            vals.append(frame[train_end:val_end])
            tests.append(frame[val_end:])
        else:
            train_part = train_part.sample(frac=spec['frac'], random_state=rng)

        trains.append(train_part)

    train = pd.concat(trains).drop_duplicates()
    val = pd.concat(vals).drop_duplicates()
    test = pd.concat(tests).drop_duplicates()

    return train, val, test

def preprocess_sentence(sentence):
    """Normalize one raw sentence before tokenization.

    The text is lower-cased and stripped, then three substitutions run in
    order: ``?``, ``.``, ``!`` and ``,`` are surrounded with spaces; runs of
    spaces / double quotes collapse to one space; every run of characters
    other than ASCII letters, ``?``, ``.``, ``!`` and Hangul (syllables 가-힣,
    jamo ㄱ-ㅎ and ㅏ-ㅣ) becomes a single space. The result is stripped again.

    Args:
        sentence: The raw sentence string.

    Returns:
        The cleaned sentence string.
    """
    # (pattern, replacement) pairs, applied top to bottom.
    substitutions = (
        (r"([?.!,])", r" \1 "),
        (r'[" "]+', " "),
        (r"[^a-zA-Z?.!가-힣ㄱ-ㅎㅏ-ㅣ]+", " "),
    )

    cleaned = sentence.lower().strip()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()
    
    
def tokenize(it):
    """Split every sentence into morphemes and drop stopwords.

    Args:
        it: Iterable of sentence strings.

    Returns:
        A list with one token list per input sentence, in input order. Each
        token list holds the output of the module-level ``tokenizer.morphs``
        minus any token found in the module-level ``stopwords``.
    """
    return [
        [morph for morph in tokenizer.morphs(text) if morph not in stopwords]
        for text in it
    ]


def pad_sequences(it, word_to_index, maxlen=400):
    """Pad index sequences to a common length with the ``<PAD>`` index.

    Args:
        it: Sequence of integer index lists.
        word_to_index: Vocabulary mapping; must contain the key ``"<PAD>"``,
            whose value is used as the fill value.
        maxlen: Length of every returned sequence. Defaults to 400, the value
            this function previously hard-coded.

    Returns:
        The result of Keras ``pad_sequences``. Padding is added in front of
        each sequence (``padding='pre'``); the ``truncating`` argument is left
        at the Keras default, which governs how longer sequences are cut.
    """
    return tf.keras.preprocessing.sequence.pad_sequences(
        it, value=word_to_index["<PAD>"], padding='pre', maxlen=maxlen)


def preprocess(vocab_size=10000, num_classes=4):
    """Run the full pipeline from CSV files to padded index arrays and one-hot labels.

    Steps: load the splits with ``data_load()``, clean each ``'data'`` value
    with ``preprocess_sentence``, tokenize, build a vocabulary from the
    training tokens only, map tokens to indices (unknown tokens map to
    ``<UNK>``) and pad with ``pad_sequences``. The ``'label'`` column is
    one-hot encoded.

    Args:
        vocab_size: Total vocabulary size including the four special tokens
            ``<PAD>``, ``<BOS>``, ``<UNK>`` and ``<UNUSED>``.
        num_classes: Number of classes used for the one-hot labels.

    Returns:
        tuple: ``(x_train, y_train, x_val, y_val, x_test, y_test)``.
    """
    train, val, test = data_load()

    # Clean and tokenize each split; the loaded frames are not modified.
    x_train = tokenize(train['data'].apply(preprocess_sentence))
    x_val = tokenize(val['data'].apply(preprocess_sentence))
    x_test = tokenize(test['data'].apply(preprocess_sentence))

    y_train = tf.keras.utils.to_categorical(train['label'], num_classes=num_classes, dtype='float32')
    y_val = tf.keras.utils.to_categorical(val['label'], num_classes=num_classes, dtype='float32')
    y_test = tf.keras.utils.to_categorical(test['label'], num_classes=num_classes, dtype='float32')

    # Count tokens straight from the token lists. This avoids copying every
    # token into one large NumPy string array just to flatten it, and it also
    # works when a sentence (or the whole split) yields no tokens.
    counter = Counter()
    for tokens in x_train:
        counter.update(tokens)

    # Special tokens occupy the first indices; the rest of the vocabulary is
    # filled with the most frequent training tokens.
    specials = ['<PAD>', '<BOS>', '<UNK>', '<UNUSED>']
    frequent = counter.most_common(vocab_size - len(specials))
    vocab = specials + [word for word, _ in frequent]
    word_to_index = {word: index for index, word in enumerate(vocab)}
    unk_index = word_to_index['<UNK>']

    def wordlist_to_index_list(wordlist):
        """Map a token list to vocabulary indices, using <UNK> for unseen tokens."""
        return [word_to_index.get(word, unk_index) for word in wordlist]

    x_train = pad_sequences([wordlist_to_index_list(tokens) for tokens in x_train], word_to_index)
    x_val = pad_sequences([wordlist_to_index_list(tokens) for tokens in x_val], word_to_index)
    x_test = pad_sequences([wordlist_to_index_list(tokens) for tokens in x_test], word_to_index)

    return x_train, y_train, x_val, y_val, x_test, y_test

# Build the padded index arrays and one-hot labels once at cell level; the
# AutoKeras cell further down reads x_train, y_train, x_val and y_val.
x_train, y_train, x_val, y_val, x_test, y_test = preprocess()

In [19]:
import autokeras as ak

# The targets are one-hot vectors built with to_categorical from a single
# 'label' value per sample, i.e. each sample belongs to exactly one class.
# That is multi-class, not multi-label, classification, so multi_label is False.
# NOTE(review): the AutoKeras documentation describes TextClassifier input as
# raw sentence strings, while x_train / x_val here are padded integer index
# sequences - confirm that this input is accepted, or feed the cleaned
# sentences instead.
clf = ak.TextClassifier(
    multi_label=False,
    overwrite=True,
    max_trials=3)

# `epochs` is an argument of fit(), not of the TextClassifier constructor.
clf.fit(
    x_train,
    y_train,
    epochs=15,
    validation_data=(x_val, y_val))

TypeError: Descriptors cannot not be created directly.
If this call came from a _pb2.py file, your generated code is out of date and must be regenerated with protoc >= 3.19.0.
If you cannot immediately regenerate your protos, some other possible workarounds are:
 1. Downgrade the protobuf package to 3.20.x or lower.
 2. Set PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python (but this will use pure-Python parsing and will be much slower).

More information: https://developers.google.com/protocol-buffers/docs/news/2022-05-06#python-updates