<a href="https://colab.research.google.com/github/amrungwaew/cse842/blob/main/jefri66_nn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# NOTE(review): nltk is imported again in the next cell, so this import is
# redundant here; it is left in place to keep the cell's behavior unchanged.
import nltk
# Install KerasTuner into the running kernel's environment (needed for the
# `keras_tuner` import in the next cell). The version is not pinned, so the
# installed release may differ between runs.
%pip install keras_tuner

In [None]:
import nltk
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense
from tensorflow.keras.optimizers import Adam
from functools import partial
from keras_tuner import RandomSearch, Objective

# Fetch the treebank corpus sample read by `treebank.tagged_sents(...)` below.
nltk.download("treebank")
# Fetch the tag mapping required by the `tagset="universal"` argument used below.
nltk.download("universal_tagset")
# Corpus reader used by the exploration cell and by `load_tagged_sentences`.
from nltk.corpus import treebank

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


In [None]:
# Summarise how many tokens each treebank sentence contains.
# NOTE(review): presumably this distribution informed the `max_words` padding
# length chosen in `main` -- confirm with the author.
tagged_sentences = treebank.tagged_sents(tagset="universal")
goje = pd.Series(list(map(len, tagged_sentences)))
print(goje.describe())

count    3914.000000
mean       25.722024
std        12.868534
min         1.000000
25%        17.000000
50%        25.000000
75%        33.000000
max       271.000000
dtype: float64


In [None]:
def make_tagged_sentence_df(tagged_sentence, max_words):
    """Build a fixed-length, per-token feature table for one tagged sentence.

    The sentence is truncated to ``max_words`` tokens, six binary indicator
    columns are attached to every token, and the table is then padded at the
    bottom with ``"FILLIN"`` rows until it holds exactly ``max_words`` rows.

    Args:
        tagged_sentence: Sequence of ``(word, pos)`` pairs.
        max_words: Number of rows the returned frame must have.

    Returns:
        pandas.DataFrame with string columns ``word`` and ``pos`` followed by
        float32 columns ``first_word``, ``last_word``, ``first_upper``,
        ``all_upper``, ``is_num`` and ``hyphen_present``. Padding rows carry
        ``"FILLIN"`` in both string columns and 0 in every indicator column.
        The index is not reset, so padding rows restart counting at 0.
    """
    text_cols = ("word", "pos")

    tokens = pd.DataFrame(
        [(word, pos) for word, pos in tagged_sentence],
        columns=list(text_cols),
        dtype="string",
    ).iloc[:max_words]

    words = tokens.word
    # Compared against the *untruncated* sentence length, so a sentence that
    # was cut off at max_words has no row flagged as its last word.
    final_position = len(tagged_sentence) - 1

    indicators = {
        "first_word": tokens.index == 0,
        "last_word": tokens.index == final_position,
        "first_upper": words.str.istitle(),
        "all_upper": words.str.isupper(),
        "is_num": words.str.isnumeric(),
        "hyphen_present": words.str.contains("-"),
    }
    featured = tokens.assign(
        **{name: flag.astype("float32") for name, flag in indicators.items()}
    )

    pad_rows = max_words - featured.shape[0]
    if pad_rows == 0:
        return featured

    indicator_cols = [col for col in featured.columns if col not in text_cols]

    # Padding rows: placeholder token/tag, every indicator switched off.
    pad_values = {
        **{col: ["FILLIN"] * pad_rows for col in text_cols},
        **{col: [0] * pad_rows for col in indicator_cols},
    }
    pad_dtypes = {
        **{col: "string" for col in text_cols},
        **{col: "float32" for col in indicator_cols},
    }
    padding = pd.DataFrame(pad_values).astype(pad_dtypes)

    return pd.concat([featured, padding])


def load_tagged_sentences(max_words):
    """Load the universal-tagset treebank sentences as one stacked frame.

    Every sentence is converted with ``make_tagged_sentence_df`` so that it
    contributes exactly ``max_words`` rows, and the per-sentence frames are
    concatenated in corpus order.

    Args:
        max_words: Number of rows each sentence is truncated/padded to.

    Returns:
        pandas.DataFrame holding all per-sentence frames stacked vertically
        (the per-sentence indexes are kept as-is, not reset).
    """
    per_sentence_frames = []
    for sentence in treebank.tagged_sents(tagset="universal"):
        per_sentence_frames.append(make_tagged_sentence_df(sentence, max_words))
    return pd.concat(per_sentence_frames)


def format_for_keras(
    tagged_sentences_df, features_colnames, max_words, train_fraction=0.8
):
    """Reshape the stacked token table into 3-D train/test arrays.

    Rows are grouped into consecutive chunks of ``max_words`` (one chunk per
    sentence) and split, in order and without shuffling, into a leading
    training part and a trailing test part.

    Args:
        tagged_sentences_df: Frame with a ``pos`` column and the columns named
            in ``features_colnames``; its row count must be a multiple of
            ``max_words``.
        features_colnames: Names of the columns used as model inputs.
        max_words: Number of rows that make up one sentence.
        train_fraction: Share of sentences placed in the training split.
            Defaults to 0.8, the value that was previously hard-coded.

    Returns:
        dict with keys ``train_features`` / ``test_features`` (arrays reshaped
        to ``(-1, max_words, len(features_colnames))``), ``train_labels`` /
        ``test_labels`` (one-hot float32 arrays reshaped to
        ``(-1, max_words, nunique_labels)``) and ``nunique_labels`` (number of
        distinct values in ``pos``).

    Raises:
        ValueError: If ``train_fraction`` is outside ``[0, 1]`` or the number
            of rows is not a multiple of ``max_words``.
    """
    if not 0.0 <= train_fraction <= 1.0:
        raise ValueError(
            f"train_fraction must be between 0 and 1, got {train_fraction!r}"
        )
    n_rows = len(tagged_sentences_df)
    if max_words <= 0 or n_rows % max_words != 0:
        # Fail early with a readable message instead of numpy's reshape error.
        raise ValueError(
            f"number of rows ({n_rows}) is not a positive multiple of "
            f"max_words ({max_words})"
        )

    data_dict = {}

    features = (
        tagged_sentences_df[features_colnames]
        .to_numpy()
        .reshape((-1, max_words, len(features_colnames)))
    )

    # Ordered split: the first `split_index` sentences train, the rest test.
    split_index = int(round(features.shape[0] * train_fraction))

    data_dict["train_features"] = features[:split_index]
    data_dict["test_features"] = features[split_index:]

    data_dict["nunique_labels"] = tagged_sentences_df.pos.nunique()
    # One column per distinct tag; get_dummies orders the columns by tag value.
    labels = (
        pd.get_dummies(tagged_sentences_df.pos, dtype="float32")
        .to_numpy()
        .reshape((-1, max_words, data_dict["nunique_labels"]))
    )
    data_dict["train_labels"] = labels[:split_index]
    data_dict["test_labels"] = labels[split_index:]

    return data_dict


def build_model(nunique_labels, hp):
    """Assemble and compile a stacked-GRU tagger from tuner hyperparameters.

    Hyperparameters requested from ``hp``:
        * ``num_layers``: number of GRU layers, between 1 and 3.
        * ``units_<i>``: width of GRU layer ``i``, from ``nunique_labels`` up
          to 100 in steps of 10.
        * ``activation``: ``"relu"`` or ``"tanh"``; requested under the same
          name for every layer.
        * ``lr``: Adam learning rate, log-sampled between 1e-4 and 1e-2.

    Args:
        nunique_labels: Number of output classes; also the lower bound for
            the GRU widths.
        hp: Hyperparameter container supplied by the tuner.

    Returns:
        The compiled ``Sequential`` model (categorical cross-entropy loss,
        accuracy metric).
    """
    network = Sequential()

    depth = hp.Int("num_layers", 1, 3)
    for layer_index in range(depth):
        # Request the width before the activation so hyperparameters are
        # requested in the same order on every call.
        layer_units = hp.Int(
            f"units_{layer_index}",
            min_value=nunique_labels,
            max_value=100,
            step=10,
        )
        layer_activation = hp.Choice("activation", ["relu", "tanh"])
        recurrent_layer = GRU(
            units=layer_units,
            activation=layer_activation,
            # Keep the full sequence so every position receives a prediction.
            return_sequences=True,
        )
        network.add(recurrent_layer)

    network.add(Dense(nunique_labels, activation="softmax"))

    optimizer = Adam(
        learning_rate=hp.Float("lr", min_value=1e-4, max_value=1e-2, sampling="log")
    )
    network.compile(
        optimizer=optimizer,
        loss="categorical_crossentropy",
        metrics=["accuracy"],
    )
    return network


def main(max_words=30, max_trials=10):
    """Run the full pipeline: load data, format it, and tune the GRU tagger.

    Args:
        max_words: Number of tokens each sentence is truncated/padded to.
            Defaults to 30, the value that was previously hard-coded.
        max_trials: Number of hyperparameter configurations the random search
            tries. Defaults to 10, the value that was previously hard-coded.

    Side effects:
        Prints the training feature/label array shapes, runs the tuner search,
        and prints the tuner's results summary.
    """
    tagged_sentences_df = load_tagged_sentences(max_words)

    # Every column except the raw token and its tag is a model input.
    features_colnames = [
        col for col in tagged_sentences_df.columns if col not in ("word", "pos")
    ]
    data_dict = format_for_keras(tagged_sentences_df, features_colnames, max_words)
    print(data_dict["train_features"].shape)
    print(data_dict["train_labels"].shape)

    # The tuner calls the hypermodel with `hp` only, so bind the label count.
    hypermodel = partial(build_model, data_dict["nunique_labels"])

    tuner = RandomSearch(
        hypermodel,
        max_trials=max_trials,
        objective=Objective("val_accuracy", "max"),
    )
    # NOTE(review): the held-out split doubles as the validation set, so the
    # reported val_accuracy is also the score used to pick hyperparameters.
    tuner.search(
        data_dict["train_features"],
        data_dict["train_labels"],
        validation_data=(data_dict["test_features"], data_dict["test_labels"]),
    )
    # results_summary() prints the report itself and returns None, so wrapping
    # it in print() only appended a stray "None" to the output.
    tuner.results_summary()


# Entry point: run the whole pipeline when this cell/script is executed
# directly (the recorded output below shows the guard passes in the notebook).
if __name__ == "__main__":
    main()


Trial 10 Complete [00h 00m 08s]
val_accuracy: 0.3145168125629425

Best val_accuracy So Far: 0.4809706211090088
Total elapsed time: 00h 01m 45s
Results summary
Results in ./untitled_project
Showing 10 best trials
<keras_tuner.engine.objective.Objective object at 0x7fd604e7d340>
Trial summary
Hyperparameters:
num_layers: 3
units_0: 33
activation: tanh
lr: 0.0015498572359259769
units_1: 53
units_2: 43
Score: 0.4809706211090088
Trial summary
Hyperparameters:
num_layers: 2
units_0: 23
activation: tanh
lr: 0.0016431536744526529
units_1: 33
units_2: 33
Score: 0.45913153886795044
Trial summary
Hyperparameters:
num_layers: 1
units_0: 83
activation: tanh
lr: 0.0010104015908082703
Score: 0.4544912874698639
Trial summary
Hyperparameters:
num_layers: 2
units_0: 13
activation: relu
lr: 0.0008166224340086715
units_1: 23
units_2: 93
Score: 0.3629629611968994
Trial summary
Hyperparameters:
num_layers: 1
units_0: 63
activation: relu
lr: 0.0007134180666249963
units_1: 33
units_2: 93
Score: 0.314516812562