# Stack Overflow problem

In [3]:
from tensorflow.keras import utils
import pathlib

data_url = 'https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz'

dataset_dir = utils.get_file(
    origin=data_url,
    untar=True,
    cache_dir='stack_overflow',
    cache_subdir='')

dataset_dir = pathlib.Path(dataset_dir).parent

In [4]:
#注意文件夹，只有train和test，valid数据集需从train数据集中分离
train_dir = dataset_dir/'train'
test_dir=dataset_dir/'test'
list(dataset_dir.iterdir())

[PosixPath('/tmp/.keras/README.md'),
 PosixPath('/tmp/.keras/stack_overflow_16k.tar.gz'),
 PosixPath('/tmp/.keras/test'),
 PosixPath('/tmp/.keras/train')]

## CNN+dense 模型

In [4]:
# text_dataset_from_directory +TextVectorization
batch_size = 32
seed = 42
#拆分训练集和验证集
#保持seed一致，或者都shuffle=false

raw_train_ds = utils.text_dataset_from_directory(
    train_dir,
    batch_size=batch_size,
    validation_split=0.2,
    subset='training',
    seed=seed)

raw_val_ds = utils.text_dataset_from_directory(
    train_dir,
    batch_size=batch_size,
    validation_split=0.2,
    subset='validation',
    seed=seed)

raw_test_ds = utils.text_dataset_from_directory(
    test_dir,
    batch_size=batch_size)

Found 8000 files belonging to 4 classes.
Using 6400 files for training.
Found 8000 files belonging to 4 classes.
Using 1600 files for validation.
Found 8000 files belonging to 4 classes.


In [4]:
for i, label in enumerate(raw_train_ds.class_names):
    print("Label", i, "corresponds to", label)
    
num_labels=4

Label 0 corresponds to csharp
Label 1 corresponds to java
Label 2 corresponds to javascript
Label 3 corresponds to python


In [5]:
#对数据进行标准化、词例化和向量化
#文本转换为小写，按空格分割，向量化模式为int

from tensorflow.keras.layers import TextVectorization

VOCAB_SIZE =  10000 #词汇量
MAX_SEQUENCE_LENGTH = 250 #最大序列长度

int_vectorize_layer = TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_mode='int',
    output_sequence_length=MAX_SEQUENCE_LENGTH)

In [6]:
#调用 TextVectorization.adapt 以使预处理层的状态适合数据集。这会使模型构建字符串到整数的索引。
train_text = raw_train_ds.map(lambda text, labels: text)
int_vectorize_layer.adapt(train_text)

#Once the vocabulary is set, the layer can encode text into indices. 
#The tensors of indices are 0-padded to the longest sequence in the batch 
#unless you set a fixed output_sequence_length

In [7]:
#创建数据集
import tensorflow as tf

def int_vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return int_vectorize_layer(text), label

int_train_ds = raw_train_ds.map(int_vectorize_text)
int_val_ds = raw_val_ds.map(int_vectorize_text)
int_test_ds = raw_test_ds.map(int_vectorize_text)

In [8]:
import keras  

model = keras.Sequential([
      keras.layers.Embedding(VOCAB_SIZE, 64, mask_zero=True),
      keras.layers.Conv1D(64, 5, padding="valid", activation="relu", strides=2),
      keras.layers.GlobalMaxPooling1D(),
      keras.layers.Dense(num_labels,activation="softmax")
])

In [9]:
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer='adam',
    metrics=['accuracy'])
history = model.fit(int_train_ds, validation_data=int_val_ds, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [10]:
model.evaluate(int_test_ds)



[0.5026109218597412, 0.8132500052452087]

## TFHUB 训练方式

In [6]:
import tensorflow_hub as hub
import tensorflow as tf  
from tensorflow.keras import utils
seed=42
#如果用tfhub训练文本，则不能给数据集打批次

hub_train_ds = utils.text_dataset_from_directory(
    train_dir,
    batch_size=None,
    validation_split=0.2,
    subset='training',
    seed=seed)

hub_val_ds = utils.text_dataset_from_directory(
    train_dir,
    batch_size=None,
    validation_split=0.2,
    subset='validation',
    seed=seed)

hub_test_ds = utils.text_dataset_from_directory(
    test_dir,
    batch_size=None)


embedding = "https://tfhub.dev/google/nnlm-en-dim128-with-normalization/2"
hub_layer = hub.KerasLayer(embedding, input_shape=[], 
                           dtype=tf.string, trainable=True)


Found 8000 files belonging to 4 classes.
Using 6400 files for training.


2024-05-11 18:48:57.610463: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-05-11 18:48:58.646811: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-05-11 18:48:58.647595: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-05-11 18:48:58.661400: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-05-11 18:48:58.662211: I external/local_xla/xla/stream_executor

Found 8000 files belonging to 4 classes.
Using 1600 files for validation.
Found 8000 files belonging to 4 classes.


In [9]:
AUTOTUNE = tf.data.AUTOTUNE

hub_train_ds = hub_train_ds.cache().prefetch(buffer_size=AUTOTUNE)
hub_val_ds = hub_val_ds.cache().prefetch(buffer_size=AUTOTUNE)
hub_test_ds = hub_test_ds.cache().prefetch(buffer_size=AUTOTUNE)

In [10]:
import keras 
import numpy as np

keras.backend.clear_session()
tf.random.set_seed(42)
np.random.seed(42)

modelhub = keras.Sequential()
modelhub.add(hub_layer)
modelhub.add(tf.keras.layers.Dropout(rate=0.2))
modelhub.add(tf.keras.layers.Dense(64))
modelhub.add(keras.layers.Dropout(rate=0.2))
modelhub.add(keras.layers.Dense(4,activation="softmax"))

modelhub.compile(
    loss="sparse_categorical_crossentropy",
    optimizer='adam',
    metrics=['accuracy'])

early_stopping_cb = keras.callbacks.EarlyStopping(patience=10,restore_best_weights=True)

callbacks = [early_stopping_cb]

#训练时候再打批次

history = modelhub.fit(hub_train_ds.shuffle(10000).batch(32),
                    epochs=100,
                    validation_data=hub_val_ds.batch(128),
                    callbacks=callbacks)

Epoch 1/100


2024-05-11 18:50:01.996578: I external/local_xla/xla/service/service.cc:168] XLA service 0x559a43956c30 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-05-11 18:50:01.996619: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 3070 Ti, Compute Capability 8.6
2024-05-11 18:50:02.126871: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-05-11 18:50:03.273380: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8907
I0000 00:00:1715424603.624034     204 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100


In [11]:
#评估预测也要额外打批
modelhub.evaluate(hub_test_ds.batch(512))



[0.42287933826446533, 0.8393750190734863]

In [14]:
np.argmax(modelhub.predict(hub_test_ds.batch(512)),axis=1)



array([0, 1, 3, ..., 0, 0, 0])