# Stack Overflow problem

In [1]:
from tensorflow.keras import utils
import pathlib

# Archive holding the Stack Overflow question dataset used by this notebook.
data_url = 'https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz'

# Fetch and untar the archive through Keras, then keep the directory that
# contains the path Keras hands back.
dataset_dir = pathlib.Path(
    utils.get_file(
        origin=data_url,
        untar=True,
        cache_dir='stack_overflow',
        cache_subdir='',
    )
).parent

In [3]:
# Note the folder layout: only `train` and `test` exist, so the validation
# set has to be split off from the training data.
train_dir = dataset_dir.joinpath('train')
test_dir = dataset_dir.joinpath('test')
[entry for entry in dataset_dir.iterdir()]

[WindowsPath('/tmp/.keras/README.md'),
 WindowsPath('/tmp/.keras/stack_overflow_16k.tar.gz'),
 WindowsPath('/tmp/.keras/test'),
 WindowsPath('/tmp/.keras/train')]

In [4]:
batch_size, seed = 32, 42

# Split the training folder into a training subset and a validation subset.
# Keep the seed identical for both calls (or pass shuffle=False to both) so
# the two subsets are drawn from the same ordering of the files.
raw_train_ds = utils.text_dataset_from_directory(
    train_dir,
    subset='training',
    validation_split=0.2,
    seed=seed,
    batch_size=batch_size,
)

raw_val_ds = utils.text_dataset_from_directory(
    train_dir,
    subset='validation',
    validation_split=0.2,
    seed=seed,
    batch_size=batch_size,
)

# The test folder is loaded whole; no split is requested here.
raw_test_ds = utils.text_dataset_from_directory(
    test_dir,
    seed=seed,
    batch_size=batch_size,
)

Found 8000 files belonging to 4 classes.
Using 6400 files for training.
Found 8000 files belonging to 4 classes.
Using 1600 files for validation.
Found 8000 files belonging to 4 classes.


In [16]:
# Show which integer label the dataset assigned to each class name.
for i, label in enumerate(raw_train_ds.class_names):
    print("Label", i, "corresponds to", label)

# Derive the class count from the dataset instead of hard-coding 4, so the
# size of the model's output layer always matches the classes that were found.
num_labels = len(raw_train_ds.class_names)

Label 0 corresponds to csharp
Label 1 corresponds to java
Label 2 corresponds to javascript
Label 3 corresponds to python


In [9]:
# Standardize, tokenize and vectorize the data.
# The text is lowercased and split on whitespace (no custom standardize/split
# is passed, so the layer's defaults apply); the output mode is 'int'.

from tensorflow.keras.layers import TextVectorization

VOCAB_SIZE = 10_000  # vocabulary size
MAX_SEQUENCE_LENGTH = 250  # maximum sequence length

int_vectorize_layer = TextVectorization(
    output_mode='int',
    output_sequence_length=MAX_SEQUENCE_LENGTH,
    max_tokens=VOCAB_SIZE,
)

In [None]:
# Call TextVectorization.adapt to fit the preprocessing layer's state to the
# dataset; this makes the layer build its string-to-integer index.
# Only the text half of each (text, label) element is fed to adapt.
train_text = raw_train_ds.map(lambda features, _: features)
int_vectorize_layer.adapt(train_text)

In [28]:
import tensorflow as tf

def int_vectorize_text(text, label):
    """Vectorize `text` with `int_vectorize_layer` and pass `label` through.

    A trailing axis is appended to `text` before it is handed to the layer.

    Returns:
        A `(vectorized_text, label)` tuple; `label` is returned unchanged.
    """
    return int_vectorize_layer(tf.expand_dims(text, axis=-1)), label

# Apply the same vectorization step to the train, validation and test splits.
int_train_ds, int_val_ds, int_test_ds = [
    split.map(int_vectorize_text)
    for split in (raw_train_ds, raw_val_ds, raw_test_ds)
]

In [30]:
import keras  

# Small 1-D convolutional text classifier: embedding -> strided convolution ->
# global max pooling -> dense output. The output layer is given no activation.
model = keras.Sequential([
    keras.layers.Embedding(input_dim=VOCAB_SIZE, output_dim=64, mask_zero=True),
    keras.layers.Conv1D(
        filters=64,
        kernel_size=5,
        strides=2,
        padding="valid",
        activation="relu",
    ),
    keras.layers.GlobalMaxPooling1D(),
    keras.layers.Dense(units=num_labels),
])

In [31]:
# The model's final Dense layer has no activation, so it emits raw logits.
# The string alias "sparse_categorical_crossentropy" builds the loss with
# from_logits=False and would treat those logits as probabilities, so the
# loss object is created explicitly with from_logits=True instead.
model.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=['accuracy'])

# Train for 5 epochs, evaluating on the validation split after each epoch.
history = model.fit(int_train_ds, validation_data=int_val_ds, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
