In [None]:
!pip install -q tf-nightly

[K     |████████████████████████████████| 323.0MB 27kB/s 
[K     |████████████████████████████████| 6.8MB 46.7MB/s 
[K     |████████████████████████████████| 460kB 46.9MB/s 
[?25h

In [None]:
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import os
import re
import shutil
import string

import tensorflow as tf

from tensorflow.keras import layers, models
from tensorflow.keras import preprocessing
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# Report which TensorFlow build the kernel actually picked up.
print(f"Tensorflow version: {tf.__version__}")

Tensorflow version: 2.4.0-dev20200720


In [None]:
# Download and unpack the Stack Overflow question archive next to the notebook.
# The archive is fetched over HTTPS so the extracted payload cannot be tampered
# with in transit.
url = 'https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz'

dataset = tf.keras.utils.get_file('stack_overflow_16k.tar.gz', url,
                                  untar = True, cache_dir = '.',
                                  cache_subdir = '')

# Gather the extracted `train` and `test` folders under one dataset directory.
# Pure-Python file operations (instead of `!mkdir` / `!mv`) keep the cell
# portable and let it be re-run without failing once the folders are in place.
download_dir = os.path.dirname(dataset)
dataset_dir = os.path.join(download_dir, 'stack_overflow_16k')
os.makedirs(dataset_dir, exist_ok = True)
for split in ('train', 'test'):
  extracted = os.path.join(download_dir, split)
  target = os.path.join(dataset_dir, split)
  # Move only a freshly extracted split that has not been collected yet.
  if os.path.isdir(extracted) and not os.path.exists(target):
    shutil.move(extracted, target)

In [None]:
# Read and pretty-print one randomly chosen question from the training set.
import pprint
pp = pprint.PrettyPrinter(width = 60, compact = True)
train_path = os.path.join(dataset_dir, 'train')
# Pick a random entry of the training folder, then a random file inside it.
random_lang_train = np.random.choice(os.listdir(train_path))
random_lang_path = os.path.join(train_path, random_lang_train)
random_question = np.random.choice(os.listdir(random_lang_path))
random_question_path = os.path.join(random_lang_path, random_question)
print('Random sample at:', random_question_path)
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default text encoding.
with open(random_question_path, 'r', encoding = 'utf-8') as f:
  pp.pprint(f.read())

Random sample at: ./stack_overflow_16k/train/csharp/1359.txt
('"blank simple update query not working sqlconnection '
 'conn = new sqlconnection(@""data '
 'source=saisqlexpress;initial catalog=testing;integrated '
 'security=true;pooling=false"");..conn.open();.sqlcommand '
 'command = new sqlcommand();.string test = ""update '
 'attend year=\'2014\' where id = \'2\'"";.command = new '
 'sqlcommand(test, '
 'conn);.command.executenonquery();.conn.close();...year '
 'and id are both varchar. error is:...  incorrect syntax '
 'near \'year\'."\n')


In [None]:
# Shared settings: the same seed and split fraction must be used for the
# training and validation subsets so that they do not overlap.
seed = 42
batch_size = 32

# Training subset: 80% of the files under `train_path`.
raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    train_path,
    batch_size = batch_size,
    validation_split = 0.2,
    subset = 'training',
    seed = seed,
)

Found 8000 files belonging to 4 classes.
Using 6400 files for training.


In [None]:
# Show the first three questions of one training batch with their class names.
for text_batch, label_batch in raw_train_ds.take(1):
  questions = text_batch.numpy()
  label_ids = label_batch.numpy()
  for i in range(3):
    print('Question:', questions[i])
    # Labels are integer indices into the dataset's `class_names` list.
    print('Label:', raw_train_ds.class_names[label_ids[i]])


Question: b'"my tester is going to the wrong constructor i am new to programming so if i ask a question that can be easily fixed, please forgive me. my program has a tester class with a main. when i send that to my regularpolygon class, it sends it to the wrong constructor. i have two constructors. 1 without perameters..public regularpolygon().    {.       mynumsides = 5;.       mysidelength = 30;.    }//end default constructor...and my second, with perameters. ..public regularpolygon(int numsides, double sidelength).    {.        mynumsides = numsides;.        mysidelength = sidelength;.    }// end constructor...in my tester class i have these two lines:..regularpolygon shape = new regularpolygon(numsides, sidelength);.        shape.menu();...numsides and sidelength were declared and initialized earlier in the testing class...so what i want to happen, is the tester class sends numsides and sidelength to the second constructor and use it in that class. but it only uses the default cons

In [None]:
# The dataset ships without a validation split, so hold out the remaining 20%
# of the training folder (same seed and fraction as the training subset).
raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    train_path,
    batch_size = batch_size,
    validation_split = 0.2,
    subset = 'validation',
    seed = seed,
)

# The test set is loaded in full from its own folder.
test_path = os.path.join(dataset_dir, 'test')
raw_test_ds = tf.keras.preprocessing.text_dataset_from_directory(
    test_path,
    batch_size = batch_size,
)

Found 8000 files belonging to 4 classes.
Using 1600 files for validation.
Found 8000 files belonging to 4 classes.


In [None]:
# Vocabulary size and fixed length of every vectorized question.
max_features = 15000
sequence_length = 500

# Maps raw strings to integer token-id sequences of length `sequence_length`.
vectorize_layer = TextVectorization(
    max_tokens = max_features,
    output_mode = 'int',
    output_sequence_length = sequence_length,
)

# Build the vocabulary from the training text only (labels are dropped).
train_text = raw_train_ds.map(lambda text, label: text)
vectorize_layer.adapt(train_text)

In [None]:
def vectorize_text(seq,label):
  """Apply `vectorize_layer` to `seq` and pass `label` through unchanged.

  A trailing axis is added to `seq` with `tf.expand_dims` before it is fed
  to the vectorization layer. Returns a `(vectorized_text, label)` pair.
  """
  return vectorize_layer(tf.expand_dims(seq, -1)), label

In [None]:
# Sanity check: show one raw question, its label, and its vectorized form.
text_batch, label_batch = next(iter(raw_train_ds))
first_question = text_batch[0]
first_label = label_batch[0]
print('Question:', first_question)
print('Label:', first_label)
print('Vectorized question', vectorize_text(first_question, first_label))

In [None]:
# Vectorize every split with the same layer.
train_ds, val_ds, test_ds = (
    raw_ds.map(vectorize_text)
    for raw_ds in (raw_train_ds, raw_val_ds, raw_test_ds)
)

In [None]:
# Cache each vectorized split and prefetch batches, letting tf.data choose the
# prefetch buffer size.
AUTOTUNE = tf.data.experimental.AUTOTUNE
train_ds = train_ds.cache().prefetch(buffer_size = AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size = AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size = AUTOTUNE)

In [None]:
# Build the classifier: embedding -> average pooling -> small dense head.
embedding_dims = 24
layer_stack = [
  layers.Embedding(max_features + 1, embedding_dims),
  layers.Dropout(0.25),
  # Average over the sequence axis to get one vector per question.
  layers.GlobalAveragePooling1D(),
  layers.Dropout(0.25),
  layers.Dense(64, activation = 'relu'),
  # Four output units with softmax: one probability per class.
  layers.Dense(4, activation = 'softmax'),
]
model = models.Sequential(layer_stack)
model.summary()

# Labels are integer class ids, hence the sparse categorical loss.
model.compile(
    optimizer = 'adam',
    loss = 'sparse_categorical_crossentropy',
    metrics = ['accuracy'],
)

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, None, 24)          360024    
_________________________________________________________________
dropout_16 (Dropout)         (None, None, 24)          0         
_________________________________________________________________
global_average_pooling1d_6 ( (None, 24)                0         
_________________________________________________________________
dropout_17 (Dropout)         (None, 24)                0         
_________________________________________________________________
dense_11 (Dense)             (None, 64)                1600      
_________________________________________________________________
dense_12 (Dense)             (None, 4)                 260       
Total params: 361,884
Trainable params: 361,884
Non-trainable params: 0
________________________________________________

In [None]:
# Train the model and keep the per-epoch metrics history in `h` for plotting.
epochs = 10
h = model.fit(
    train_ds,
    validation_data = val_ds,
    epochs = epochs,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Plot training vs. validation curves per epoch: first losses, then accuracies.
fig = go.Figure()
x = list(range(1, epochs + 1))

# (chart title, y-axis title, training key, training name, validation key, validation name)
chart_specs = [
  ('Losses', 'Loss', 'loss', 'training loss', 'val_loss', 'validation loss'),
  ('Accuracies', 'Accuracy', 'accuracy', 'training accuracy',
   'val_accuracy', 'validation accuracy'),
]

for chart_title, y_title, train_key, train_name, val_key, val_name in chart_specs:
  # The same figure is reused, so drop the traces left by the previous chart.
  if fig.data:
    fig.data = []
  fig.update_layout(title = chart_title, yaxis_title = y_title, xaxis_title = 'Epoch')
  fig.add_trace(go.Scatter(x = x, y = h.history[train_key], name = train_name))
  fig.add_trace(go.Scatter(x = x, y = h.history[val_key], name = val_name))
  fig.show()