In [1]:
##Exercise: multi-class classification on Stack Overflow questions
import matplotlib.pyplot as plt
import os
import re
import shutil
import string
import tensorflow as tf

from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization


In [None]:
# !rm -rf *
!ls

In [4]:
#get data
url = "https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz"

dataset = tf.keras.utils.get_file(
    "stack_overflow_16k",
    url,
    untar=True,
    cache_dir=".",
    cache_subdir="."
  )

dataset_dir = os.path.join(os.path.dirname(dataset), 'stack_overflow')
dataset_dir

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz


'././stack_overflow'

In [6]:
!mkdir -p stack_overflow/
!mv test stack_overflow/
!mv train stack_overflow/

In [7]:
!ls

README.md  sample_data	stack_overflow	stack_overflow_16k.tar.gz


In [8]:
os.listdir(dataset_dir)

['test', 'train']

In [9]:
train_dir = os.path.join(dataset_dir, "train")
test_dir = os.path.join(dataset_dir, "test")

In [10]:
os.listdir(train_dir)

['python', 'javascript', 'java', 'csharp']

In [11]:
####################TRAINING DATA##############################
sample_train_data1 = os.path.join(train_dir, "python/0.txt")
sample_train_data2 = os.path.join(train_dir, "java/10.txt")
with open (sample_train_data1) as f:
  print(f.read())
with open (sample_train_data2) as f:
  print(f.read())
####################TEST DATA##############################
sample_train_data1 = os.path.join(test_dir, "python/0.txt")
sample_train_data2 = os.path.join(test_dir, "java/10.txt")
with open (sample_train_data1) as f:
  print(f.read())
with open (sample_train_data2) as f:
  print(f.read())   

"is it legal to define two methods with the same name but different returning types? i've written a piece of code to determine a typical palindrome string. i did this by the definition of a reverse() method returning a string. i also eager to have the same method, but in the void form, because of some future needs..as i add the latter to the code, the valid output will become invalid..so, the question is that is it legal to define two methods with the same name but different returning types?.if not, please let me know how to write this code with the void-type method...class detector(object):.    def __init__(self,string):.        self.string = string..    forbidden = (' ','!','?','.','-','_','&amp;','%',""#"","","")..    def eliminator(self):.        for item in self.forbidden:.            if item in self.string:.                self.string = self.string.replace(item,"""")..    def reverse(self):.        return self.string[::-1]            ..    #def reverse(self):.    #    self.string

In [15]:
#load dataset to create raw_training_data
batch_size = 32
seed = 42
raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    "stack_overflow/train",
    batch_size=batch_size,
    validation_split=0.2,
    subset="training", #must be training/validation
    seed=seed
)

Found 8000 files belonging to 4 classes.
Using 6400 files for training.


In [16]:
#classes in raw_train_data
print("Label 0 -> ", raw_train_ds.class_names[0])
print("Label 1 -> ", raw_train_ds.class_names[1])
print("Label 2 -> ", raw_train_ds.class_names[2])
print("Label 3 -> ", raw_train_ds.class_names[3])

Label 0 ->  csharp
Label 1 ->  java
Label 2 ->  javascript
Label 3 ->  python


In [17]:
#create validation set

raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    "stack_overflow/train",
    batch_size=batch_size,
    validation_split=0.2,
    subset="validation",
    seed=seed
)

Found 8000 files belonging to 4 classes.
Using 1600 files for validation.


In [18]:
raw_test_ds = tf.keras.preprocessing.text_dataset_from_directory(
    "stack_overflow/test",
    batch_size=batch_size
)

Found 8000 files belonging to 4 classes.


In [19]:
def custom_standardization(input_data):
  lowercase = tf.strings.lower(input_data)
  stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
  return tf.strings.regex_replace(stripped_html,
                                  '[%s]' % re.escape(string.punctuation),
                                  '')

In [20]:
#Standardize,Tokenize and Vectorize the data

max_features = 10000
sequence_length = 250

vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)

In [21]:
#adapt state of preprocess layer to dataset to build index of strings to integers
train_text = raw_test_ds.map(lambda x, y: x)
vectorize_layer.adapt(train_text)

In [22]:
def vectorize_text(text, label):
  text = tf.expand_dims(text, -1)
  return vectorize_layer(text), label

In [24]:
# retrieve a batch (of 32 questions and labels) from the dataset
text_batch, label_batch = next(iter(raw_train_ds))
first_question, first_label = text_batch[0], label_batch[0]
print("Question", first_question)
print("Label", raw_test_ds.class_names[first_label])
print("Vectorized Question", vectorize_text(first_question, first_label))

Question tf.Tensor(b'"blank8 why is my solution faster than the neat solution? (hackerrank chocolate feast) edit: simplified my solution..edit: removed opinion based secondary question...background: atarted learning blank a week or two ago using hackerranks problems as exercises and stackoverflow search + google as my teacher, i\'ve had some limited experience learning other languages...i did the exercise my own ""noobish learner way"" which i can\'t help but feel is a ""botched job"" when i see ""neat &amp; short"" solutions...however, when submitting both solutions one after another a couple of times i found the ""neat"" solution was quite a bit slower. ..i vaguely remember something about % operations being costly, is mine faster because of no % operations or is there more to it than just that?..exercise: https://www.hackerrank.com/challenges/chocolate-feast..neat solution from discussion:..import blank.io.*;.import blank.util.*;..public class solution {.    static int cc; .    publ

In [25]:
print("31 ---> ",vectorize_layer.get_vocabulary()[31])
print(" 50 ---> ",vectorize_layer.get_vocabulary()[50])
print('Vocabulary size: {}'.format(len(vectorize_layer.get_vocabulary())))

31 --->  be
 50 --->  so
Vocabulary size: 10000


In [26]:
train_ds = raw_train_ds.map(vectorize_text)
val_ds = raw_val_ds.map(vectorize_text)
test_ds = raw_test_ds.map(vectorize_text)

In [27]:
AUTOTUNE = tf.data.AUTOTUNE

train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

In [49]:
#create model
vocab_sz = max_features + 1 #index 0 is used for padding
output_labels = 4 #CSHARP,JAVA,JAVASCRIPT,PYTHON
embedding_dim = 64

model = tf.keras.Sequential([
  layers.Embedding(vocab_sz, embedding_dim, mask_zero=True),
  # tf.keras.layers.Dropout(0.2),
  layers.Conv1D(64, 5, padding="valid", activation="relu"),
  layers.GlobalMaxPooling1D(),
  # tf.keras.layers.Dropout(0.2),
  layers.Dense(output_labels)
  ])

model.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, None, 64)          640064    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, None, 64)          20544     
_________________________________________________________________
global_max_pooling1d_6 (Glob (None, 64)                0         
_________________________________________________________________
dense_9 (Dense)              (None, 4)                 260       
Total params: 660,868
Trainable params: 660,868
Non-trainable params: 0
_________________________________________________________________


In [50]:
#compile
# model.compile(
#     loss = SparseCategoricalCrossentropy(from_logits=True),
#     optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
#     metrics=["accuracy"]
# )

model.compile(loss=losses.SparseCategoricalCrossentropy(),
              optimizer='adam',
              metrics=tf.metrics.Accuracy(name="accuracy")
)

In [51]:
# epochs = 5
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=5
    )


Epoch 1/5


ValueError: ignored