# Dependencies

In [82]:
import tensorflow as tf
import os
import pandas as pd
import numpy as np
from tensorflow.keras.layers import TextVectorization

# Data Prepare

In [12]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train.shape

(159571, 8)

In [70]:
X = train['comment_text'].values
y = train[train.columns[2:]].values

In [73]:
X.shape, y.shape

((159571,), (159571, 6))

In [75]:
max_feature = 500000
max_len = 1800
tokenizer = TextVectorization(
    max_tokens=max_feature,
    output_mode='int',
    output_sequence_length=max_len
)
tokenizer.adapt(X)
vec_text = tokenizer(X)

### Data Science Pipeline

In [81]:
dataset = tf.data.Dataset.from_tensor_slices((vec_text, y))
dataset = dataset.cache()
dataset = dataset.shuffle(16000)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8)

In [86]:
batch_X, batch_y = dataset.as_numpy_iterator().next()

2023-08-03 00:45:20.704711: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [89]:
train = dataset.take(int(len(dataset)*.7))
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))

# Define Model

In [96]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [97]:
model = Sequential()
# Embedding layer
model.add(Embedding(max_feature+1, 32))

# Bidirectional layer
model.add(Bidirectional(LSTM(32, activation='tanh')))

# hidden layer
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))

# output layer
model.add(Dense(6, activation='sigmoid'))

In [102]:
model.compile(loss='BinaryCrossentropy', optimizer='Adam')
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          16000032  
                                                                 
 bidirectional (Bidirection  (None, 64)                16640     
 al)                                                             
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 6)                 774       
                                                      

In [105]:
model.fit(train, epochs=1, validation_data=val)



2023-08-03 01:44:55.724138: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.




<keras.src.callbacks.History at 0x2d1beca90>

# Predict & Evaluate

In [117]:
input_x = tokenizer('I love you!')
model.predict(np.expand_dims(input_x, 0))



array([[0.09935369, 0.0005275 , 0.01238311, 0.00475271, 0.02210822,
        0.00630664]], dtype=float32)

In [118]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [120]:
pre = Precision()
re = Recall()
acc = CategoricalAccuracy()

In [123]:
for batch in test.as_numpy_iterator():
    x_true, y_true = batch

    yhat = model.predict(x_true)

    y_true = y_true.flatten()
    yhat = yhat.flatten()

    pre.update_state(y_true, yhat)
    re.update_state(y_true, yhat)
    acc.update_state(y_true, yhat)



In [124]:
print(f"Precision:{pre.result().numpy()}, Accuracy:{acc.result().numpy()}")

Precision:0.7955056428909302, Accuracy:0.4745591878890991
