In [20]:
import pandas as pd

# Contoh dataframe

df = pd.read_csv('cleaned-text.csv')
df['label'] = df['label'].astype(int)
df.dtypes

label            int32
cleaned_text    object
dtype: object

In [23]:
# Preprocess text
import re 
# Preprocess text
def preprocess_text(text):
    if isinstance(text, str):  # Check if text is a string
        # Lowercase text
        text = text.lower()
        # Remove special characters
        text = re.sub(r'[^a-zA-Z\s]', '', text)
    else:
        text = str(text)  # Convert non-string values to string
    return text

df['cleaned_text'] = df['cleaned_text'].apply(preprocess_text)

# Remove rows with empty or NaN text
df = df.dropna(subset=['cleaned_text'])

============================

In [27]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from tensorflow.keras.layers import Input, Dense, Dropout, GlobalMaxPooling1D
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from transformers import TFBertForSequenceClassification

# Split dataset into train, validation, and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(df['cleaned_text'], df['label'], test_size=0.2, random_state=42)
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.1, random_state=42)

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize input texts
train_encodings = tokenizer(train_texts.tolist(), padding=True, truncation=True, max_length=512)
val_encodings = tokenizer(val_texts.tolist(), padding=True, truncation=True, max_length=512)
test_encodings = tokenizer(test_texts.tolist(), padding=True, truncation=True, max_length=512)

# Convert encodings to TensorFlow Dataset
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels
)).shuffle(len(train_texts)).batch(16)

val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    val_labels
)).batch(16)

test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    test_labels
)).batch(16)



In [45]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from tensorflow.keras.layers import Input, Dense, Dropout, GlobalMaxPooling1D, BatchNormalization
from tensorflow.keras.regularizers import l2
from sklearn.model_selection import train_test_split

model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
bert_model = TFBertForSequenceClassification.from_pretrained(model_name)


# Define feature extraction layer
input_ids = Input(shape=(512,), dtype=tf.int32)
outputs = bert_model(input_ids)[1]  # Mengambil output dari lapisan pooling
pooled_output = GlobalMaxPooling1D()(outputs)  # Pooling layer


# Add dense layers with dropout and batch normalization
dense_layer1 = Dense(512, activation='relu', kernel_regularizer=l2(0.001))(pooled_output)
dropout1 = Dropout(0.5)(dense_layer1)
batch_norm1 = BatchNormalization()(dropout1)
dense_layer2 = Dense(256, activation='relu', kernel_regularizer=l2(0.001))(batch_norm1)
dropout2 = Dropout(0.5)(dense_layer2)
batch_norm2 = BatchNormalization()(dropout2)


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TypeError: Exception encountered when calling layer 'embeddings' (type TFBertEmbeddings).

Could not build a TypeSpec for name: "tf.debugging.assert_less_10/assert_less/Assert/Assert"
op: "Assert"
input: "tf.debugging.assert_less_10/assert_less/All"
input: "tf.debugging.assert_less_10/assert_less/Assert/Assert/data_0"
input: "tf.debugging.assert_less_10/assert_less/Assert/Assert/data_1"
input: "tf.debugging.assert_less_10/assert_less/Assert/Assert/data_2"
input: "Placeholder"
input: "tf.debugging.assert_less_10/assert_less/Assert/Assert/data_4"
input: "tf.debugging.assert_less_10/assert_less/y"
attr {
  key: "T"
  value {
    list {
      type: DT_STRING
      type: DT_STRING
      type: DT_STRING
      type: DT_INT32
      type: DT_STRING
      type: DT_INT32
    }
  }
}
attr {
  key: "summarize"
  value {
    i: 3
  }
}
 of unsupported type <class 'tensorflow.python.framework.ops.Operation'>.

Call arguments received by layer 'embeddings' (type TFBertEmbeddings):
  • input_ids=<KerasTensor: shape=(None, 512) dtype=int32 (created by layer 'input_12')>
  • position_ids=None
  • token_type_ids=<KerasTensor: shape=(None, 512) dtype=int32 (created by layer 'tf.fill_21')>
  • inputs_embeds=None
  • past_key_values_length=0
  • training=False

In [None]:
# Output layer
output_layer = Dense(3, activation='softmax')(batch_norm2)  # 3 classes: anxiety, depression, lonely

# Build the model
model = tf.keras.Model(inputs=input_ids, outputs=output_layer)

# Compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
model.compile(optimizer=optimizer,
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Define callback to print epoch progress
class PrintEpochProgress(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        print(f'Epoch {epoch+1}/{num_epochs}, loss: {logs["loss"]}, accuracy: {logs["accuracy"]}, val_loss: {logs["val_loss"]}, val_accuracy: {logs["val_accuracy"]}')

# Train the model with epoch progress callback
num_epochs = 3
history = model.fit(train_dataset, epochs=num_epochs, validation_data=val_dataset, callbacks=[PrintEpochProgress()])

# Evaluate the model
test_loss, test_accuracy = model.evaluate(test_dataset)
print('Test accuracy:', test_accuracy)