In [None]:
!pip install transformers
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel


In [None]:
# Checkpoint name shared by the tokenizer and the TensorFlow encoder
bert_model_name = 'aubmindlab/bert-base-arabertv2'

# Fetch the vocabulary and the pre-trained weights for that checkpoint
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path=bert_model_name)
bert_model = TFBertModel.from_pretrained(
    pretrained_model_name_or_path=bert_model_name)


In [None]:
!pip install arabert
from arabert.preprocess import ArabertPreprocessor

arabert_prep = ArabertPreprocessor(model_name=bert_model_name)
input_text = " لاشتري فلوس مني والله اننو حقيقي"

# Tokenize input text
tokens = tokenizer(input_text, padding='max_length',
                   truncation=True, return_tensors='tf')
res = arabert_prep.preprocess(input_text)
print(tokens)

In [None]:
# Show the concrete class of the loaded encoder object
print(type(bert_model))

In [None]:
# Forward pass on the sample encoding; every entry of `tokens` is passed as a
# keyword argument and only the `last_hidden_state` attribute is kept.
# Note: the name `bert_output` is rebound later when the classifier is built.
bert_output = bert_model(**tokens).last_hidden_state
print(bert_output)


In [None]:
# Shape of the encoded ids for the sample sentence
tokens['input_ids'].shape


In [None]:
num_classes = 2

# Freeze the pre-trained encoder so only the new classification head is
# trained. This has to be set before compile() for it to take effect.
bert_model.trainable = False

# Define inputs: one fixed-length sequence of token ids per example
input_ids = tf.keras.layers.Input(
    shape=(tokenizer.model_max_length,), dtype=tf.int32)

# Every sequence is padded to the maximum length, so derive the padding mask
# from the ids themselves (the model keeps a single input). Without it the
# encoder attends to pad tokens as if they were real text.
is_real_token = tf.not_equal(input_ids, tokenizer.pad_token_id)
attention_mask = tf.cast(is_real_token, tf.int32)

# BERT layer
bert_output = bert_model(
    input_ids, attention_mask=attention_mask).last_hidden_state

# Classification head: mean over the real (non-pad) tokens -> dense -> softmax
outputs = tf.keras.layers.GlobalAveragePooling1D()(
    bert_output, mask=is_real_token)
outputs = tf.keras.layers.Dense(units=128, activation='relu')(outputs)
outputs = tf.keras.layers.Dense(
    units=num_classes, activation='softmax')(outputs)

# Full model
model = tf.keras.Model(inputs=input_ids, outputs=outputs)

# Compile the model; targets are expected to be one-hot vectors of length num_classes
model.compile(optimizer='adam', loss='categorical_crossentropy',
              metrics=['accuracy'])

# Print model summary
model.summary()



In [None]:
# Maximum sequence length reported by the tokenizer; used as the model's input length
print(tokenizer.model_max_length)


In [None]:
import pandas as pd
import numpy as np

In [None]:
# Location of the remote CSV file to load
DATA_URL = 'https://raw.githubusercontent.com/amrm3lm/cyborg-devs/main/combined_cleaned_data.csv'

# Read it into a DataFrame and preview the first rows
ds = pd.read_csv(DATA_URL)
ds.head()

In [None]:
# Keep only the text and label columns. `.copy()` yields an independent frame,
# so the column assignments performed in later cells do not raise
# SettingWithCopyWarning.
ds = ds[['clean_text', 'spam']].copy()
ds.head()

In [None]:
import ast

# Each cell is expected to hold the string form of a Python list of tokens
# (TODO confirm against the CSV). The file is fetched from a remote URL, so it
# is parsed with ast.literal_eval, which only accepts literals, instead of
# eval, which would execute arbitrary code.
# Values that are not strings are passed through unchanged so that re-running
# this cell after the column has been parsed does not fail.
ds['clean_text'] = ds['clean_text'].map(
    lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell)

In [None]:
def _encode_tokens(words):
    """Join a list of tokens with single spaces and run the tokenizer on the result."""
    return tokenizer(" ".join(words), padding='max_length',
                     truncation=True, return_tensors='tf')


# Replace every token list with its padded/truncated encoding
ds['clean_text'] = ds['clean_text'].map(_encode_tokens)



In [None]:
# Pull the token ids out of every encoding and stack them into one array
clean_np = np.array(ds['clean_text'].map(lambda x: np.array(x.input_ids)).to_list())
# Flatten to one row of ids per example. The row count comes from the data
# itself instead of a hard-coded dataset size, so the cell keeps working when
# the CSV changes or is subsampled.
clean_np = clean_np.reshape((len(clean_np), -1))
print(clean_np.shape)

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

# Hold out 20% of the examples for the final evaluation (fixed seed for repeatability)
train_data, test_data, train_labels, test_labels = train_test_split(
    clean_np, ds['spam'], test_size=0.2, random_state=42)

# The model is compiled with categorical cross-entropy and takes batched
# input, so the evaluation set needs one-hot labels (as the training targets
# have) and an explicit batch dimension; otherwise model.evaluate() fails.
eval_batch_size = 8
test_dataset = tf.data.Dataset.from_tensor_slices((
    tf.convert_to_tensor(test_data),
    tf.one_hot(tf.convert_to_tensor(test_labels), depth=num_classes),
)).batch(eval_batch_size)


In [None]:
# Training inputs: token ids as a tensor
x = tf.convert_to_tensor(value=train_data)

# Training targets: one-hot rows with num_classes columns
y = tf.one_hot(
    indices=tf.convert_to_tensor(value=train_labels),
    depth=num_classes,
)

print(x.shape)

In [None]:
epochs = 10

# Stop training once the validation loss stops improving. The callback only
# has an effect when it is handed to fit(), so keep a reference to it.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=1,
    verbose=0,
    mode='auto',
    baseline=None,
    restore_best_weights=False,
    start_from_epoch=3
)

# Train the classification head; 20% of the training data is held out for validation
history = model.fit(
    x=x,
    y=y,
    validation_split=0.2,
    epochs=epochs,
    verbose=1,
    batch_size=2,
    callbacks=[early_stopping])


In [None]:
# Score the held-out set; the result unpacks into the loss followed by the
# single metric configured at compile time
evaluation = model.evaluate(test_dataset)
loss, accuracy = evaluation

print("Loss: ", loss)
print("Accuracy: ", accuracy)


In [None]:
# Persist the trained classifier under the path 'arabertv1_0'
model.save('arabertv1_0')


In [None]:
# NOTE: the separate `tensorflow-gpu` package on PyPI is discontinued and its
# installer exits with an error; GPU support ships in the regular `tensorflow`
# package, which this notebook has already imported. Installing anything at
# this point would also not affect the running kernel, so no install is done.

# List every physical device TensorFlow can see (no device-type filter)
tf.config.list_physical_devices(
    device_type=None
)


In [None]:
%tensorflow_version 2.x

import tensorflow as tf
print(tf.__version__)
print(tf.test.gpu_device_name())
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))