# Transfer learning in NLP (Transfomers) Encoder
# bert-base-uncased


 bert-base-uncased	110M	English

In [7]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# Load the tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = TFBertModel.from_pretrained("bert-base-uncased")

# Define function to create embeddings
def bert_embeddings(texts, max_length=32):
    inputs = tokenizer(
        texts.tolist(),
        return_tensors="tf",
        padding=True,
        truncation=True,
        max_length=max_length
    )
    outputs = bert_model(inputs['input_ids'], attention_mask=inputs['attention_mask'])
    cls_embeddings = outputs.last_hidden_state[:, 0, :]  # CLS token's embedding
    return cls_embeddings

# Load your dataset
file_path = "https://raw.githubusercontent.com/alexvatti/full-stack-data-science/main/NLP-Exercises/Email-Spam-Classification/spam.csv"
email_df = pd.read_csv(file_path)
X = email_df['Message']  # Column with email text
y = email_df['Category'].apply(lambda x: 1 if x == 'spam' else 0)  # Label encoding for spam/ham

# Split dataset into training and test sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert emails to BERT embeddings
X_train_embeddings = bert_embeddings(X_train)
X_test_embeddings = bert_embeddings(X_test)

# Define a simple classifier model
classifier = Sequential([
    Dense(128, activation='relu', input_shape=(768,)),
    Dense(1, activation='sigmoid')  # Sigmoid for binary classification
])

# Compile the classifier
classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the classifier
classifier.fit(X_train_embeddings, y_train, epochs=5, batch_size=32, validation_split=0.1)

# Evaluate on test set
test_loss, test_accuracy = classifier.evaluate(X_test_embeddings, y_test)
print(f"Test Accuracy: {test_accuracy}")

# Predictions and confusion matrix
y_pred = (classifier.predict(X_test_embeddings) > 0.5).astype("int32")
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("Confusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)

# Save the trained model to a file
classifier.save("spam_classification_model.h5")

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Epoch 1/5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.8990 - loss: 0.2445 - val_accuracy: 0.9664 - val_loss: 0.1067
Epoch 2/5
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9817 - loss: 0.0596 - val_accuracy: 0.9709 - val_loss: 0.1280
Epoch 3/5
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9864 - loss: 0.0436 - val_accuracy: 0.9709 - val_loss: 0.0968
Epoch 4/5
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9866 - loss: 0.0345 - val_accuracy: 0.9709 - val_loss: 0.0968
Epoch 5/5
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9906 - loss: 0.0300 - val_accuracy: 0.9709 - val_loss: 0.1046
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.9818 - loss: 0.0638
Test Accuracy: 0.9811659455299377
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 



Confusion Matrix:
[[957   9]
 [ 12 137]]

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       966
           1       0.94      0.92      0.93       149

    accuracy                           0.98      1115
   macro avg       0.96      0.96      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [8]:
from transformers import BertTokenizer
import tensorflow as tf
from tensorflow.keras.models import load_model



# Function to tokenize and encode the input text
# Load the tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = TFBertModel.from_pretrained("bert-base-uncased")

# Define function to create embeddings
def bert_embeddings(texts, max_length=32):
    inputs = tokenizer(
        texts,
        return_tensors="tf",
        padding=True,
        truncation=True,
        max_length=max_length
    )
    outputs = bert_model(inputs['input_ids'], attention_mask=inputs['attention_mask'])
    cls_embeddings = outputs.last_hidden_state[:, 0, :]  # CLS token's embedding
    return cls_embeddings

# Function to make predictions with the trained model
def predict_spam_or_ham(text, model, tokenizer):
    # Preprocess the input text
    cls_embeddings = bert_embeddings([text])
    print(cls_embeddings.shape)

    # Make a prediction (assuming binary classification: spam or ham)
    prediction = model.predict(cls_embeddings)

    # Assuming a binary classification output: 0 or 1
    predicted_class = "Spam" if prediction[0] > 0.5 else "Ham"
    return predicted_class

# Load the saved model
loaded_model = load_model("spam_classification_model.h5")

# Example text input to classify
text_input = "Congratulations! You've won a free gift card. Click here to claim."

# Test the saved model on the input text
result = predict_spam_or_ham(text_input, loaded_model, tokenizer)
print(f"Prediction: {result}")


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

(1, 768)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 143ms/step
Prediction: Spam
