<a href="https://colab.research.google.com/github/alzeem11/ASTD/blob/master/ML_MarBert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

ML + MARBERT

In [None]:
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModel
import numpy as np
import pandas as pd

# Load MARBERT Tokenizer and Model
# `tokenizer` and `marbert_model` are module-level objects: the embedding
# function defined further down reads them as globals instead of taking them
# as parameters, so they must be created before that function is called.
model_name = 'UBC-NLP/MARBERT'  # checkpoint identifier shared by both from_pretrained() calls
tokenizer = AutoTokenizer.from_pretrained(model_name)
marbert_model = TFAutoModel.from_pretrained(model_name)  # TensorFlow variant of the model

# Preprocessing Function
def preprocess_text(text):
    """Flatten a tweet onto one line and trim surrounding whitespace.

    Args:
        text: The raw tweet. Normally a ``str``; ``None`` and NaN are treated
            as an empty tweet, and any other non-string value (for example a
            numeric spreadsheet cell) is converted with ``str()`` first.

    Returns:
        The text with every newline replaced by a single space and leading
        and trailing whitespace removed.
    """
    # Missing cells can arrive as None or as a float NaN (NaN != NaN).
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        # A cell holding only digits may be parsed as a number, which has no
        # .replace(); coerce it instead of raising AttributeError.
        text = str(text)
    return text.replace("\n", " ").strip()

# Load and Preprocess Dataset
file_path = 'E_dataset1.xlsx'
sheet_name = 'A'
data = pd.read_excel(file_path, sheet_name=sheet_name)
# Missing tweets become empty strings before the text clean-up runs.
data['Tweet'] = data['Tweet'].fillna("").apply(preprocess_text)

# Series.map() turns every value that is not a key of the dict into NaN
# without warning. Fail here, before the expensive embedding step, instead of
# letting NaN labels be written to disk and break the classifiers later.
label_map = {'Positive': 1, 'Negative': 0}
mapped_labels = data['Labled'].map(label_map)
if mapped_labels.isna().any():
    unexpected = data.loc[mapped_labels.isna(), 'Labled'].unique().tolist()
    raise ValueError(
        f"Unexpected values in 'Labled' column (expected one of {list(label_map)}): {unexpected}"
    )
data['Labled'] = mapped_labels

# Generate Tweet-Level MARBERT Embeddings and Save to Disk
def generate_tweet_level_embeddings(texts, labels, batch_size=32, max_length=64, output_dir="/content/"):
    """Embed tweets with MARBERT, mean-pool per tweet, and save the result to disk.

    Reads the module-level ``tokenizer`` and ``marbert_model``. Writes
    ``tweet_embeddings.npy`` and ``labels.npy`` into ``output_dir`` and prints
    a confirmation message; nothing is returned.

    Args:
        texts: Sequence of tweet strings (a pandas Series or a plain list).
        labels: Sequence of labels aligned one-to-one with ``texts``.
        batch_size: Number of tweets sent through the model per forward pass.
        max_length: Token length every tweet is padded or truncated to.
        output_dir: Directory that receives the two ``.npy`` files. A trailing
            path separator is optional.

    Raises:
        ValueError: If ``texts`` is empty or its length differs from ``labels``.
    """
    import os  # local import so this block does not depend on the file's import cell

    # Materialise both inputs so slicing is purely positional and plain lists
    # work as well as pandas Series.
    texts = list(texts)
    labels = list(labels)
    if len(texts) != len(labels):
        raise ValueError(
            f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
        )
    if not texts:
        raise ValueError("texts is empty; there is nothing to embed")

    tweet_embeddings_list = []

    # Process data in batches
    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start + batch_size]

        # Tokenize with consistent padding
        tokenized = tokenizer(
            batch_texts,
            padding="max_length",  # Ensures all sequences are padded to `max_length`
            truncation=True,
            max_length=max_length,
            return_tensors="tf"
        )

        # Generate embeddings
        outputs = marbert_model(**tokenized)
        token_embeddings = outputs.last_hidden_state.numpy()  # Shape: (batch_size, max_length, hidden_size)

        # Padding positions also receive hidden states. Weight every position
        # by the attention mask so that only real tokens contribute to the
        # tweet-level mean; a plain mean over axis 1 would be dominated by
        # padding for short tweets.
        mask = tokenized["attention_mask"].numpy().astype(token_embeddings.dtype)[:, :, np.newaxis]
        token_counts = np.maximum(mask.sum(axis=1), 1.0)  # guard against division by zero
        tweet_embeddings_list.append((token_embeddings * mask).sum(axis=1) / token_counts)

    # Concatenate results
    tweet_embeddings = np.concatenate(tweet_embeddings_list, axis=0)  # Shape: (num_tweets, hidden_size)
    labels = np.array(labels)  # Shape: (num_tweets,)

    # Save embeddings and labels to .npy files. os.path.join works whether or
    # not output_dir ends with a separator.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    np.save(os.path.join(output_dir, "tweet_embeddings.npy"), tweet_embeddings)
    np.save(os.path.join(output_dir, "labels.npy"), labels)

    print(f"Tweet-level embeddings and labels saved to {output_dir}")

# Generate and save embeddings
# Writes tweet_embeddings.npy and labels.npy under output_dir. The
# batch_size, max_length and output_dir values passed here are the same as
# the function's declared defaults.
generate_tweet_level_embeddings(
    texts=data['Tweet'],
    labels=data['Labled'],
    batch_size=32,
    max_length=64,
    output_dir="/content/"
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/376 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.10M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tf_model.h5:   0%|          | 0.00/652M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertModel.

All the layers of TFBertModel were initialized from the model checkpoint at UBC-NLP/MARBERT.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Tweet-level embeddings and labels saved to /content/


In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load embeddings and labels
# The embedding cell saves its arrays as "tweet_embeddings.npy" and
# "labels.npy", so those are the names that must be read back here.
tweet_embeddings = np.load("/content/tweet_embeddings.npy")  # Precomputed embeddings
labels = np.load("/content/labels.npy")

# Evaluate classifiers
def evaluate_and_plot(tweet_embeddings, labels):
    """Cross-validate each classifier on the tweet embeddings and print mean scores.

    Despite the name, nothing is plotted: the function only prints the
    fold-averaged accuracy and the weighted precision, recall and F1 for
    every classifier, and returns ``None``.

    Args:
        tweet_embeddings: Array of features indexed by row, one row per tweet.
        labels: Array of class labels aligned with the rows of
            ``tweet_embeddings``.
    """
    # Classifiers
    classifiers = {
        "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    }

    # Cross-validation: five stratified, shuffled folds with a fixed seed
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    for name, model in classifiers.items():
        print(f"Evaluating {name}...")
        # Per-fold scores, keyed by the label used when printing the averages.
        fold_scores = {"Accuracy": [], "Precision": [], "Recall": [], "F1-Score": []}

        for train_idx, test_idx in cv.split(tweet_embeddings, labels):
            X_train, X_test = tweet_embeddings[train_idx], tweet_embeddings[test_idx]
            y_train, y_test = labels[train_idx], labels[test_idx]

            # Train the model
            model.fit(X_train, y_train)

            # Predict
            y_pred = model.predict(X_test)

            # Calculate metrics for each fold
            fold_scores["Accuracy"].append(accuracy_score(y_test, y_pred))
            fold_scores["Precision"].append(precision_score(y_test, y_pred, average='weighted'))
            fold_scores["Recall"].append(recall_score(y_test, y_pred, average='weighted'))
            fold_scores["F1-Score"].append(f1_score(y_test, y_pred, average='weighted'))

        # Print average metrics for the classifier
        print(f"\nAverage metrics for {name}:")
        for metric_name, values in fold_scores.items():
            print(f"{metric_name}: {np.mean(values):.5f}")


# Run the evaluation on the arrays loaded at the top of this cell; without
# this call the cell only defines the function and prints nothing.
evaluate_and_plot(tweet_embeddings, labels)


Evaluating Logistic Regression...

Average metrics for Logistic Regression:
Accuracy: 0.93389
Precision: 0.93420
Recall: 0.93389
F1-Score: 0.93361


In [None]:
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load embeddings and labels
# The embedding cell saves its arrays as "tweet_embeddings.npy" and
# "labels.npy", so those are the names that must be read back here.
tweet_embeddings = np.load("/content/tweet_embeddings.npy")  # Precomputed embeddings
labels = np.load("/content/labels.npy")

# Cross-validation setup: five stratified, shuffled folds with a fixed seed
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Support Vector Machine Evaluation
print("\nEvaluating Support Vector Machine (SVM)...")
svm = SVC(kernel='linear', random_state=42)
# One list per metric; each collects one score per fold.
svm_accuracies, svm_precisions, svm_recalls, svm_f1s = [], [], [], []

for train_idx, test_idx in cv.split(tweet_embeddings, labels):
    X_train, X_test = tweet_embeddings[train_idx], tweet_embeddings[test_idx]
    y_train, y_test = labels[train_idx], labels[test_idx]

    # Refit the same estimator on this fold's training split, then score the
    # held-out split.
    svm.fit(X_train, y_train)
    y_pred = svm.predict(X_test)

    svm_accuracies.append(accuracy_score(y_test, y_pred))
    svm_precisions.append(precision_score(y_test, y_pred, average='weighted'))
    svm_recalls.append(recall_score(y_test, y_pred, average='weighted'))
    svm_f1s.append(f1_score(y_test, y_pred, average='weighted'))

# Report the mean of each metric across the folds.
print("SVM Results:")
print(f"Accuracy: {np.mean(svm_accuracies):.5f}")
print(f"Precision: {np.mean(svm_precisions):.5f}")
print(f"Recall: {np.mean(svm_recalls):.5f}")
print(f"F1-Score: {np.mean(svm_f1s):.5f}")



Evaluating Support Vector Machine (SVM)...
SVM Results:
Accuracy: 0.93667
Precision: 0.93684
Recall: 0.93667
F1-Score: 0.93647


In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score



# Cross-validation setup: five stratified, shuffled folds with a fixed seed
cv = StratifiedKFold(shuffle=True, n_splits=5, random_state=42)

# Random Forest Evaluation
print("\nEvaluating Random Forest...")
rf = RandomForestClassifier(random_state=42, n_estimators=100)

# One list per metric; each collects one score per fold.
rf_accuracies = []
rf_precisions = []
rf_recalls = []
rf_f1s = []

# `tweet_embeddings` and `labels` are not loaded in this cell; they must
# already exist in the session.
for train_idx, test_idx in cv.split(tweet_embeddings, labels):
    X_train, y_train = tweet_embeddings[train_idx], labels[train_idx]
    X_test, y_test = tweet_embeddings[test_idx], labels[test_idx]

    # fit() hands back the estimator, so training and prediction are chained.
    y_pred = rf.fit(X_train, y_train).predict(X_test)

    rf_accuracies.append(accuracy_score(y_test, y_pred))
    rf_precisions.append(precision_score(y_test, y_pred, average='weighted'))
    rf_recalls.append(recall_score(y_test, y_pred, average='weighted'))
    rf_f1s.append(f1_score(y_test, y_pred, average='weighted'))

# Report the mean of each metric across the folds.
print("Random Forest Results:")
print(f"Accuracy: {np.mean(rf_accuracies):.5f}")
print(f"Precision: {np.mean(rf_precisions):.5f}")
print(f"Recall: {np.mean(rf_recalls):.5f}")
print(f"F1-Score: {np.mean(rf_f1s):.5f}")



Evaluating Random Forest...
Random Forest Results:
Accuracy: 0.90296
Precision: 0.90420
Recall: 0.90296
F1-Score: 0.90214
