In [1]:
!pip install -U sentence-transformers
!pip install sacremoses
!pip install torch

[0m

In [2]:
import os
import logging
import os
import pandas as pd
from transformers import AutoModel, AutoTokenizer
from sentence_transformers import SentenceTransformer
import torch

2024-12-17 09:51:55.834377: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-17 09:51:55.855807: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-17 09:51:55.875116: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-17 09:51:55.882636: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-17 09:51:55.895389: I tensorflow/core/platform/cpu_feature_guar

In [5]:
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[
        logging.FileHandler("pipeline.log"),
        logging.StreamHandler()
    ]
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logging.info(f"Using device: {device}")

# Function to compute word embeddings in batches
def get_word_embeddings_batch(texts, model, tokenizer, batch_size=32):
    embeddings = []
    logging.info(f"Computing word embeddings in batches (batch size: {batch_size})...")
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]
        tokens = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True).to(device)
        outputs = model(**tokens.to(device))  # Ensure tokens are moved to the same device
        batch_embeddings = outputs.last_hidden_state.mean(dim=1).detach().cpu().numpy()
        embeddings.extend(batch_embeddings)
        logging.info(f"Generated embeddings shape: {embeddings.shape}")
    return embeddings

# Function to compute sentence/message embeddings in batches
def get_message_embeddings_batch(texts, model_name, batch_size=32):
    model = SentenceTransformer(model_name).to(device)
    embeddings = []
    logging.info(f"Computing sentence embeddings in batches (batch size: {batch_size})...")
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]
        batch_embeddings = model.encode(batch_texts, show_progress_bar=True, device=str(device))  # Specify device
        embeddings.extend(batch_embeddings)
        logging.info(f"Generated embeddings shape: {embeddings.shape}")
    return embeddings

# Function to process DataFrame with embeddings
def process_dataframe(df, models, batch_size=32):
    """
    Process DataFrame to compute embeddings for 'TEXT' column.

    Args:
        df (pd.DataFrame): Input DataFrame.
        models (dict): Dictionary containing model and tokenizer pairs.
        batch_size (int): Batch size for processing.

    Returns:
        pd.DataFrame: DataFrame with additional embedding columns.
    """
    if "TEXT" not in df.columns:
        logging.error("The DataFrame does not contain a 'TEXT' column.")
        raise ValueError("The DataFrame does not contain a 'TEXT' column.")

    df["TEXT"] = df["TEXT"].fillna('').astype(str)
    texts = df["TEXT"].tolist()

    for model_name, model_info in models.items():
        model, tokenizer, message_model_name = model_info

        # Word embeddings
        logging.info(f"Processing word embeddings for model: {model_name}")
        word_embeddings = get_word_embeddings_batch(texts, model, tokenizer, batch_size)
        df[f"{model_name}_Word_Embeddings"] = word_embeddings

        # Message embeddings
        logging.info(f"Processing message embeddings for model: {model_name}")
        message_embeddings = get_message_embeddings_batch(texts, message_model_name, batch_size)
        df[f"{model_name}_Message_Embeddings"] = message_embeddings

    return df

# Function to prepare data
def prepare_data(file_path, embedding_conversion):
    if not os.path.exists(file_path):
        logging.error(f"File not found: {file_path}")
        return None

    logging.info(f"Processing file: {file_path}")
    data = pd.read_csv(file_path)

    if embedding_conversion:
        models = {
            "CamemBERT": (
                AutoModel.from_pretrained('camembert/camembert-large').to(device),
                AutoTokenizer.from_pretrained('camembert/camembert-large'),
                "camembert/camembert-large"
                ),
            "Flaubert": (
                AutoModel.from_pretrained('flaubert/flaubert_large_cased').to(device),
                AutoTokenizer.from_pretrained('flaubert/flaubert_large_cased'),
                "flaubert/flaubert_large_cased"
                ),
            "mBERT": (
                AutoModel.from_pretrained('bert-base-multilingual-cased').to(device),
                AutoTokenizer.from_pretrained('bert-base-multilingual-cased'),
                "camembert/camembert-large"
                ),
            "CamemBERTa": (
                AutoModel.from_pretrained('almanach/camembertav2-base').to(device),
                AutoTokenizer.from_pretrained('almanach/camembertav2-base'),
                "almanach/camembertav2-base"
                )
        }

        data = process_dataframe(data, models, batch_size=8)

    return data

2024-12-17 09:53:31,866 - INFO - Using device: cuda


In [6]:
# Main execution

tasks = ['ABUSE']
embedding_conversion = True

for task in tasks:
    try:
        file_path = "Data/majority_vote/sample.csv"
        report_path = f"reports/{task}_report.txt"
        #confusion_matrix_path = f"reports/{task}_confusion_matrix.csv" as to be a plot
        fold_sizes_path = f"reports/{task}_fold_sizes.csv"
        
        logging.info(f"Processing task: {task} at {file_path}")
        data = prepare_data(file_path, embedding_conversion)
        
        # # Initialize embeddings if required
        # if embedding_conversion:
        #     # Models and tokenizers
        #     models = {
        #         "CamemBERT": (
        #             CamembertModel.from_pretrained('camembert/camembert-large').to(device),
        #             CamembertTokenizer.from_pretrained('camembert/camembert-large'),
        #             "camembert/camembert-large"
        #         ),
        #         "Flaubert": (
        #             FlaubertModel.from_pretrained('flaubert/flaubert_large_cased').to(device),
        #             FlaubertTokenizer.from_pretrained('flaubert/flaubert_large_cased'),
        #             "flaubert/flaubert_large_cased"
        #         )
        #     }

        #     # Process DataFrame
        #     data = process_dataframe(data, models, batch_size=8)

        # # Save the processed data
        # output_path = os.path.join(file_path, 'processed_data.csv')
        data.to_csv(file_path, index=False)
        logging.info(f"Processed data saved to {file_path}")

    except Exception as e:
        logging.error(f"Error processing task {task}: {e}")

2024-12-17 09:53:33,520 - INFO - Processing task: ABUSE at Data/majority_vote/sample.csv
2024-12-17 09:53:33,523 - INFO - Processing file: Data/majority_vote/sample.csv
Some weights of CamembertModel were not initialized from the model checkpoint at camembert/camembert-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2024-12-17 09:53:41,704 - ERROR - Error processing task ABUSE: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 3.71 GiB of which 8.06 MiB is free. Including non-PyTorch memory, this process has 3.68 GiB memory in use. Of the allocated memory 3.49 GiB is allocated by PyTorch, and 130.41 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Mana

In [None]:
import os
import ast
import logging
import torch
import pandas as pd
import numpy as np
from collections import Counter
from transformers import (
    CamembertModel, CamembertTokenizer, AutoTokenizer, AutoModel
)
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import joblib
from sentence_transformers import SentenceTransformer

# Logging configuration
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[
        logging.FileHandler("pipeline.log"),
        logging.StreamHandler()
    ]
)

# Global Parameters
BATCH_SIZE = 8
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logging.info(f"Using device: {DEVICE}")

# Function to compute sentence embeddings in batches
def get_message_embeddings_batch(texts, model_name, batch_size=BATCH_SIZE):
    embeddings = []

    try:
        # Try to load as a SentenceTransformer model
        model = SentenceTransformer(model_name).to(DEVICE)
        tokenizer = None  # No need for separate tokenizer
        use_transformers_directly = False
    except Exception as e:
        # Fall back to transformers if SentenceTransformer fails
        logging.warning(f"No SentenceTransformer model found with name '{model_name}'. Falling back to transformers: {e}")
        model = AutoModel.from_pretrained(model_name).to(DEVICE)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        use_transformers_directly = True

    logging.info(f"Computing embeddings with model '{model_name}' in batches...")

    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]

        if not use_transformers_directly:
            # Use SentenceTransformer for encoding
            batch_embeddings = model.encode(batch_texts, show_progress_bar=True, device=str(DEVICE))
        else:
            # Use AutoModel and AutoTokenizer with word-level averaging
            batch_embeddings = []
            for text in batch_texts:
                batch_embeddings.append(get_word_embeddings(text, model, tokenizer))

        embeddings.extend(batch_embeddings)

    # Convert list of embeddings to a numpy array
    embeddings_array = np.vstack(embeddings)
    logging.info(f"Generated embeddings shape: {embeddings_array.shape}")
    return embeddings_array


# Function to compute word-level averaged embeddings
def get_word_embeddings(text, model, tokenizer):
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512).to(DEVICE)

    with torch.no_grad():
        outputs = model(**inputs)
        last_hidden_state = outputs.last_hidden_state

        # Word-level averaging
        word_embeddings = last_hidden_state.squeeze()
        message_embedding_word_avg = word_embeddings.mean(dim=0)

    # Move to CPU and convert to numpy
    return message_embedding_word_avg.cpu().numpy()

# Function to save results and print to file
def print_and_save(message, file):
    print(message, file=file)
    logging.info(message)

# Save confusion matrix to CSV
def save_confusion_matrix(y_true, y_pred, output_path):
    cm = confusion_matrix(y_true, y_pred)
    cm_df = pd.DataFrame(cm, index=["Class 0", "Class 1"], columns=["Class 0", "Class 1"])
    cm_df.to_csv(output_path, index=True)
    logging.info(f"Confusion matrix saved to {output_path}")

# Save StratifiedKFold split sizes
def save_fold_sizes(splits, output_path):
    fold_data = [{"Fold": i + 1, "Train Size": len(train_idx), "Test Size": len(test_idx)} for i, (train_idx, test_idx) in enumerate(splits)]
    pd.DataFrame(fold_data).to_csv(output_path, index=False)
    logging.info(f"StratifiedKFold split sizes saved to {output_path}")

# F1Evaluator Class for Metrics Calculation
class F1Evaluator:
    @staticmethod
    def evaluate(classifier):
        class_counts = Counter(classifier.Y_true)
        total = len(classifier.Y_true)
        proportions = {cls: round((count / total) * 100, 2) for cls, count in class_counts.items()}
        metrics = {
            "micro": f1_score(classifier.Y_true, classifier.predictions, average="micro"),
            "macro": f1_score(classifier.Y_true, classifier.predictions, average="macro"),
            "weighted": f1_score(classifier.Y_true, classifier.predictions, average="weighted")
        }
        return f"Metrics:\nProportions: {proportions}\nF1 Scores: {metrics}"

# Classifier Wrapper
class Classifier:
    def __init__(self, model, param_grid, model_name):
        self.model = model
        self.param_grid = param_grid
        self.model_name = model_name
        self.best_model = None
        self.best_params = None
        self.Y_true = None
        self.predictions = None

    def perform_grid_search(self, X_train, y_train, scaler, n_iter=50, random_state=42):
        X_train_scaled = scaler.fit_transform(X_train)
        inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
        random_search = RandomizedSearchCV(
            self.model, self.param_grid, cv=inner_cv, scoring="f1_weighted", n_iter=n_iter, random_state=random_state, n_jobs=-1
        )
        random_search.fit(X_train_scaled, y_train)
        self.best_model = random_search.best_estimator_
        self.best_params = random_search.best_params_

    def evaluate_model(self, X_test, y_test, scaler):
        X_test_scaled = scaler.transform(X_test)
        self.predictions = self.best_model.predict(X_test_scaled)
        self.Y_true = y_test

# Function to prepare data
def prepare_data(file_path, embedding_conversion):
    if not os.path.exists(file_path):
        logging.error(f"File not found: {file_path}")
        return None

    logging.info(f"Processing file: {file_path}")
    data = pd.read_csv(file_path)

    if embedding_conversion:
        models = {
            "CamemBERT": ("camembert/camembert-large"),
            "CamemBERTa": ("almanach/camembertav2-base"),
            "mBERT": ("bert-base-multilingual-cased")
        }

        for model_name, model_path in models.items():
            data[f"{model_name}_Message_Embeddings"] = get_message_embeddings_batch(data["TEXT"].fillna("").tolist(), model_path)

    return data

# Function to safely parse embedding strings into numpy arrays
def parse_embedding(value):
    try:
        # Remove ellipses and ensure proper formatting
        if isinstance(value, str):
            cleaned_value = value.replace("...", "").replace("\n", "").strip()
            # Convert to numpy array directly using np.fromstring if it's a space-separated string
            return np.fromstring(cleaned_value.strip("[]"), sep=" ")
        else:
            logging.error(f"Non-string value encountered: {value}")
            return np.array([])
    except Exception as e:
        logging.error(f"Failed to parse embedding: {value} - {e}")
        return np.array([])  # Return an empty array on failure

# Main Execution
tasks = ["ABUSE", "B_POLARITY", "POR"]
embedding_conversion = True

for task in tasks:
    try:
        file_path = "Data/majority_vote/annotations_all_binary.csv"
        report_path = f"reports/{task}_report.txt"
        confusion_matrix_path = f"reports/{task}_confusion_matrix.csv"
        fold_sizes_path = f"reports/{task}_fold_sizes.csv"

        data = prepare_data(file_path, embedding_conversion)

        if data is None:
            continue

        with open(report_path, "w") as report_file:
            for embedding in [col for col in data.columns if "CamemBERT_Message_Embeddings" in col]:
                print(np.array(data[embedding][0]).tolist())
                features = np.array(data[embedding].apply(parse_embedding).tolist())
                labels = data[task].dropna().astype(int).to_numpy()

                param_grids = {
                    "SVM": {'C': [0.1, 1], 'kernel': ['linear', 'rbf']},
                    "Decision Tree": {'max_depth': [3, 5], 'criterion': ['gini', 'entropy']},
                    "Random Forest": {'n_estimators': [10, 50], 'max_depth': [5, 10]},
                    "Logistic Regression": {'C': [0.1, 1], 'penalty': ['l2']}
                }

                classifiers = [
                    Classifier(SVC(probability=True), param_grids["SVM"], "SVM"),
                    Classifier(DecisionTreeClassifier(), param_grids["Decision Tree"], "Decision Tree"),
                    Classifier(RandomForestClassifier(), param_grids["Random Forest"], "Random Forest"),
                    Classifier(LogisticRegression(), param_grids["Logistic Regression"], "Logistic Regression")
                ]

                outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
                splits = list(outer_cv.split(features, labels))
                save_fold_sizes(splits, fold_sizes_path)

                for clf in classifiers:
                    clf.perform_grid_search(features, labels, StandardScaler())
                    clf.evaluate_model(features, labels, StandardScaler())
                    evaluator = F1Evaluator()
                    metrics = evaluator.evaluate(clf)
                    print_and_save(metrics, report_file)

                    # Save confusion matrix of the best model
                    save_confusion_matrix(clf.Y_true, clf.predictions, confusion_matrix_path)

    except Exception as e:
        logging.error(f"Error processing task '{task}': {e}")