In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input tree recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip
/kaggle/input/classification-dataset-unzipped/test_labels.csv
/kaggle/input/classification-dataset-unzipped/train.csv
/kaggle/input/classification-dataset-unzipped/test.csv


In [2]:
%%writefile cfg.py
import sys
import os
from pathlib import Path
# NOTE: the order of the statements below matters; do not let a formatter
# or import sorter regroup them.

# 1. Request legacy Keras. The variable is set before tf_keras / tensorflow
#    are imported below so that it is already in place at import time.
os.environ["TF_USE_LEGACY_KERAS"] = "1"

# 2. Import tf_keras first so it can be registered under the name 'keras'.
import tf_keras

# 3. Alias the module name 'keras' to tf_keras in sys.modules, so that any
#    later `import keras` in this process resolves to tf_keras. The intent is
#    to make keras_tuner use tf_keras internally.
sys.modules["keras"] = tf_keras
import tf_keras as keras
import tensorflow as tf

class CFG:
    """Central, class-level configuration for the Jigsaw toxic-comment classifier.

    All settings are plain class attributes; the class is used as a namespace
    and is never instantiated by the code in this notebook.
    """

    # ------------------------------------------------------------------ data
    # Directory holding train.csv, test.csv and test_labels.csv.
    BASE_PATH = '/kaggle/input/classification-dataset-unzipped'
    # Column holding the raw comment text.
    text_col = 'comment_text'
    # Binary target columns (multi-label).
    label_cols = [
        'toxic',
        'severe_toxic',
        'obscene',
        'threat',
        'insult',
        'identity_hate',
    ]
    # Shuffle datasets when building the tf.data pipelines.
    shuffle = True
    # Data-loading knobs.
    num_workers = 4
    pin_memory = True

    # ------------------------------------------------------- reproducibility
    seed = 42
    random_state = 42

    # ----------------------------------------------------------------- model
    # Hugging Face checkpoint name; alternatives tried: "roberta-base",
    # "deberta_v3_extra_small_en".
    preset = "distilbert-base-uncased"
    # Token length every comment is padded / truncated to.
    sequence_length = 256
    dropout_rate = 0.3
    hidden_size = 256
    dense_size = 128
    # Number of top transformer layers left trainable.
    n_unfreeze = 3

    # -------------------------------------------------------------- training
    epochs = 5
    batch_size = 256
    scheduler = 'cosine'
    learning_rate = 3e-5
    weight_decay = 1e-6
    warmup_ratio = 0.1
    max_grad_norm = 1.0
    # Number of folds for cross-validation.
    n_splits = 3
    # Focal-loss parameters.
    alpha = 0.25
    gamma = 2.0
    # Metrics passed to model.compile().
    metrics = [
        keras.metrics.BinaryAccuracy(name='accuracy'),
        keras.metrics.AUC(name='auc', multi_label=True),
    ]
    verbose = 1

    # -------------------------------------------------------------- hardware
    # Automatic mixed precision flag.
    use_amp = True
    # 'cuda' when TensorFlow reports at least one GPU, otherwise 'cpu'.
    device = 'cuda' if tf.config.list_physical_devices('GPU') else 'cpu'

    # ------------------------------------------------- hyperparameter tuning
    tuner_epochs = 2
    tuner_batch_size = 8
    tuner_trials = 5
    tuner_executions_per_trial = 1

    # -------------------------------------------------------- early stopping
    early_stopping_patience = 3
    # Alternative: 'val_auc' together with mode 'max'.
    early_stopping_monitor = 'val_loss'
    early_stopping_mode = 'min'
    early_stopping_restore_best_weights = True

    # --------------------------------------------------------- checkpointing
    save_best_only = True
    # False saves the entire model, not just its weights.
    save_weights_only = False
    save_freq = 'epoch'
    monitor_metric = 'val_loss'

    # ---------------------------------------------------------- output paths
    model_dir = './model_checkpoints'
    submission_file = './submission.csv'
    pretrained_dir = './pretrained_models'
    log_dir = './logs'



Writing cfg.py


In [3]:
%%writefile jigsaw_classifier_updates.py
import sys
import os

# NOTE: the order of the first few statements matters; do not let a formatter
# or import sorter regroup them.

# 1. Request legacy Keras. Set before tf_keras / tensorflow are imported below
#    so that the setting is already in place at import time.
os.environ["TF_USE_LEGACY_KERAS"] = "1"

# 2. Import tf_keras first so it can be registered under the name 'keras'.
import tf_keras

# 3. Alias the module name 'keras' to tf_keras in sys.modules, so that any
#    later `import keras` in this process resolves to tf_keras. The intent is
#    to make keras_tuner use tf_keras internally.
sys.modules["keras"] = tf_keras

# 4. Only now import everything else (keras_tuner in particular must come
#    after the alias above).
import tensorflow as tf
import numpy as np
import keras_tuner as kt
from transformers import AutoTokenizer, TFAutoModel
# Within this module 'keras' always means tf_keras.
import tf_keras as keras
from tf_keras.callbacks import EarlyStopping
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns # Provides the colour palette for the ROC plot
from sklearn.model_selection import KFold
import tf_keras.backend as K # clear_session() between folds and count_params()
import gc # Garbage collector
# LoRA via peft is currently disabled:
#from peft import LoraConfig, get_peft_model
from tensorflow.keras import mixed_precision
from tf_keras.callbacks import ReduceLROnPlateau
# local imports
from cfg import CFG

class JigsawClassifier:
    def __init__(self, CFG=CFG):
        """Load the CSVs, build/compile the model and prepare the datasets.

        Args:
            CFG: Configuration namespace; defaults to the project-wide ``CFG``.

        Attributes set:
            train_df, test_combined_df, test_df: frames from
                ``preprocess_input_csv`` (``test_df`` is the merged test frame).
            model, tokenizer: compiled Keras model and matching tokenizer.
            train_ds, val_ds, test_ds: datasets from ``create_data_loader``.
            steps_per_epoch, validation_steps: batch counts used by
                ``train_model``.
            y_pred: prediction cache, initially ``None``.
        """
        self.CFG = CFG
        # The "cleaned" test frame (third return value) is not needed here;
        # plot_metrics recomputes the mask itself.
        self.train_df, self.test_combined_df, _ = self.preprocess_input_csv()
        self.test_df = self.test_combined_df

        running_on_cpu = CFG.device == 'cpu'
        if not running_on_cpu:
            # The policy has to be set before the model is built below.
            policy = mixed_precision.Policy('mixed_float16')
            mixed_precision.set_global_policy(policy)
            print("Mixed Precision (float16) enabled.")

        self.model = self.build_model_tuner()
        self.compile_model()
        self.tokenizer = AutoTokenizer.from_pretrained(self.CFG.preset)
        self.train_ds, self.val_ds, self.test_ds = self.create_data_loader()

        # Derive the step counts from the datasets that fit() will actually
        # consume. The previous values were based on the full train frame and
        # on the test frame, which are larger than the train/validation splits
        # and made Keras run out of data during the first epoch.
        train_batches = int(self.train_ds.cardinality().numpy())
        val_batches = int(self.val_ds.cardinality().numpy())
        if running_on_cpu:
            # Keep CPU runs short: at most 100 training / 20 validation batches.
            self.steps_per_epoch = min(100, train_batches)
            self.validation_steps = min(20, val_batches)
        else:
            self.steps_per_epoch = train_batches
            self.validation_steps = val_batches

        # outputs
        self.y_pred = None

    def get_lr_callback(self):
        """Build a ``ReduceLROnPlateau`` callback driven by the early-stopping settings.

        The callback watches the same metric and direction as early stopping
        and halves the learning rate when the metric stalls.

        Returns:
            A configured ``ReduceLROnPlateau`` instance.
        """
        # React one epoch before early stopping would end training (but never
        # with a patience below 1); with identical patience the reduction could
        # only happen at the moment training stops.
        lr_patience = max(1, self.CFG.early_stopping_patience - 1)
        return ReduceLROnPlateau(
            monitor=self.CFG.early_stopping_monitor,
            patience=lr_patience,
            mode=self.CFG.early_stopping_mode,
            factor=0.5,            # Cut LR by half (multiply by 0.5)
            # The floor must lie below the starting rate; using the starting
            # rate itself as min_lr made every reduction a no-op.
            min_lr=self.CFG.learning_rate * 0.01,
            verbose=1              # Print message when LR changes
        )
    def create_dataset(self, df, shuffle=None):
        """Tokenize a frame of comments and wrap it in a batched ``tf.data`` pipeline.

        Args:
            df: DataFrame containing ``CFG.text_col`` and, when
                ``CFG.label_cols`` is set, those label columns.
            shuffle: Whether to shuffle; ``None`` falls back to ``CFG.shuffle``.

        Returns:
            A cached, optionally shuffled, batched and prefetched dataset that
            yields ``(features, labels)`` when labels are configured, otherwise
            just ``features`` (a dict with ``input_ids`` / ``attention_mask``).
        """
        cfg = self.CFG
        do_shuffle = cfg.shuffle if shuffle is None else shuffle

        comments = df[cfg.text_col]
        targets = None if cfg.label_cols is None else df[cfg.label_cols]

        # Every comment is padded / truncated to the same fixed length.
        tokenized = self.tokenizer(
            comments.tolist(),
            truncation=True,
            padding='max_length',
            max_length=cfg.sequence_length,
            return_tensors='tf'
        )
        features = {
            'input_ids': tokenized['input_ids'],
            'attention_mask': tokenized['attention_mask']
        }

        tensors = features if targets is None else (features, targets.values)
        pipeline = tf.data.Dataset.from_tensor_slices(tensors).cache()
        if do_shuffle:
            # Buffer covers the whole dataset, i.e. a full shuffle.
            pipeline = pipeline.shuffle(buffer_size=len(comments), seed=cfg.seed)
        return pipeline.batch(cfg.batch_size).prefetch(tf.data.AUTOTUNE)
    
    def preprocess_input_csv(self):
        if self.CFG.BASE_PATH is None:
            raise ValueError("BASE_PATH is not set in CFG.")
        train_df = pd.read_csv(f'{self.CFG.BASE_PATH}/train.csv')
        test_df = pd.read_csv(f'{self.CFG.BASE_PATH}/test.csv')
        test_labels_df = pd.read_csv(f'{self.CFG.BASE_PATH}/test_labels.csv')
        test_combined_df = pd.merge(test_df, test_labels_df, on='id')
        test_combined_cleaned_df = test_combined_df[~test_combined_df[self.CFG.label_cols].isin([-1]).any(axis=1)]
        return train_df, test_combined_df, test_combined_cleaned_df
    
    def set_trainable_params(self, base_model):
        """Freeze the embeddings and lower encoder layers, then report parameter counts.

        The top ``CFG.n_unfreeze`` transformer layers stay trainable; the
        embeddings (when they can be located) and all layers below are frozen.
        ``CFG.n_unfreeze`` is clamped to ``[0, number_of_layers]`` so the
        printed summary always matches what was actually frozen.

        Args:
            base_model: Pretrained transformer exposing a ``distilbert``,
                ``roberta`` or ``bert`` attribute; otherwise
                ``base_model.layers[0].encoder.layer`` is used as a fallback.

        Returns:
            The same ``base_model`` object with its ``trainable`` flags updated.
        """
        # The top-level model must itself be trainable; the sub-components
        # that should not be trained are frozen individually below.
        base_model.trainable = True

        # 1. Locate the embeddings (always frozen) and the encoder layers
        #    (selectively frozen) for the supported architectures.
        if hasattr(base_model, 'distilbert'):
            embeddings = base_model.distilbert.embeddings
            transformer_layers = base_model.distilbert.transformer.layer
            model_type = "DistilBERT"
        elif hasattr(base_model, 'roberta'):
            embeddings = base_model.roberta.embeddings
            transformer_layers = base_model.roberta.encoder.layer
            model_type = "RoBERTa"
        elif hasattr(base_model, 'bert'):
            embeddings = base_model.bert.embeddings
            transformer_layers = base_model.bert.encoder.layer
            model_type = "BERT"
        else:
            # Generic fallback: assumes layers[0] is the main trunk. The
            # embeddings are not located here and therefore stay trainable.
            embeddings = None
            transformer_layers = base_model.layers[0].encoder.layer
            model_type = "Unknown"

        # 2. Freeze the embeddings when they were found.
        if embeddings is not None:
            embeddings.trainable = False

        # 3. Freeze the bottom layers, keep the top ones trainable.
        total_layers = len(transformer_layers)
        # Clamp so an out-of-range setting cannot produce a negative cutoff.
        n_to_unfreeze = max(0, min(int(self.CFG.n_unfreeze), total_layers))
        cutoff = total_layers - n_to_unfreeze

        print(f"--- Optimizing {model_type} ---")
        print("Freezing Embeddings: Yes")
        print(f"Freezing Bottom {cutoff} Layers")
        print(f"Unfreezing Top {n_to_unfreeze} Layers")

        for i, layer in enumerate(transformer_layers):
            layer.trainable = i >= cutoff

        # 4. Parameter statistics, read back from the model's weight lists.
        trainable_count = np.sum([K.count_params(w) for w in base_model.trainable_weights])
        non_trainable_count = np.sum([K.count_params(w) for w in base_model.non_trainable_weights])
        total_count = trainable_count + non_trainable_count

        # Guard against division by zero for a model without weights.
        if total_count == 0:
            percentage = 0
        else:
            percentage = (trainable_count / total_count) * 100

        print("\n" + "="*40)
        print("base_model PARAMETER STATS")
        print("="*40)
        print(f"Total Params:        {total_count:,.0f}")
        print(f"Trainable Params:    {trainable_count:,.0f}")
        print(f"Non-Trainable Params:{non_trainable_count:,.0f}")
        print(f"Trainable Percentage: {percentage:.2f}%")
        print("="*40 + "\n")

        return base_model
        
    def build_model_tuner(self,hp=None):
        """Build the transformer classifier, optionally from tuner hyperparameters.

        Args:
            hp: Optional KerasTuner ``HyperParameters``. When ``None`` the
                values from ``CFG`` are used and the model is returned
                uncompiled (callers compile it via ``compile_model``). When
                given, dropout and learning rate are sampled from ``hp`` and
                the model is returned compiled, so this method can be used
                directly as a tuner hypermodel.

        Returns:
            A ``keras.Model`` taking ``input_ids`` and ``attention_mask`` and
            producing one sigmoid score per entry of ``CFG.label_cols``.
        """
        if hp is None:
            dropout_rate = self.CFG.dropout_rate
            learning_rate = None
        else:
            dropout_rate = hp.Float('dropout_rate', min_value=0.1, max_value=0.5, step=0.1)
            learning_rate = hp.Choice('learning_rate', values=[1e-3, 1e-4, 1e-5])

        # Using 'keras' here (which is actually tf_keras now)
        input_ids = keras.layers.Input(shape=(self.CFG.sequence_length,), dtype=tf.int32, name='input_ids')
        attention_mask = keras.layers.Input(shape=(self.CFG.sequence_length,), dtype=tf.int32, name='attention_mask')

        base_model = TFAutoModel.from_pretrained(self.CFG.preset)
        base_model = self.set_trainable_params(base_model)

        output = base_model(input_ids, attention_mask=attention_mask)
        # Use the hidden state of the first token as the sequence summary.
        pooled_output = output.last_hidden_state[:, 0, :]

        dropout = keras.layers.Dropout(dropout_rate, name='dropout')(pooled_output)
        # The output layer is pinned to float32 independently of the global
        # dtype policy.
        output = keras.layers.Dense(len(self.CFG.label_cols),
                                    activation='sigmoid',
                                    name='sigmoid_output',
                                    dtype='float32',)(dropout)

        model = keras.Model(inputs=[input_ids, attention_mask], outputs=output)

        if hp is not None:
            # A tuner fits whatever the build function returns, so the model
            # has to be compiled here, using the sampled learning rate and the
            # same loss / metrics as compile_model.
            model.compile(
                optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
                loss=self.focull_loss,
                metrics=self.CFG.metrics,
                jit_compile=True
            )
        return model
    def compile_model(self, hp=None):
        """Compile ``self.model`` with Adam, the focal loss and ``CFG.metrics``.

        Args:
            hp: Optional KerasTuner ``HyperParameters``; when given, the
                learning rate is sampled from it, otherwise
                ``CFG.learning_rate`` is used.
        """
        if hp is not None:
            lr = hp.Choice('learning_rate', values=[1e-3, 1e-4, 1e-5])
        else:
            lr = self.CFG.learning_rate

        # 'keras' is tf_keras in this module.
        adam = keras.optimizers.Adam(learning_rate=lr)
        self.model.compile(
            optimizer=adam,
            loss=self.focull_loss,
            metrics=self.CFG.metrics,
            jit_compile=True
        )

        
    def early_stopping_callback(self):
        """Create the ``EarlyStopping`` callback described by the ``CFG.early_stopping_*`` settings."""
        cfg = self.CFG
        stopper_settings = dict(
            monitor=cfg.early_stopping_monitor,
            patience=cfg.early_stopping_patience,
            mode=cfg.early_stopping_mode,
            restore_best_weights=cfg.early_stopping_restore_best_weights,
        )
        return EarlyStopping(**stopper_settings)
    def get_tuner(self):
        """Create a KerasTuner ``RandomSearch`` over ``build_model_tuner``.

        The number of trials and executions per trial come from
        ``CFG.tuner_trials`` and ``CFG.tuner_executions_per_trial``; when a
        config object lacks them, the previous hard-coded values (2 and 1) are
        used. Results are written to ``kt_tuner_dir`` and any existing project
        there is overwritten.

        Returns:
            The configured ``kt.RandomSearch`` tuner.
        """
        max_trials = getattr(self.CFG, 'tuner_trials', 2)
        executions_per_trial = getattr(self.CFG, 'tuner_executions_per_trial', 1)

        tuner = kt.RandomSearch(
            hypermodel=self.build_model_tuner,
            objective=kt.Objective("val_loss", direction="min"),
            max_trials=max_trials,
            executions_per_trial=executions_per_trial,
            overwrite=True,
            directory='kt_tuner_dir',
            project_name='jigsaw_classifier_tuning'
        )

        print("Search space summary:")
        tuner.search_space_summary()
        return tuner
    
    def create_data_loader(self):
        """Split the training frame 80/20 and build train, validation and test datasets.

        Returns:
            ``(train_ds, val_ds, test_ds)``. Train and validation follow
            ``CFG.shuffle``; the test dataset is never shuffled so predictions
            stay aligned with ``self.test_df``.
        """
        shuffle_flag = self.CFG.shuffle
        train_part, val_part = train_test_split(
            self.train_df,
            test_size=0.2,
            random_state=self.CFG.seed,
        )

        # Build order (validation, train, test) mirrors the original code.
        val_ds = self.create_dataset(val_part, shuffle=shuffle_flag)
        train_ds = self.create_dataset(train_part, shuffle=shuffle_flag)
        test_ds = self.create_dataset(self.test_df, shuffle=False)
        return train_ds, val_ds, test_ds
    
    def train_model(self):
        """Fit ``self.model`` on ``self.train_ds`` and validate on ``self.val_ds``.

        ``self.steps_per_epoch`` and ``self.validation_steps`` are treated as
        upper bounds and reconciled with the real dataset sizes, because the
        datasets are finite and Keras interrupts training when a requested
        step count exceeds the available batches:

        * a step count covering the whole training set becomes ``None``
          (one full pass per epoch);
        * a smaller step count is served from a repeating view of the
          training data, so later epochs cannot run dry;
        * validation steps are capped at the number of validation batches.

        Returns:
            The history object returned by ``self.model.fit``.
        """
        early_stopping = self.early_stopping_callback()

        train_ds = self.train_ds
        steps_per_epoch = self.steps_per_epoch
        validation_steps = self.validation_steps

        # Dataset.cardinality() is negative when the size is unknown/infinite.
        train_batches = int(self.train_ds.cardinality().numpy())
        val_batches = int(self.val_ds.cardinality().numpy())

        if steps_per_epoch is not None:
            if 0 < train_batches <= steps_per_epoch:
                steps_per_epoch = None
            else:
                train_ds = train_ds.repeat()
        if validation_steps is not None and val_batches > 0:
            validation_steps = min(validation_steps, val_batches)

        history = self.model.fit(
            train_ds,
            validation_data=self.val_ds,
            epochs=self.CFG.epochs,
            callbacks=[early_stopping],
            steps_per_epoch=steps_per_epoch,
            validation_steps=validation_steps
        )
        return history
    def predict(self):
        """Run the model on ``self.test_ds``, cache the result in ``self.y_pred`` and return it."""
        predictions = self.model.predict(self.test_ds)
        self.y_pred = predictions
        return predictions
    def focull_loss(self, y_true, y_pred):
        """Binary focal loss averaged over every (sample, label) entry.

        Uses ``CFG.alpha`` to weight the positive term (``1 - alpha`` for the
        negative term) and ``CFG.gamma`` as the focusing exponent. Entries
        whose label is neither 0 nor 1 contribute zero to the sum but still
        count in the mean.

        Args:
            y_true: Ground-truth labels.
            y_pred: Predicted probabilities.

        Returns:
            A scalar tensor with the mean loss.
        """
        alpha, gamma = self.CFG.alpha, self.CFG.gamma

        # Work in float32 regardless of the incoming dtypes.
        y_true = tf.cast(y_true, tf.float32)
        y_pred = tf.cast(y_pred, tf.float32)

        # Keep probabilities strictly inside (0, 1) so the logs stay finite.
        eps = tf.keras.backend.epsilon()
        y_pred = tf.clip_by_value(y_pred, eps, 1. - eps)

        # Positive branch: where the label is not 1 the probability is
        # replaced by 1, so (1 - p) ** gamma * log(p) vanishes.
        prob_if_positive = tf.where(tf.equal(y_true, 1), y_pred, tf.ones_like(y_pred))
        # Negative branch: where the label is not 0 the probability is
        # replaced by 0, so p ** gamma * log(1 - p) vanishes.
        prob_if_negative = tf.where(tf.equal(y_true, 0), y_pred, tf.zeros_like(y_pred))

        # Element-wise terms keep the (batch, labels) shape until the final mean.
        positive_term = - (alpha * tf.pow(1. - prob_if_positive, gamma) * tf.math.log(prob_if_positive))
        negative_term = - ((1. - alpha) * tf.pow(prob_if_negative, gamma) * tf.math.log(1. - prob_if_negative))

        return tf.reduce_mean(positive_term + negative_term)
    
    def k_fold_model_training(self):
        """Train one model per fold and average their test-set predictions.

        For each of ``CFG.n_splits`` folds a fresh model is built, compiled
        and fitted with early stopping, then used to predict ``self.test_ds``.
        The mean of the fold predictions is stored in ``self.y_pred`` and
        written, together with the test ids, to ``CFG.submission_file``
        (``submission.csv`` when the config has no such attribute).

        Every call starts from a clean accumulator, so earlier predictions in
        ``self.y_pred`` never leak into the average. After the call,
        ``self.model``, ``self.train_ds`` and ``self.val_ds`` refer to the
        last fold.
        """
        n_splits = self.CFG.n_splits
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=self.CFG.random_state)

        averaged_pred = None
        for fold, (train_index, val_index) in enumerate(kf.split(self.train_df)):
            print(f"--- Fold {fold + 1}/{n_splits} ---")

            # Drop the previous model and datasets *before* building new ones
            # so that two models are never held in memory at the same time.
            self.model = None
            self.train_ds = None
            self.val_ds = None
            K.clear_session()
            gc.collect()

            train_fold_df = self.train_df.iloc[train_index]
            val_fold_df = self.train_df.iloc[val_index]
            self.train_ds = self.create_dataset(train_fold_df, shuffle=self.CFG.shuffle)
            self.val_ds = self.create_dataset(val_fold_df, shuffle=self.CFG.shuffle)
            self.model = self.build_model_tuner()
            self.compile_model()

            # Full passes over the fold datasets: no step limits are passed.
            self.model.fit(
                self.train_ds,
                validation_data=self.val_ds,
                epochs=self.CFG.epochs,
                callbacks=[self.early_stopping_callback()],
            )

            # Each fold contributes 1/n_splits of the final prediction.
            fold_share = self.model.predict(self.test_ds) / n_splits
            averaged_pred = fold_share if averaged_pred is None else averaged_pred + fold_share

        self.y_pred = averaged_pred

        y_pred_df = pd.DataFrame(self.y_pred, columns=self.CFG.label_cols)
        # Put the ids of the merged test frame in front of the score columns.
        y_pred_df.insert(0, 'id', self.test_combined_df['id'].values)
        y_pred_df.to_csv(getattr(self.CFG, 'submission_file', 'submission.csv'), index=False)
            
        
    def plot_metrics(self ):
        """Plot one ROC curve per label and mark the Youden-J optimal threshold.

        Only test rows without a ``-1`` in any label column are evaluated. For
        every label the threshold maximising ``TPR - FPR`` is printed and
        annotated on the curve, and the AUC is shown in the legend.

        Raises:
            RuntimeError: If no predictions are available yet
                (``self.y_pred`` is ``None``).
        """
        if self.y_pred is None:
            raise RuntimeError(
                "No predictions available: run k_fold_model_training() or predict() first."
            )

        # Use the configuration this instance was created with rather than
        # the module-level CFG.
        label_cols = self.CFG.label_cols

        # Set up the plot
        plt.figure(figsize=(10, 8))
        colors = sns.color_palette("bright", n_colors=len(label_cols))
        lw = 2 # Line width

        # One boolean mask, shared by the labels and the predictions so both
        # stay row-aligned.
        scored_rows = ~self.test_combined_df[label_cols].isin([-1]).any(axis=1)
        test_combined_cleaned = self.test_combined_df[scored_rows]
        y_pred_cleaned = self.y_pred[scored_rows.to_numpy()]

        # Loop through each label (Toxic, Severe_Toxic, etc.)
        for i, label in enumerate(label_cols):
            # 1. Compute FPR and TPR for this specific label
            fpr, tpr, thresholds = roc_curve(test_combined_cleaned[label].values, y_pred_cleaned[:, i])

            # Youden's J statistic: pick the threshold with the largest TPR - FPR.
            J = tpr - fpr
            ix = np.argmax(J)
            best_thresh = thresholds[ix]
            best_fpr = fpr[ix]
            best_tpr = tpr[ix]
            max_j = J[ix]
            print(f"{label:<15}: Best Thresh={best_thresh:.3f}, Max J={max_j:.3f}")

            # 2. Calculate the AUC score for this specific label
            roc_auc = auc(fpr, tpr)
            plt.scatter(best_fpr, best_tpr, color=colors[i], s=70, edgecolor='black', zorder=5)
            # Stagger the annotations vertically so they do not overlap.
            offset_y = -20 - (i * 12)

            # 3. Plot the curve
            plt.plot(fpr, tpr, color=colors[i], lw=lw,
                    label=f'{label} (area = {roc_auc:.2f})')
            plt.annotate(f'Th={best_thresh:.2f}',
                        xy=(best_fpr, best_tpr),
                        xytext=(20, offset_y), # Offset text to the right and down
                        textcoords='offset points',
                        fontsize=9,
                        arrowprops=dict(arrowstyle="->", color='gray', alpha=0.5),
                        color=colors[i],
                        fontweight='bold')

        # Plot the "Random Guess" line (diagonal)
        plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')

        # Formatting
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate (FPR)')
        plt.ylabel('True Positive Rate (TPR)')
        plt.title('ROC Curves by Toxicity Type')
        plt.legend(loc="lower right")
        plt.grid(True, alpha=0.3)

        plt.show()



Writing jigsaw_classifier_updates.py


In [None]:
from jigsaw_classifier_updates import JigsawClassifier
from cfg import CFG

# Driver cell: build the classifier, run the k-fold training (which also
# writes the submission CSV), then plot ROC curves from the averaged
# predictions. The statements run at cell level; there is no
# `if __name__ == "__main__":` guard.
classifier = JigsawClassifier(CFG)
classifier.k_fold_model_training()
# Alternative single-model path (not used here): y_pred = classifier.predict()
classifier.plot_metrics()

2025-11-23 11:58:54.903836: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763899135.092745      48 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763899135.149310      48 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

I0000 00:00:1763899160.642176      48 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


Your GPU may run slowly with dtype policy mixed_float16 because it does not have compute capability of at least 7.0. Your GPU:
  Tesla P100-PCIE-16GB, compute capability 6.0
See https://developer.nvidia.com/cuda-gpus for a list of GPUs and their compute capabilities.
Mixed Precision (float16) enabled.


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was tr

--- Optimizing DistilBERT ---
Freezing Embeddings: Yes
Freezing Bottom 3 Layers
Unfreezing Top 3 Layers

base_model PARAMETER STATS
Total Params:        66,362,880
Trainable Params:    21,263,616
Non-Trainable Params:45,099,264
Trainable Percentage: 32.04%



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was tr

--- Optimizing DistilBERT ---
Freezing Embeddings: Yes
Freezing Bottom 3 Layers
Unfreezing Top 3 Layers

base_model PARAMETER STATS
Total Params:        66,362,880
Trainable Params:    21,263,616
Non-Trainable Params:45,099,264
Trainable Percentage: 32.04%

Epoch 1/5


I0000 00:00:1763899264.232334     105 service.cc:148] XLA service 0x7a60bc0019d0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1763899264.233365     105 service.cc:156]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
W0000 00:00:1763899264.655965     105 assert_op.cc:38] Ignoring Assert operator model/tf_distil_bert_model/distilbert/embeddings/assert_less/Assert/Assert
I0000 00:00:1763899266.671875     105 cuda_dnn.cc:529] Loaded cuDNN version 90300
I0000 00:00:1763899285.082477     105 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.




W0000 00:00:1763899944.203587     103 assert_op.cc:38] Ignoring Assert operator model/tf_distil_bert_model/distilbert/embeddings/assert_less/Assert/Assert




W0000 00:00:1763899963.418970     104 assert_op.cc:38] Ignoring Assert operator model/tf_distil_bert_model/distilbert/embeddings/assert_less/Assert/Assert
W0000 00:00:1763900123.138103     105 assert_op.cc:38] Ignoring Assert operator model/tf_distil_bert_model/distilbert/embeddings/assert_less/Assert/Assert


Epoch 2/5
Epoch 3/5
Epoch 4/5


W0000 00:00:1763903400.809812     104 assert_op.cc:38] Ignoring Assert operator model/tf_distil_bert_model/distilbert/embeddings/assert_less/Assert/Assert




W0000 00:00:1763903858.721528     105 assert_op.cc:38] Ignoring Assert operator model/tf_distil_bert_model/distilbert/embeddings/assert_less/Assert/Assert




Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


--- Optimizing DistilBERT ---
Freezing Embeddings: Yes
Freezing Bottom 3 Layers
Unfreezing Top 3 Layers

base_model PARAMETER STATS
Total Params:        66,362,880
Trainable Params:    21,263,616
Non-Trainable Params:45,099,264
Trainable Percentage: 32.04%

Epoch 1/5


W0000 00:00:1763903902.579210     103 assert_op.cc:38] Ignoring Assert operator model/tf_distil_bert_model/distilbert/embeddings/assert_less/Assert/Assert




W0000 00:00:1763904569.907607     104 assert_op.cc:38] Ignoring Assert operator model/tf_distil_bert_model/distilbert/embeddings/assert_less/Assert/Assert




W0000 00:00:1763904588.506658     104 assert_op.cc:38] Ignoring Assert operator model/tf_distil_bert_model/distilbert/embeddings/assert_less/Assert/Assert
W0000 00:00:1763904748.101808     105 assert_op.cc:38] Ignoring Assert operator model/tf_distil_bert_model/distilbert/embeddings/assert_less/Assert/Assert


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


W0000 00:00:1763908022.212594     102 assert_op.cc:38] Ignoring Assert operator model/tf_distil_bert_model/distilbert/embeddings/assert_less/Assert/Assert




W0000 00:00:1763908479.799685     105 assert_op.cc:38] Ignoring Assert operator model/tf_distil_bert_model/distilbert/embeddings/assert_less/Assert/Assert




Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


--- Optimizing DistilBERT ---
Freezing Embeddings: Yes
Freezing Bottom 3 Layers
Unfreezing Top 3 Layers

base_model PARAMETER STATS
Total Params:        66,362,880
Trainable Params:    21,263,616
Non-Trainable Params:45,099,264
Trainable Percentage: 32.04%

Epoch 1/5


W0000 00:00:1763908520.154585     104 assert_op.cc:38] Ignoring Assert operator model/tf_distil_bert_model/distilbert/embeddings/assert_less/Assert/Assert




W0000 00:00:1763909186.793541     103 assert_op.cc:38] Ignoring Assert operator model/tf_distil_bert_model/distilbert/embeddings/assert_less/Assert/Assert




W0000 00:00:1763909199.109244     104 assert_op.cc:38] Ignoring Assert operator model/tf_distil_bert_model/distilbert/embeddings/assert_less/Assert/Assert
W0000 00:00:1763909358.463862     102 assert_op.cc:38] Ignoring Assert operator model/tf_distil_bert_model/distilbert/embeddings/assert_less/Assert/Assert


Epoch 2/5