In [1]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from sklearn.model_selection import train_test_split
import numpy as np
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
    TaskType
)
from transformers import BitsAndBytesConfig
from typing import List, Optional, Union, Any

import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1' 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch

device = "cuda:0" if torch.cuda.is_available() else "cpu"

device

'cuda:0'

In [3]:
def debug_type_check(value: Any, expected_type: type, name: str):
    """
    Perform detailed type checking with informative error messages
    """
    if not isinstance(value, expected_type):
        raise TypeError(f"Expected {name} to be {expected_type}, but got {type(value)}: {value}")

class LlamaSemiSupervised:
    def __init__(
        self, 
        model_name: str = "meta-llama/Llama-3.2-1B",
        num_labels: int = 2,
        batch_size: int = 16,
        num_epochs: int = 3,
        lora_r=8,          # Reduced rank
        lora_alpha=16,     # Reduced alpha
        lora_dropout=0.1,
        learning_rate: float = 2e-4,
        max_length: int = 512
    ):
        
        print(f"Available GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9} GB")
        
        bnb_config = BitsAndBytesConfig(
            load_in_8bit=True,  # Switch to 8-bit quantization
            llm_int8_threshold=6.0,
            llm_int8_has_fp16_weight=False
        )
        
        
        # Validate inputs
        debug_type_check(model_name, str, "model_name")
        debug_type_check(num_labels, int, "num_labels")
        debug_type_check(batch_size, int, "batch_size")
        debug_type_check(num_epochs, int, "num_epochs")
        debug_type_check(learning_rate, (int, float), "learning_rate")
        debug_type_check(max_length, int, "max_length")

        self.model_name = model_name
        self.num_labels = num_labels
        self.batch_size = batch_size
        self.num_epochs = num_epochs
        self.learning_rate = learning_rate
        self.max_length = max_length
        
        # Debug print for initial configuration
        print("Initializing model with:")
        print(f"Model: {model_name}")
        print(f"Number of Labels: {num_labels}")
        
        # Initialize tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            padding_side='right',
            truncation_side='right',
            trust_remote_code=True
        )
        self.tokenizer.pad_token = self.tokenizer.eos_token
        
        # Handle pad token
        if self.tokenizer.pad_token is None:
            self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        
        # Initialize model
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels,
            quantization_config=bnb_config,
            device_map='auto',
            torch_dtype=torch.float16,
            max_memory={0: '10GB'},  # Explicitly limit GPU memory
            trust_remote_code=True,
            pad_token_id=self.tokenizer.pad_token_id
        )
        
        
        
        # Prepare for k-bit training
        self.model = prepare_model_for_kbit_training(self.model)
        self.model.resize_token_embeddings(len(self.tokenizer))
        
        self.batch_size = batch_size
        self.max_length = max_length
        
        # LoRA Configuration
        lora_config = LoraConfig(
            r=lora_r,
            lora_alpha=lora_alpha,
            target_modules=["q_proj", "v_proj"],
            lora_dropout=lora_dropout,
            bias="none",
            task_type=TaskType.SEQ_CLS
        )
        
        # Get PEFT model
        self.model = get_peft_model(self.model, lora_config)
        
        self.model = self.model.to(device)

    def _validate_inputs(
        self, 
        texts: List[str], 
        labels: Optional[List[int]] = None
    ):
        """
        Comprehensive input validation
        """
        # Validate texts
        debug_type_check(texts, list, "texts")
        if not all(isinstance(text, str) for text in texts):
            raise TypeError("All texts must be strings")
        
        # Validate labels if provided
        if labels is not None:
            debug_type_check(labels, list, "labels")
            if not all(isinstance(label, (int, np.integer)) for label in labels):
                raise TypeError("All labels must be integers")
            
            # Check label range
            unique_labels = set(labels)
            if len(unique_labels) > self.num_labels:
                raise ValueError(f"More unique labels ({len(unique_labels)}) than specified num_labels ({self.num_labels})")
            
            if len(texts) != len(labels):
                raise ValueError(f"Mismatch in texts ({len(texts)}) and labels ({len(labels)}) lengths")
    
    def prepare_data(
        self, 
        texts: List[str], 
        labels: Optional[List[int]] = None
    ) -> Dataset:
        """
        Prepare data with extensive validation
        """
        # Validate inputs
        self._validate_inputs(texts, labels)
        
        # Convert labels to list of integers (numpy or python int)
        if labels is not None:
            labels = [int(label) for label in labels]
        
        # Tokenize texts
        tokenized = self.tokenizer(
            texts,
            truncation=True,
            padding=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        
        # Create dataset
        if labels is not None:
            print(f"Preparing dataset with {len(texts)} texts and {len(labels)} labels")
            return Dataset.from_dict({
                'input_ids': tokenized['input_ids'],
                'attention_mask': tokenized['attention_mask'],
                'labels': labels
            })
        return Dataset.from_dict({
            'input_ids': tokenized['input_ids'],
            'attention_mask': tokenized['attention_mask']
        })
    
    def train(
        self, 
        train_texts: List[str], 
        train_labels: List[int],
        val_texts: Optional[List[str]] = None, 
        val_labels: Optional[List[int]] = None
    ):
        """
        Training with comprehensive input validation
        """
        # Validate inputs
        self._validate_inputs(train_texts, train_labels)
        if val_texts is not None and val_labels is not None:
            self._validate_inputs(val_texts, val_labels)
        
        # Prepare training data
        train_dataset = self.prepare_data(train_texts, train_labels)
        
        # Prepare validation data
        if val_texts is not None and val_labels is not None:
            val_dataset = self.prepare_data(val_texts, val_labels)
        else:
            # Manual split for Hugging Face Dataset
            total_size = len(train_dataset)
            val_size = int(total_size * 0.1)
            train_size = total_size - val_size
            
            val_dataset = train_dataset.select(range(val_size))
            train_dataset = train_dataset.select(range(val_size, total_size))
        
        # Training arguments
        training_args = TrainingArguments(
            output_dir="./results",
            num_train_epochs=self.num_epochs,
            per_device_train_batch_size=self.batch_size,
            per_device_eval_batch_size=self.batch_size,
            gradient_accumulation_steps=4,
            warmup_ratio=0.1,
            learning_rate=self.learning_rate,
            logging_dir="./logs",
            evaluation_strategy="epoch",
            save_strategy="epoch",
            save_total_limit=2,
            load_best_model_at_end=True,
            fp16=True,
            optim="adamw_torch",
            weight_decay=0.01
        )
        
        # Initialize trainer
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            data_collator=DataCollatorWithPadding(self.tokenizer)
        )
        
        # Train
        trainer.train()
    
    def predict(self, texts: List[str]) -> torch.Tensor:
        """
        Generate predictions with input validation
        """
        # Validate input
        self._validate_inputs(texts)
        
        # Prepare dataset
        dataset = self.prepare_data(texts)
        
        # Create trainer for prediction
        trainer = Trainer(
            model=self.model,
            tokenizer=self.tokenizer
        )
        
        # Get predictions
        predictions = trainer.predict(dataset)
        return torch.nn.functional.softmax(
            torch.tensor(predictions.predictions), 
            dim=1
        )
    
    def semi_supervised_learning(
        self,
        labelled_texts: List[str],
        labelled_labels: List[int],
        unlabelled_texts: List[str],
        confidence_threshold: float = 0.9
    ) -> torch.Tensor:
        """
        Semi-supervised learning pipeline with comprehensive validation
        """
        # Validate inputs
        self._validate_inputs(labelled_texts, labelled_labels)
        self._validate_inputs(unlabelled_texts)
        
        # Initial training
        self.train(labelled_texts, labelled_labels)
        
        # Predict on unlabelled data
        probabilities = self.predict(unlabelled_texts)
        predictions = torch.argmax(probabilities, dim=1)
        max_probs = torch.max(probabilities, dim=1)[0]
        
        # Filter high-confidence predictions
        confident_idx = max_probs >= confidence_threshold
        new_labels = predictions[confident_idx]
        new_texts = [
            unlabelled_texts[i] 
            for i in range(len(unlabelled_texts)) 
            if confident_idx[i]
        ]
        
        # Combine datasets
        all_texts = labelled_texts + new_texts
        all_labels = labelled_labels + new_labels.tolist()
        
        # Retrain
        self.train(all_texts, all_labels)
        
        # Final predictions
        final_probabilities = self.predict(unlabelled_texts)
        return torch.argmax(final_probabilities, dim=1)

In [8]:
hs_df = pd.read_csv("hs_df.csv")
hs_df.head()

Unnamed: 0,row_id,Row Number,Tweet Treated,Label
0,fffeb77ff91c618cc5482e982240f1af9f09175cddf324...,19999,russia would save a ton of money if they'd pul...,Non Hate
1,ffe7a960fccf628755ee70ab15e4fab5b45f0f436a2064...,19998,i hate grocery shopping. spent $112. damn you ...,Non Hate
2,ffdc308f7c1ceed12d8347ab9150551a9fe155023d624a...,19997,did you miss his blood and soil arguments f...,Non Hate
3,ffdbe9613a9ef0c6c484486e03422ab0bac73f62922005...,19996,"on imperialism, too complex for twitter, but ...",Non Hate
4,ffd75a28b8bf37f681e5d57d1dd1309df03aa3859a3d28...,19995,i'm still wondering why we don't #stop doing #...,Non Hate


In [9]:
hs_df_labelled = hs_df[hs_df["Label"].notnull()]
hs_df_unlabelled = hs_df[hs_df["Label"].isnull()]

hs_df_unlabelled.head()

Unnamed: 0,row_id,Row Number,Tweet Treated,Label
231,fd03af0faea32d6ba18a883aa41d496a0f77a95f0ddfd3...,19768,zelensky and putin both need this lawnmower.,
280,fc7449ecd89f35f4faa1549d7e99b49b89419f8a690f28...,19719,zelensky's talking trash. how's that for a p...,
463,fa08dfe829005b4fe02699c489a5273bc66afa827fb73e...,19536,"yeah, zelensky had good intentions. but th...",
999,f353c03c58dcb208e9e952a00c216558fb32465708997d...,19000,again i am pretty sure china never called its...,
1000,f352d8b1a9d1e5ea36d768fb4a8342784c746c5d393e74...,18999,... which sickened &gt;3000 un troops during t...,


In [10]:
hs_df_labelled["Label_bool"] = hs_df_labelled["Label"].apply(lambda x: 1 if x == "Hate" else 0)
hs_df_labelled

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hs_df_labelled["Label_bool"] = hs_df_labelled["Label"].apply(lambda x: 1 if x == "Hate" else 0)


Unnamed: 0,row_id,Row Number,Tweet Treated,Label,Label_bool
0,fffeb77ff91c618cc5482e982240f1af9f09175cddf324...,19999,russia would save a ton of money if they'd pul...,Non Hate,0
1,ffe7a960fccf628755ee70ab15e4fab5b45f0f436a2064...,19998,i hate grocery shopping. spent $112. damn you ...,Non Hate,0
2,ffdc308f7c1ceed12d8347ab9150551a9fe155023d624a...,19997,did you miss his blood and soil arguments f...,Non Hate,0
3,ffdbe9613a9ef0c6c484486e03422ab0bac73f62922005...,19996,"on imperialism, too complex for twitter, but ...",Non Hate,0
4,ffd75a28b8bf37f681e5d57d1dd1309df03aa3859a3d28...,19995,i'm still wondering why we don't #stop doing #...,Non Hate,0
...,...,...,...,...,...
994,f361154d56c09c343ebc7d0f967c492eca88a0bbcd03fe...,19005,putin gave israeli pm bennett his word he woul...,Non Hate,0
995,f35e48848e79e00d6bbbf10ca13c5e2287fcf07d6ecaf6...,19004,does anyone else ever feel like joy-crying whe...,Non Hate,0
996,f35abbe1259d13efb4031780f3226eb5ea20a385d225b2...,19003,nuremberg aint gonna actually happen until the...,Non Hate,0
997,f35aace2208a8384202e020cfdb221889de2bcb87dea7d...,19002,their gov is also more corrupt than russi...,Non Hate,0


In [11]:
labelled_texts = hs_df_labelled["Tweet Treated"].tolist()
labelled_labels = hs_df_labelled["Label_bool"].tolist()

unlabelled_texts = hs_df_unlabelled["Tweet Treated"].tolist()

In [15]:
# Initialize and run
ssl_model = LlamaSemiSupervised(num_labels=2)
predictions = ssl_model.semi_supervised_learning(
    labelled_texts,
    labelled_labels,
    unlabelled_texts
)
print("Predictions:", predictions)

Available GPU Memory: 12.878086144 GB
Initializing model with:
Model: meta-llama/Llama-3.2-1B
Number of Labels: 2


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataset with 996 texts and 996 labels


  0%|          | 0/42 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)
                                               
 33%|â–ˆâ–ˆâ–ˆâ–Ž      | 14/42 [01:05<01:58,  4.22s/it]

{'eval_loss': 0.573652982711792, 'eval_runtime': 2.0488, 'eval_samples_per_second': 48.321, 'eval_steps_per_second': 3.417, 'epoch': 0.98}


  return fn(*args, **kwargs)
                                               
 67%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–‹   | 28/42 [02:11<01:02,  4.47s/it]

{'eval_loss': 0.4896319508552551, 'eval_runtime': 2.4913, 'eval_samples_per_second': 39.738, 'eval_steps_per_second': 2.81, 'epoch': 1.96}


  return fn(*args, **kwargs)
                                               
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 42/42 [03:16<00:00,  4.58s/it]

{'eval_loss': 0.5027834177017212, 'eval_runtime': 2.5359, 'eval_samples_per_second': 39.04, 'eval_steps_per_second': 2.76, 'epoch': 2.95}


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 42/42 [03:16<00:00,  4.68s/it]


{'train_runtime': 196.5781, 'train_samples_per_second': 13.689, 'train_steps_per_second': 0.214, 'train_loss': 0.506005605061849, 'epoch': 2.95}


  trainer = Trainer(
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 2376/2376 [20:10<00:00,  1.96it/s]


Preparing dataset with 8435 texts and 8435 labels


  return fn(*args, **kwargs)
                                                  
 33%|â–ˆâ–ˆâ–ˆâ–Ž      | 118/354 [22:21<43:10, 10.97s/it]

{'eval_loss': 0.7629305720329285, 'eval_runtime': 39.7299, 'eval_samples_per_second': 21.218, 'eval_steps_per_second': 1.334, 'epoch': 0.99}


  return fn(*args, **kwargs)
                                                   
 67%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–‹   | 237/354 [44:39<21:12, 10.88s/it]

{'eval_loss': 0.7410508990287781, 'eval_runtime': 39.8264, 'eval_samples_per_second': 21.167, 'eval_steps_per_second': 1.331, 'epoch': 2.0}


  return fn(*args, **kwargs)
                                                   
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 354/354 [1:06:32<00:00, 10.91s/it]

{'eval_loss': 0.8751929998397827, 'eval_runtime': 39.7435, 'eval_samples_per_second': 21.211, 'eval_steps_per_second': 1.334, 'epoch': 2.98}


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 354/354 [1:06:32<00:00, 11.28s/it]


{'train_runtime': 3992.5218, 'train_samples_per_second': 5.705, 'train_steps_per_second': 0.089, 'train_loss': 0.008927983079252943, 'epoch': 2.98}


  trainer = Trainer(
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 2376/2376 [20:39<00:00,  1.92it/s]

Predictions: tensor([0, 0, 0,  ..., 0, 0, 1])





In [16]:
import torch

def count_predictions(predictions):
    unique_elements, counts = torch.unique(predictions, return_counts=True)
    return dict(zip(unique_elements.tolist(), counts.tolist()))

# Example usage:
#predictions = torch.tensor([1, 2, 2, 3, 3, 3, 4, 4, 4, 4])
print(count_predictions(predictions))
# Output: {1: 1, 2: 2, 3: 3, 4: 4}

{0: 16459, 1: 2545}


In [17]:
predictions_list = predictions.tolist()

unlabelled_pred = pd.DataFrame({'tweet unlabeled': unlabelled_texts, 'pred': predictions_list})
unlabelled_pred

Unnamed: 0,tweet unlabeled,pred
0,zelensky and putin both need this lawnmower.,0
1,zelensky's talking trash. how's that for a p...,0
2,"yeah, zelensky had good intentions. but th...",0
3,again i am pretty sure china never called its...,0
4,... which sickened &gt;3000 un troops during t...,0
...,...,...
18999,let ukraine take care of itself. this so-...,0
19000,your daily reminder that 141 un countries vo...,0
19001,obvious to...,0
19002,who is this f*cking idiot? biden isn't a neoco...,0


In [18]:
unlabelled_pred.loc[unlabelled_pred['pred'] == 1].to_csv('unlabelled_pred.csv', index=False)

In [19]:
from collections import Counter

def count_predictions(predictions):
    return Counter(predictions)

# Example usage:
print(count_predictions(labelled_labels))
# Output: Counter({4: 4, 3: 3, 2: 2, 1: 1})

Counter({0: 790, 1: 206})
