In [1]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1' 
#os.environ['CUDA_HOME'] = 'C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.8'

import pandas as pd
#import torch
from datasets import Dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from sklearn.model_selection import train_test_split
import numpy as np
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
    TaskType
)
from transformers import BitsAndBytesConfig
from typing import List, Optional, Union, Any



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch

device = "cuda:0" if torch.cuda.is_available() else "cpu"

device

'cuda:0'

In [3]:
def debug_type_check(value: Any, expected_type: type, name: str):
    """
    Perform detailed type checking with informative error messages
    """
    if not isinstance(value, expected_type):
        raise TypeError(f"Expected {name} to be {expected_type}, but got {type(value)}: {value}")

class LlamaSemiSupervised:
    def __init__(
        self, 
        model_name: str = "meta-llama/Llama-3.2-1B",
        num_labels: int = 2,
        batch_size: int = 16,
        num_epochs: int = 3,
        lora_r=8,          # Reduced rank
        lora_alpha=16,     # Reduced alpha
        lora_dropout=0.1,
        learning_rate: float = 2e-4,
        max_length: int = 512
    ):
        
        print(f"Available GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9} GB")
        
        bnb_config = BitsAndBytesConfig(
            load_in_8bit=True,  # Switch to 8-bit quantization
            llm_int8_threshold=6.0,
            llm_int8_has_fp16_weight=False
        )
        
        
        # Validate inputs
        debug_type_check(model_name, str, "model_name")
        debug_type_check(num_labels, int, "num_labels")
        debug_type_check(batch_size, int, "batch_size")
        debug_type_check(num_epochs, int, "num_epochs")
        debug_type_check(learning_rate, (int, float), "learning_rate")
        debug_type_check(max_length, int, "max_length")

        self.model_name = model_name
        self.num_labels = num_labels
        self.batch_size = batch_size
        self.num_epochs = num_epochs
        self.learning_rate = learning_rate
        self.max_length = max_length
        
        # Debug print for initial configuration
        print("Initializing model with:")
        print(f"Model: {model_name}")
        print(f"Number of Labels: {num_labels}")
        
        # Initialize tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            padding_side='right',
            truncation_side='right',
            trust_remote_code=True
        )
        self.tokenizer.pad_token = self.tokenizer.eos_token
        
        # Handle pad token
        if self.tokenizer.pad_token is None:
            self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        
        # Initialize model
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels,
            quantization_config=bnb_config,
            device_map='auto',
            torch_dtype=torch.float16,
            max_memory={0: '10GB'},  # Explicitly limit GPU memory
            trust_remote_code=True,
            pad_token_id=self.tokenizer.pad_token_id
        )
        
        
        
        # Prepare for k-bit training
        self.model = prepare_model_for_kbit_training(self.model)
        self.model.resize_token_embeddings(len(self.tokenizer))
        
        self.batch_size = batch_size
        self.max_length = max_length
        
        # LoRA Configuration
        lora_config = LoraConfig(
            r=lora_r,
            lora_alpha=lora_alpha,
            target_modules=["q_proj", "v_proj"],
            lora_dropout=lora_dropout,
            bias="none",
            task_type=TaskType.SEQ_CLS
        )
        
        # Get PEFT model
        self.model = get_peft_model(self.model, lora_config)
        
        self.model = self.model.to(device)

    def _validate_inputs(
        self, 
        texts: List[str], 
        labels: Optional[List[int]] = None
    ):
        """
        Comprehensive input validation
        """
        # Validate texts
        debug_type_check(texts, list, "texts")
        if not all(isinstance(text, str) for text in texts):
            raise TypeError("All texts must be strings")
        
        # Validate labels if provided
        if labels is not None:
            debug_type_check(labels, list, "labels")
            if not all(isinstance(label, (int, np.integer)) for label in labels):
                raise TypeError("All labels must be integers")
            
            # Check label range
            unique_labels = set(labels)
            if len(unique_labels) > self.num_labels:
                raise ValueError(f"More unique labels ({len(unique_labels)}) than specified num_labels ({self.num_labels})")
            
            if len(texts) != len(labels):
                raise ValueError(f"Mismatch in texts ({len(texts)}) and labels ({len(labels)}) lengths")
    
    def prepare_data(
        self, 
        texts: List[str], 
        labels: Optional[List[int]] = None
    ) -> Dataset:
        """
        Prepare data with extensive validation
        """
        # Validate inputs
        self._validate_inputs(texts, labels)
        
        # Convert labels to list of integers (numpy or python int)
        if labels is not None:
            labels = [int(label) for label in labels]
        
        # Tokenize texts
        tokenized = self.tokenizer(
            texts,
            truncation=True,
            padding=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        
        # Create dataset
        if labels is not None:
            print(f"Preparing dataset with {len(texts)} texts and {len(labels)} labels")
            return Dataset.from_dict({
                'input_ids': tokenized['input_ids'],
                'attention_mask': tokenized['attention_mask'],
                'labels': labels
            })
        return Dataset.from_dict({
            'input_ids': tokenized['input_ids'],
            'attention_mask': tokenized['attention_mask']
        })
    
    def train(
        self, 
        train_texts: List[str], 
        train_labels: List[int],
        val_texts: Optional[List[str]] = None, 
        val_labels: Optional[List[int]] = None
    ):
        """
        Training with comprehensive input validation
        """
        # Validate inputs
        self._validate_inputs(train_texts, train_labels)
        if val_texts is not None and val_labels is not None:
            self._validate_inputs(val_texts, val_labels)
        
        # Prepare training data
        train_dataset = self.prepare_data(train_texts, train_labels)
        
        # Prepare validation data
        if val_texts is not None and val_labels is not None:
            val_dataset = self.prepare_data(val_texts, val_labels)
        else:
            # Manual split for Hugging Face Dataset
            total_size = len(train_dataset)
            val_size = int(total_size * 0.1)
            train_size = total_size - val_size
            
            val_dataset = train_dataset.select(range(val_size))
            train_dataset = train_dataset.select(range(val_size, total_size))
        
        # Training arguments
        training_args = TrainingArguments(
            output_dir="./results",
            num_train_epochs=self.num_epochs,
            per_device_train_batch_size=self.batch_size,
            per_device_eval_batch_size=self.batch_size,
            gradient_accumulation_steps=4,
            warmup_ratio=0.1,
            learning_rate=self.learning_rate,
            logging_dir="./logs",
            evaluation_strategy="epoch",
            save_strategy="epoch",
            save_total_limit=2,
            load_best_model_at_end=True,
            fp16=True,
            optim="adamw_torch",
            weight_decay=0.01
        )
        
        # Initialize trainer
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            data_collator=DataCollatorWithPadding(self.tokenizer)
        )
        
        # Train
        trainer.train()
    
    def predict(self, texts: List[str]) -> torch.Tensor:
        """
        Generate predictions with input validation
        """
        # Validate input
        self._validate_inputs(texts)
        
        # Prepare dataset
        dataset = self.prepare_data(texts)
        
        # Create trainer for prediction
        trainer = Trainer(
            model=self.model,
            tokenizer=self.tokenizer
        )
        
        # Get predictions
        predictions = trainer.predict(dataset)
        return torch.nn.functional.softmax(
            torch.tensor(predictions.predictions), 
            dim=1
        )
    
    def semi_supervised_learning(
        self,
        labelled_texts: List[str],
        labelled_labels: List[int],
        unlabelled_texts: List[str],
        confidence_threshold: float = 0.9
    ) -> torch.Tensor:
        """
        Semi-supervised learning pipeline with comprehensive validation
        """
        # Validate inputs
        self._validate_inputs(labelled_texts, labelled_labels)
        self._validate_inputs(unlabelled_texts)
        
        # Initial training
        self.train(labelled_texts, labelled_labels)
        
        # Predict on unlabelled data
        probabilities = self.predict(unlabelled_texts)
        predictions = torch.argmax(probabilities, dim=1)
        max_probs = torch.max(probabilities, dim=1)[0]
        
        # Filter high-confidence predictions
        confident_idx = max_probs >= confidence_threshold
        new_labels = predictions[confident_idx]
        new_texts = [
            unlabelled_texts[i] 
            for i in range(len(unlabelled_texts)) 
            if confident_idx[i]
        ]
        
        # Combine datasets
        all_texts = labelled_texts + new_texts
        all_labels = labelled_labels + new_labels.tolist()
        
        # Retrain
        self.train(all_texts, all_labels)
        
        # Final predictions
        final_probabilities = self.predict(unlabelled_texts)
        return torch.argmax(final_probabilities, dim=1)

In [4]:
hs_df = pd.read_csv("data/hs_dfUnamed.csv")
hs_df.head()

Unnamed: 0,row_id,Row Number,Tweet Treated,Label,Tweet Replaced,Raplaced
0,fffeb77ff91c618cc5482e982240f1af9f09175cddf324...,19999,russia would save a ton of money if they'd pul...,Non Hate,russia would save a ton of money if they'd pul...,False
1,ffe7a960fccf628755ee70ab15e4fab5b45f0f436a2064...,19998,i hate grocery shopping. spent $112. damn you ...,Non Hate,i hate grocery shopping. spent $112. damn you ...,False
2,ffdc308f7c1ceed12d8347ab9150551a9fe155023d624a...,19997,did you miss his blood and soil arguments f...,Non Hate,did you miss his blood and soil arguments f...,False
3,ffdbe9613a9ef0c6c484486e03422ab0bac73f62922005...,19996,"on imperialism, too complex for twitter, but ...",Non Hate,"on imperialism, too complex for twitter, but ...",True
4,ffd75a28b8bf37f681e5d57d1dd1309df03aa3859a3d28...,19995,i'm still wondering why we don't #stop doing #...,Non Hate,i'm still wondering why we don't #stop doing #...,False


In [5]:
hs_df_labelled = hs_df[hs_df["Label"].notnull()]
hs_df_unlabelled = hs_df[hs_df["Label"].isnull()]

hs_df_unlabelled.head()

Unnamed: 0,row_id,Row Number,Tweet Treated,Label,Tweet Replaced,Raplaced
231,fd03af0faea32d6ba18a883aa41d496a0f77a95f0ddfd3...,19768,zelensky and putin both need this lawnmower.,,Tatyana and Tatyana both need this lawnmower.,True
280,fc7449ecd89f35f4faa1549d7e99b49b89419f8a690f28...,19719,zelensky's talking trash. how's that for a p...,,Mikhail's talking trash. how's that for a pr...,True
463,fa08dfe829005b4fe02699c489a5273bc66afa827fb73e...,19536,"yeah, zelensky had good intentions. but th...",,"yeah, Mikhail had good intentions. but the...",True
999,f353c03c58dcb208e9e952a00c216558fb32465708997d...,19000,again i am pretty sure china never called its...,,again i am pretty sure china never called its...,True
1000,f352d8b1a9d1e5ea36d768fb4a8342784c746c5d393e74...,18999,... which sickened &gt;3000 un troops during t...,,... which sickened &gt;3000 un troops during t...,False


In [6]:
hs_df_labelled["Label_bool"] = hs_df_labelled["Label"].apply(lambda x: 1 if x == "Hate" else 0)
hs_df_labelled

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hs_df_labelled["Label_bool"] = hs_df_labelled["Label"].apply(lambda x: 1 if x == "Hate" else 0)


Unnamed: 0,row_id,Row Number,Tweet Treated,Label,Tweet Replaced,Raplaced,Label_bool
0,fffeb77ff91c618cc5482e982240f1af9f09175cddf324...,19999,russia would save a ton of money if they'd pul...,Non Hate,russia would save a ton of money if they'd pul...,False,0
1,ffe7a960fccf628755ee70ab15e4fab5b45f0f436a2064...,19998,i hate grocery shopping. spent $112. damn you ...,Non Hate,i hate grocery shopping. spent $112. damn you ...,False,0
2,ffdc308f7c1ceed12d8347ab9150551a9fe155023d624a...,19997,did you miss his blood and soil arguments f...,Non Hate,did you miss his blood and soil arguments f...,False,0
3,ffdbe9613a9ef0c6c484486e03422ab0bac73f62922005...,19996,"on imperialism, too complex for twitter, but ...",Non Hate,"on imperialism, too complex for twitter, but ...",True,0
4,ffd75a28b8bf37f681e5d57d1dd1309df03aa3859a3d28...,19995,i'm still wondering why we don't #stop doing #...,Non Hate,i'm still wondering why we don't #stop doing #...,False,0
...,...,...,...,...,...,...,...
994,f361154d56c09c343ebc7d0f967c492eca88a0bbcd03fe...,19005,putin gave israeli pm bennett his word he woul...,Non Hate,Maksim gave israeli pm bennett his word he wou...,True,0
995,f35e48848e79e00d6bbbf10ca13c5e2287fcf07d6ecaf6...,19004,does anyone else ever feel like joy-crying whe...,Non Hate,does anyone else ever feel like joy-crying whe...,False,0
996,f35abbe1259d13efb4031780f3226eb5ea20a385d225b2...,19003,nuremberg aint gonna actually happen until the...,Non Hate,nuremberg aint gonna actually happen until the...,False,0
997,f35aace2208a8384202e020cfdb221889de2bcb87dea7d...,19002,their gov is also more corrupt than russi...,Non Hate,their gov is also more corrupt than russi...,False,0


In [8]:
labelled_texts = hs_df_labelled["Tweet Replaced"].tolist()
labelled_labels = hs_df_labelled["Label_bool"].tolist()

unlabelled_texts = hs_df_unlabelled["Tweet Replaced"].tolist()

In [9]:
# Initialize and run
ssl_model = LlamaSemiSupervised(num_labels=2)
predictions = ssl_model.semi_supervised_learning(
    labelled_texts,
    labelled_labels,
    unlabelled_texts
)
print("Predictions:", predictions)

Available GPU Memory: 12.878086144 GB
Initializing model with:
Model: meta-llama/Llama-3.2-1B
Number of Labels: 2


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing dataset with 996 texts and 996 labels


  0%|          | 0/42 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)
                                               
 33%|███▎      | 14/42 [01:04<01:59,  4.28s/it]

{'eval_loss': 0.6099501848220825, 'eval_runtime': 2.1021, 'eval_samples_per_second': 47.095, 'eval_steps_per_second': 3.33, 'epoch': 0.98}


  return fn(*args, **kwargs)
                                               
 67%|██████▋   | 28/42 [02:07<01:00,  4.29s/it]

{'eval_loss': 0.6078792810440063, 'eval_runtime': 2.1322, 'eval_samples_per_second': 46.431, 'eval_steps_per_second': 3.283, 'epoch': 1.96}


  return fn(*args, **kwargs)
                                               
100%|██████████| 42/42 [03:09<00:00,  4.30s/it]

{'eval_loss': 0.5143609642982483, 'eval_runtime': 2.1643, 'eval_samples_per_second': 45.743, 'eval_steps_per_second': 3.234, 'epoch': 2.95}


100%|██████████| 42/42 [03:09<00:00,  4.52s/it]


{'train_runtime': 189.861, 'train_samples_per_second': 14.174, 'train_steps_per_second': 0.221, 'train_loss': 0.6058386393955776, 'epoch': 2.95}


  trainer = Trainer(
100%|██████████| 2376/2376 [20:41<00:00,  1.91it/s]


Preparing dataset with 8625 texts and 8625 labels


  return fn(*args, **kwargs)
                                                  
 33%|███▎      | 121/363 [26:11<49:51, 12.36s/it]

{'eval_loss': 1.140370488166809, 'eval_runtime': 45.8647, 'eval_samples_per_second': 18.794, 'eval_steps_per_second': 1.177, 'epoch': 1.0}


  return fn(*args, **kwargs)
                                                   
 67%|██████▋   | 243/363 [52:07<23:21, 11.68s/it]

{'eval_loss': 1.0746480226516724, 'eval_runtime': 45.876, 'eval_samples_per_second': 18.79, 'eval_steps_per_second': 1.177, 'epoch': 2.0}


  return fn(*args, **kwargs)
                                                   
100%|██████████| 363/363 [1:17:49<00:00, 12.92s/it]

{'eval_loss': 1.0968060493469238, 'eval_runtime': 47.7196, 'eval_samples_per_second': 18.064, 'eval_steps_per_second': 1.132, 'epoch': 2.99}


100%|██████████| 363/363 [1:17:50<00:00, 12.87s/it]


{'train_runtime': 4670.0994, 'train_samples_per_second': 4.987, 'train_steps_per_second': 0.078, 'train_loss': 0.009961714100903388, 'epoch': 2.99}


  trainer = Trainer(
100%|██████████| 2376/2376 [19:43<00:00,  2.01it/s]

Predictions: tensor([0, 0, 0,  ..., 0, 0, 0])





In [10]:
import torch

def count_predictions(predictions):
    unique_elements, counts = torch.unique(predictions, return_counts=True)
    return dict(zip(unique_elements.tolist(), counts.tolist()))

# Example usage:
#predictions = torch.tensor([1, 2, 2, 3, 3, 3, 4, 4, 4, 4])
print(count_predictions(predictions))
# Output: {1: 1, 2: 2, 3: 3, 4: 4}

{0: 18053, 1: 951}


In [11]:
predictions_list = predictions.tolist()

unlabelled_pred = pd.DataFrame({'tweet unlabeled': unlabelled_texts, 'pred': predictions_list})
unlabelled_pred

Unnamed: 0,tweet unlabeled,pred
0,Tatyana and Tatyana both need this lawnmower.,0
1,Mikhail's talking trash. how's that for a pr...,0
2,"yeah, Mikhail had good intentions. but the...",0
3,again i am pretty sure china never called its...,0
4,... which sickened &gt;3000 un troops during t...,0
...,...,...
18999,let ukraine take care of itself. this so-...,0
19000,your daily reminder that 141 un countries vo...,0
19001,obvious to...,0
19002,who is this f*cking idiot? biden isn't a neoco...,0


In [12]:
unlabelled_pred.loc[unlabelled_pred['pred'] == 1].to_csv('unlabelled_pred.csv', index=False)

In [14]:
labelled_texts = hs_df_labelled["Tweet Treated"].tolist()
labelled_labels = hs_df_labelled["Label_bool"].tolist()

unlabelled_texts = hs_df_unlabelled["Tweet Treated"].tolist()

In [15]:
from collections import Counter

def count_predictions(predictions):
    return Counter(predictions)

# Example usage:
print(count_predictions(labelled_labels))
# Output: Counter({4: 4, 3: 3, 2: 2, 1: 1})

Counter({0: 790, 1: 206})


In [17]:
predictions_list = ssl_model.predict(unlabelled_texts)

predictions_list = torch.argmax(predictions_list, dim=1)

  trainer = Trainer(
100%|██████████| 2376/2376 [20:43<00:00,  1.91it/s]


In [19]:
from collections import Counter

def count_predictions(predictions):
    return Counter(predictions)

# Example usage:
print(count_predictions(labelled_labels))

Counter({0: 790, 1: 206})


In [None]:

# Save the model state dictionary
torch.save(ssl_model.model.state_dict(), 'models/ssl_model_state_dict.pth')

In [25]:
# Create a new instance of LlamaSemiSupervised
ssl_model_loaded = LlamaSemiSupervised(num_labels=2)

# Load the model state dictionary
ssl_model_loaded.model.load_state_dict(torch.load('models/ssl_model_state_dict.pth', map_location=device))

# Move the model to the appropriate device
ssl_model_loaded.model.to(device)

Available GPU Memory: 12.878086144 GB
Initializing model with:
Model: meta-llama/Llama-3.2-1B
Number of Labels: 2


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  ssl_model_loaded.model.load_state_dict(torch.load('models/ssl_model_state_dict.pth', map_location=device))


PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): LlamaForSequenceClassification(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 2048, padding_idx=128001)
        (layers): ModuleList(
          (0-15): 16 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear8bitLt(
                (base_layer): Linear8bitLt(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): Mod

In [26]:
predictions_list = ssl_model_loaded.predict(unlabelled_texts)

predictions_list = torch.argmax(predictions_list, dim=1)

  trainer = Trainer(
100%|██████████| 2376/2376 [19:54<00:00,  1.99it/s]


In [28]:
import torch

def count_predictions(predictions):
    unique_elements, counts = torch.unique(predictions, return_counts=True)
    return dict(zip(unique_elements.tolist(), counts.tolist()))

# Example usage:
#predictions = torch.tensor([1, 2, 2, 3, 3, 3, 4, 4, 4, 4])
print(count_predictions(predictions_list))
# Output: {1: 1, 2: 2, 3: 3, 4: 4}

{0: 18303, 1: 701}


In [None]:
ssl_model.model.save_pretrained('models/ssl_model_test')