Install Necessary Libraries

In [6]:
# Install necessary libraries
!pip install -q transformers accelerate datasets evaluate scikit-learn huggingface_hub pandas
!pip install -U bitsandbytes
!pip install peft

import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from huggingface_hub import login
import torch
import numpy as np
from sklearn.utils.class_weight import compute_class_weight
from transformers import BitsAndBytesConfig, AutoModelForSequenceClassification
from peft import LoraConfig, get_peft_model
from transformers import DataCollatorWithPadding
from sklearn.metrics import accuracy_score, balanced_accuracy_score, classification_report
import torch.nn.functional as F




Hugging Face authentication

In [7]:
# Hugging Face authentication
login(token="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")


Load datasets

In [11]:
# File paths for datasets
file_paths = {
    "A": "pitsA.csv",
    "B": "pitsB.csv",
    "C": "pitsC.csv",
    "D": "pitsD.csv",
    "E": "pitsE.csv",
    "F": "pitsF.csv"
}
dfs = {pit: pd.read_csv(path) for pit, path in file_paths.items()}

Data Preprocessing

In [12]:
# Combine all datasets into one
combined_data = pd.concat(dfs.values(), ignore_index=True)

# Check for missing columns
if 'Subject' not in combined_data.columns or 'Description' not in combined_data.columns:
    raise ValueError("Missing 'Subject' or 'Description' column in the dataset.")

# Create 'combined_text' column
combined_data['combined_text'] = (
    combined_data['Subject'].fillna('') + " " + combined_data['Description'].fillna('')
)

# Display dataset sample
print("Dataset sample:")
print(combined_data[['Subject', 'Description', 'combined_text']].head())

# Handle missing values in the 'Severity' column
print("Checking for missing values in 'Severity':")
print(combined_data['Severity'].isnull().sum())

# Drop rows with missing 'Severity' values
combined_data = combined_data.dropna(subset=['Severity'])

# Reindex 'Severity' column to start from 0
label_mapping = {label: idx for idx, label in enumerate(sorted(combined_data['Severity'].unique()))}
combined_data['Severity'] = combined_data['Severity'].map(label_mapping)

Dataset sample:
                                             Subject  \
0                   Build 5.3: Unitialized Variables   
1  Build 5.3 FSW: Typecast Mismatch in Memory Dea...   
2             Build 5.3 FSW: Parameter Type Mismatch   
3             Build 5.3 FSW: Unchecked Return Status   
4  Build 5.3 FSW: Typecast Mismatch in Memory Dea...   

                                         Description  \
0  Filename: sts_df.c Function: TSdf_Undervolt_Co...   
1  File: inflateData.c Function: huft_free Line #...   
2  File: ProjectAmain.c Function: inflateTable Li...   
3  File: acprocesscommands.c Function: AC_Generat...   
4  File: inflateData.c Function: huft_free Line #...   

                                       combined_text  
0  Build 5.3: Unitialized Variables Filename: sts...  
1  Build 5.3 FSW: Typecast Mismatch in Memory Dea...  
2  Build 5.3 FSW: Parameter Type Mismatch File: P...  
3  Build 5.3 FSW: Unchecked Return Status File: a...  
4  Build 5.3 FSW: Typecast Mismatch

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_data['Severity'] = combined_data['Severity'].map(label_mapping)


Data Split

In [13]:
# Split into training and testing sets (80% training, 20% testing)
train_df, test_df = train_test_split(
    combined_data,
    test_size=0.2,
    stratify=combined_data['Severity'],
    random_state=42
)
print(f"Training set size: {len(train_df)}")
print(f"Testing set size: {len(test_df)}")


Training set size: 3220
Testing set size: 805


Tokenization

In [15]:
# Load tokenizer
model_name = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Set the pad token for the tokenizer and model
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id
print(f"Pad token set to: {tokenizer.pad_token}")

train_df['tokenized'] = train_df['combined_text'].apply(lambda x: tokenizer(x, truncation=True, max_length=128))
train_df['input_ids'] = train_df['tokenized'].apply(lambda x: x['input_ids'])
train_df['attention_mask'] = train_df['tokenized'].apply(lambda x: x['attention_mask'])

test_df['tokenized'] = test_df['combined_text'].apply(lambda x: tokenizer(x, truncation=True, max_length=128))
test_df['input_ids'] = test_df['tokenized'].apply(lambda x: x['input_ids'])
test_df['attention_mask'] = test_df['tokenized'].apply(lambda x: x['attention_mask'])

train_df = train_df.drop(columns=['tokenized'])
test_df = test_df.drop(columns=['tokenized'])


Pad token set to: <|end_of_text|>


Create Hugging Face datasets

In [16]:

train_dataset = Dataset.from_pandas(train_df[['input_ids', 'attention_mask', 'Severity']])
test_dataset = Dataset.from_pandas(test_df[['input_ids', 'attention_mask', 'Severity']])

# Rename the 'Severity' column to 'labels' in both train and test datasets
train_dataset = train_dataset.rename_column("Severity", "labels")
test_dataset = test_dataset.rename_column("Severity", "labels")

# Combine into a DatasetDict
dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})


Class weights

In [17]:
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.array(sorted(combined_data['Severity'].unique())),
    y=combined_data['Severity']
)
class_weights = torch.tensor(class_weights, dtype=torch.float32)

print("Class weights:", class_weights)

Class weights: tensor([2.6342, 0.4591, 0.7942, 5.4688])


Load the base model

In [18]:
num_classes = len(label_mapping)
base_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_classes,
    device_map="auto"
)

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Configure PEFT LoRA , Quantization

In [19]:
lora_config = LoraConfig(
    r=16,
    lora_alpha=8,
    target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj'],
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_CLS"
)
model_with_lora = get_peft_model(base_model, lora_config)

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)
model = model_with_lora.to('cuda' if torch.cuda.is_available() else 'cpu')

# Set padding token ID in model config
model.config.pad_token_id = tokenizer.pad_token_id

# Prepare data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


Define a custom trainer

In [20]:

class CustomTrainer(Trainer):
    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss = F.cross_entropy(logits, labels, weight=self.class_weights)
        return (loss, outputs) if return_outputs else loss


Training arguments

In [24]:
training_args = TrainingArguments(
    output_dir="classification_output",
    learning_rate=1e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_steps=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="none"
)



Initialize trainer

In [25]:
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    class_weights=class_weights.to(training_args.device)
)

  super().__init__(*args, **kwargs)


Train Model

In [26]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,1.4102,1.048348
2,0.9422,0.918298
3,0.2643,0.99225


TrainOutput(global_step=1209, training_loss=0.8651473926452687, metrics={'train_runtime': 1595.1619, 'train_samples_per_second': 6.056, 'train_steps_per_second': 0.758, 'total_flos': 7244917745614848.0, 'train_loss': 0.8651473926452687, 'epoch': 3.0})

Evaluation function

In [27]:

def evaluate_model(test_df, model):
    sentences = test_df['combined_text'].tolist()
    labels = test_df['Severity'].tolist()

    batch_size = 32
    all_outputs = []
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)

    for i in range(0, len(sentences), batch_size):
        batch_sentences = sentences[i:i + batch_size]
        inputs = tokenizer(batch_sentences, return_tensors="pt", padding=True, truncation=True, max_length=128)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
            all_outputs.append(outputs.logits)

    final_outputs = torch.cat(all_outputs, dim=0)
    test_df['predictions'] = final_outputs.argmax(axis=1).cpu().numpy()

    print("Classification Report:")
    print(classification_report(labels, test_df['predictions']))

    print("Balanced Accuracy:", balanced_accuracy_score(labels, test_df['predictions']))
    print("Accuracy:", accuracy_score(labels, test_df['predictions']))

# Evaluate the model
evaluate_model(test_df, model)


Classification Report:
              precision    recall  f1-score   support

           0       0.59      0.77      0.67        77
           1       0.76      0.76      0.76       438
           2       0.63      0.51      0.56       253
           3       0.25      0.43      0.31        37

    accuracy                           0.66       805
   macro avg       0.56      0.62      0.58       805
weighted avg       0.68      0.66      0.67       805

Balanced Accuracy: 0.6160638460370444
Accuracy: 0.6645962732919255
