In [2]:
from transformers import AutoTokenizer, Qwen2ForCausalLM

model = Qwen2ForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B")

prompt = "Hey, are you conscious? Can you talk to me?"
inputs = tokenizer(prompt, return_tensors="pt")

# Generate
generate_ids = model.generate(inputs.input_ids, max_length=30)
tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Both `max_new_tokens` (=2048) and `max_length`(=30) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


"Hey, are you conscious? Can you talk to me? I'm a robot. I'm a little bit confused. What do you want me to do? I'm sorry, but I'm not a robot. I'm a computer program. I don't have emotions or consciousness like humans do. I'm here to help you with any questions or tasks you have. If you have any questions, feel free to ask."

In [8]:
from datasets import load_dataset, DatasetDict

# Load the dataset from Hugging Face Hub
dataset_id = "ucirvine/sms_spam"
print(f"Loading dataset: {dataset_id}")
raw_datasets = load_dataset(dataset_id)

# --- Check dataset structure ---
print("\nDataset structure:")
print(raw_datasets)
print("\nSample data:")
print(raw_datasets['train'][0])
# Expected columns: 'label' (int: 0=ham, 1=spam), 'sms' (string)

# --- Split the dataset ---
# The dataset only has a 'train' split, so we create train/test splits
train_test_split = raw_datasets['train'].train_test_split(test_size=0.2, seed=42, stratify_by_column='label') # 80% train, 20% test

# Create a new DatasetDict with 'train' and 'test' splits
split_datasets = DatasetDict({
    'train': train_test_split['train'],
    'test': train_test_split['test']
})
print("\nDataset structure after splitting:")
print(split_datasets)

# Define label mapping (matches the dataset's 0/1 scheme)
num_labels = 2
id2label = {0: "ham", 1: "spam"}
label2id = {"ham": 0, "spam": 1}

print(f"\nNumber of labels: {num_labels}")
print(f"id2label mapping: {id2label}")

Loading dataset: ucirvine/sms_spam


Generating train split: 100%|██████████| 5574/5574 [00:00<00:00, 522238.49 examples/s]


Dataset structure:
DatasetDict({
    train: Dataset({
        features: ['sms', 'label'],
        num_rows: 5574
    })
})

Sample data:
{'sms': 'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...\n', 'label': 0}

Dataset structure after splitting:
DatasetDict({
    train: Dataset({
        features: ['sms', 'label'],
        num_rows: 4459
    })
    test: Dataset({
        features: ['sms', 'label'],
        num_rows: 1115
    })
})

Number of labels: 2
id2label mapping: {0: 'ham', 1: 'spam'}





In [9]:
from transformers import AutoTokenizer

model_id = "Qwen/Qwen2-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

# Set padding token if not set
if tokenizer.pad_token is None:
    if tokenizer.eos_token:
        tokenizer.pad_token = tokenizer.eos_token
        print(f"Set pad_token to eos_token: {tokenizer.pad_token}")
    else:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        print("Added new [PAD] token.")

def tokenize_function(examples):
    # Use the 'sms' column from the dataset
    return tokenizer(examples["sms"], padding="max_length", truncation=True, max_length=128) # Increased max_length slightly just in case

print("\nTokenizing dataset...")
tokenized_datasets = split_datasets.map(tokenize_function, batched=True)

# Remove original sms column and any other unnecessary columns
tokenized_datasets = tokenized_datasets.remove_columns(["sms"])
tokenized_datasets.set_format("torch")

print("Sample tokenized data:", tokenized_datasets['train'][0])


Tokenizing dataset...


Map: 100%|██████████| 4459/4459 [00:00<00:00, 19700.53 examples/s]
Map: 100%|██████████| 1115/1115 [00:00<00:00, 19028.95 examples/s]

Sample tokenized data: {'label': tensor(0), 'input_ids': tensor([    35,    544,    498,    614,   1943,   3010,    198, 151643, 151643,
        151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643,
        151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643,
        151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643,
        151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643,
        151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643,
        151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643,
        151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643,
        151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643,
        151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643,
        151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643,
        151643, 151643, 151643, 151643, 151643, 151643, 151643,


