In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import load_dataset


In [None]:
!pip install datasets

In [None]:
# Report the GPU model, driver/CUDA versions and current GPU memory usage.
!nvidia-smi

Tue Feb  4 18:26:09 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   62C    P0             30W /   70W |   14928MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:

import gc

# Best-effort cleanup so this cell can be re-run in the same kernel: drop any
# model/trainer left over from a previous execution *before* loading a new
# copy, otherwise both copies stay resident on the GPU at the same time.
# NOTE(review): the recorded run ends in a CUDA out-of-memory error with the
# GPU already almost full; presumably stale objects from earlier runs
# contributed -- confirm. Restarting the kernel is the only guaranteed reset.
model = None
trainer = None
gc.collect()
torch.cuda.empty_cache()

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Load the tokenizer and model, and place the model on the selected device
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

# Load the custom question/answer dataset from a JSON file
dataset = load_dataset('json', data_files='/dataset.json')

# Check the dataset structure
print(dataset)

# Hold out 20% of the training data for validation. A fixed seed keeps the
# split identical across runs so results are reproducible.
SPLIT_SEED = 42
train_test_split = dataset['train'].train_test_split(test_size=0.2, seed=SPLIT_SEED)

# Rename the splits
train_dataset = train_test_split['train']
validation_dataset = train_test_split['test']

# Check the structure of each split
print(train_dataset)
print(validation_dataset)

# Adjust the tokenization function based on the dataset structure
def tokenize_function(examples, max_length=512):
    """Turn a batch of question/answer pairs into causal-LM training features.

    Each question and its answer are joined into ONE sequence
    ("<question>\\n<answer><eos>") and tokenized together. For a causal
    language model the labels must be the same token sequence as the inputs,
    so tokenizing questions and answers separately (and using the answer ids
    as labels for the question ids) does not yield a meaningful objective.

    Args:
        examples: Batch mapping with 'question' and 'answer' lists of strings
            (the shape `Dataset.map(..., batched=True)` passes in).
        max_length: Fixed length every sequence is padded/truncated to.
            Without an explicit bound, padding="max_length" pads to the
            tokenizer's model maximum, which can be very large.

    Returns:
        dict with 'input_ids', 'attention_mask' and 'labels', each a list of
        per-example lists of length `max_length`. Padded positions carry the
        label -100 so they are ignored by the loss.
    """
    # Relies on the notebook-level `tokenizer`.
    eos_token = tokenizer.eos_token or ""

    # One training text per example; the trailing EOS marks where the answer ends.
    texts = [
        f"{question}\n{answer}{eos_token}"
        for question, answer in zip(examples['question'], examples['answer'])
    ]

    # Plain Python lists are returned (no return_tensors): `map` stores them
    # as columns and the dataset format is set to torch afterwards.
    encoded = tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

    # Mask by attention_mask rather than by pad-token id: the pad token may be
    # the same id as EOS, and the real EOS must remain a training target.
    labels = [
        [token_id if keep == 1 else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(encoded['input_ids'], encoded['attention_mask'])
    ]

    return {
        'input_ids': encoded['input_ids'],
        'attention_mask': encoded['attention_mask'],
        'labels': labels,
    }

# Tokenize both splits with the same batched tokenization function
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_validation = validation_dataset.map(tokenize_function, batched=True)

# Keep the tokenized splits together under their split names
tokenized_datasets = {
    'train': tokenized_train,
    'validation': tokenized_validation
}

# Expose only the model inputs, as PyTorch tensors, on every split
model_columns = ['input_ids', 'attention_mask', 'labels']
for split_dataset in tokenized_datasets.values():
    split_dataset.set_format(type='torch', columns=model_columns)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to=["none"],  # Disable W&B logging
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
)

# Start training
# NOTE(review): the recorded run of this cell stopped with a CUDA
# out-of-memory error; full fine-tuning with these settings may not fit on
# the GPU in use -- confirm before relying on this cell.
trainer.train()

# Save the model AND the tokenizer. The Trainer was not given the tokenizer,
# so save_model() alone leaves the output directory without tokenizer files
# and it cannot be reloaded on its own.
fine_tuned_dir = "./fine_tuned_model"
trainer.save_model(fine_tuned_dir)
tokenizer.save_pretrained(fine_tuned_dir)


Using device: cuda


Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 4
    })
})
Dataset({
    features: ['question', 'answer'],
    num_rows: 3
})
Dataset({
    features: ['question', 'answer'],
    num_rows: 1
})


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]



OutOfMemoryError: CUDA out of memory. Tried to allocate 1.09 GiB. GPU 0 has a total capacity of 14.74 GiB of which 166.12 MiB is free. Process 7700 has 14.58 GiB memory in use. Of the allocated memory 13.87 GiB is allocated by PyTorch, and 602.18 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:

import gc

# Best-effort cleanup: release any model/trainer still referenced from an
# earlier cell before loading another copy, so two copies of the weights are
# not resident on the GPU at once.
model = None
trainer = None
gc.collect()
torch.cuda.empty_cache()

# Select the device here so this cell does not depend on another cell
# having defined `device`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer and model
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

# Load the custom dataset from a JSON file; adjust `data_files` to point at
# your own data (other builders such as 'csv' take the same argument).
dataset = load_dataset('json', data_files='/content/sample_data/data.json')

# Tokenize the dataset
def tokenize_function(examples, max_length=512):
    """Tokenize a batch of records into causal-LM training features.

    Args:
        examples: Batch mapping of column name -> list of values (the shape
            `Dataset.map(..., batched=True)` passes in). If a 'text' column
            is present it is used as-is. Otherwise every record is
            serialized as one "column: value" line per column.
            NOTE(review): the fallback exists because the dataset printed in
            this notebook has no 'text' column; confirm this serialization
            is the text you actually want to train on.
        max_length: Fixed length every sequence is padded/truncated to.
            Without an explicit bound, padding="max_length" pads to the
            tokenizer's model maximum.

    Returns:
        dict with the tokenizer outputs plus 'labels': a copy of 'input_ids'
        in which padded positions are replaced by -100 so the loss ignores
        them.
    """
    if 'text' in examples:
        texts = examples['text']
    else:
        columns = list(examples.keys())
        num_rows = len(examples[columns[0]]) if columns else 0
        texts = [
            "\n".join(f"{column}: {examples[column][row]}" for column in columns)
            for row in range(num_rows)
        ]

    # Relies on the notebook-level `tokenizer`.
    encoded = tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

    features = dict(encoded)
    # Mask via attention_mask (not pad id): pad and EOS may share an id.
    features['labels'] = [
        [token_id if keep == 1 else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(features['input_ids'], features['attention_mask'])
    ]
    return features

tokenized_datasets = dataset.map(tokenize_function, batched=True)


def _with_labels(batch):
    """Derive causal-LM labels: a copy of input_ids with padding set to -100."""
    return {
        'labels': [
            [token_id if keep == 1 else -100 for token_id, keep in zip(ids, mask)]
            for ids, mask in zip(batch['input_ids'], batch['attention_mask'])
        ]
    }


# The Trainer needs a 'labels' column to compute a loss; create it when the
# tokenization step did not.
if 'labels' not in tokenized_datasets['train'].column_names:
    tokenized_datasets = tokenized_datasets.map(_with_labels, batched=True)

# load_dataset(..., data_files=<single file>) only yields a 'train' split, so
# a validation split has to be derived (or evaluation skipped when there are
# too few rows to split).
if 'validation' in tokenized_datasets:
    train_split = tokenized_datasets['train']
    eval_split = tokenized_datasets['validation']
elif tokenized_datasets['train'].num_rows >= 2:
    held_out = tokenized_datasets['train'].train_test_split(test_size=0.2, seed=42)
    train_split = held_out['train']
    eval_split = held_out['test']
else:
    train_split = tokenized_datasets['train']
    eval_split = None

# Set the format for PyTorch, exposing only the model inputs
model_columns = ['input_ids', 'attention_mask', 'labels']
train_split.set_format(type='torch', columns=model_columns)
if eval_split is not None:
    eval_split.set_format(type='torch', columns=model_columns)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    # Per-epoch evaluation requires an eval dataset; disable it otherwise.
    evaluation_strategy="epoch" if eval_split is not None else "no",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to=["none"],  # Disable W&B logging
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
)

# Start training
trainer.train()

# Save the model together with its tokenizer so the directory can be
# reloaded on its own (the Trainer was not given the tokenizer).
fine_tuned_dir = "./fine_tuned_model"
trainer.save_model(fine_tuned_dir)
tokenizer.save_pretrained(fine_tuned_dir)

In [None]:
# Show the available splits with their column names and row counts.
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['college', 'department', 'affiliation', 'university', 'hod_message', 'mission', 'history_of_department', 'courses'],
        num_rows: 1
    })
})


In [None]:
prompt = "hi"

# Tokenize the prompt and move the tensors to the device the model lives on;
# generate() expects its inputs on the same device as the model.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Pass the attention mask and an explicit pad token id: when the pad token
# equals the EOS token, generate() cannot infer the mask from the input ids
# and warns that results may be unreliable.
generate_ids = model.generate(
    inputs.input_ids,
    attention_mask=inputs.attention_mask,
    pad_token_id=tokenizer.eos_token_id,
    max_length=30,  # total length in tokens, prompt included
)
output = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

print(output)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


covert this python code to rust: f= f, fe, e, efe, x, ex  --> f = f, fe
