In [1]:
!pip install -U -q transformers datasets bitsandbytes trl peft

In [2]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

#model_name = "cognitya/llama-2-1B-instruct-edited"
model_name = "meta-llama/Llama-3.2-1B-Instruct"

config_8bit = BitsAndBytesConfig(load_in_8bit=True)

model_8bit = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=config_8bit,
    device_map="auto",
    trust_remote_code=True,
)

In [4]:
model_8bit

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear8bitLt(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear8bitLt(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear8bitLt(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear8bitLt(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear8bitLt(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear8bitLt(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear8bitLt(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMS

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left", trust_remote_code=True)

In [6]:
from datasets import load_dataset
dataset = load_dataset("MattCoddity/dockerNLcommands")

In [7]:
dataset['train'][0]

{'input': 'Give me a list of containers that have the Ubuntu image as their ancestor.',
 'output': "docker ps --filter 'ancestor=ubuntu'",
 'instruction': 'translate this sentence in docker command'}

In [8]:
from datasets import DatasetDict
train_test_split = dataset['train'].train_test_split(test_size=0.2,seed=42)
dataset = DatasetDict(
    {
        'train':train_test_split['train'], 
        'validation' :train_test_split['test']
    }
)
dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 1932
    })
    validation: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 483
    })
})

In [9]:
def chat_template(example):
    mesagges =[
        {'role':'system', 'content':example['instruction']},
        {'role':'user', 'content':example['input']},
        {'role':'assistant', 'content':example['output']},
    ]

    return {'text':mesagges}

dataset = dataset.map(chat_template)
dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'instruction', 'text'],
        num_rows: 1932
    })
    validation: Dataset({
        features: ['input', 'output', 'instruction', 'text'],
        num_rows: 483
    })
})

In [10]:
dataset['train']['text'][0]

[{'content': 'translate this sentence in docker command', 'role': 'system'},
 {'content': 'Find the repository, tag, and ID of the images that were created before the latest nginx image.',
  'role': 'user'},
 {'content': 'docker images -f "before=nginx:latest" --format "{{.Repository}},{{.Tag}},{{.ID}}"',
  'role': 'assistant'}]

In [11]:
tokenizer_mistral = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")
tokenizer_mistral.apply_chat_template(dataset['train']['text'], tokenize=False)

['<s> [INST] translate this sentence in docker command\n\nFind the repository, tag, and ID of the images that were created before the latest nginx image. [/INST] docker images -f "before=nginx:latest" --format "{{.Repository}},{{.Tag}},{{.ID}}"</s>',
 "<s> [INST] translate this sentence in docker command\n\nPlease show me the Docker containers that have exited and are related to the mongo image. [/INST] docker ps -a --filter 'status=exited' --filter 'ancestor=mongo'</s>",
 '<s> [INST] translate this sentence in docker command\n\nTo gain access, submit your username and password from the text file for successful login. [/INST] "docker login --username=johndoe --password-stdin < ~/mypassword.txt"</s>',
 '<s> [INST] translate this sentence in docker command\n\nCease the execution of confusioner. [/INST] "docker kill confusioner"</s>',
 "<s> [INST] translate this sentence in docker command\n\nList all running containers created later than 3e33ad9a0b3e. [/INST] docker ps --filter 'since=3e3

In [12]:
tokenizer.apply_chat_template(dataset['train']['text'][0],tokenize=False)

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 07 Apr 2025\n\ntranslate this sentence in docker command<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nFind the repository, tag, and ID of the images that were created before the latest nginx image.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\ndocker images -f "before=nginx:latest" --format "{{.Repository}},{{.Tag}},{{.ID}}"<|eot_id|>'

In [13]:
def apply_chat_temp(example):
    new_text = tokenizer.apply_chat_template(example['text'],tokenize=False)

    return {'text':new_text}


dataset = dataset.map(apply_chat_temp)
dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'instruction', 'text'],
        num_rows: 1932
    })
    validation: Dataset({
        features: ['input', 'output', 'instruction', 'text'],
        num_rows: 483
    })
})

In [14]:
dataset['train']['text'][0]

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 05 Apr 2025\n\ntranslate this sentence in docker command<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nFind the repository, tag, and ID of the images that were created before the latest nginx image.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\ndocker images -f "before=nginx:latest" --format "{{.Repository}},{{.Tag}},{{.ID}}"<|eot_id|>'

In [15]:
tokenizer(dataset['train']['text'][0])

{'input_ids': [128000, 128000, 128006, 9125, 128007, 271, 38766, 1303, 33025, 2696, 25, 6790, 220, 2366, 18, 198, 15724, 2696, 25, 220, 2304, 5186, 220, 2366, 20, 271, 14372, 420, 11914, 304, 27686, 3290, 128009, 128006, 882, 128007, 271, 10086, 279, 12827, 11, 4877, 11, 323, 3110, 315, 279, 5448, 430, 1051, 3549, 1603, 279, 5652, 71582, 2217, 13, 128009, 128006, 78191, 128007, 271, 29748, 5448, 482, 69, 330, 15145, 28, 74661, 25, 19911, 1, 1198, 2293, 48319, 13, 4727, 39254, 3052, 13, 5786, 39254, 3052, 13, 926, 24275, 128009], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [16]:
def tokenize_fn(example):
    return tokenizer(example['text'])


tokenized_dataset = dataset.map(tokenize_fn, batched=True, remove_columns=['input','output','instruction','text'])

tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1932
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 483
    })
})

In [17]:
#The data collector ensures that all input vectors are aligned to the same size, as the GPU expects inputs of uniform dimensions. To achieve this, the data collector pads each vector to match the size of the largest one.

In [18]:
from transformers import DataCollatorForLanguageModeling

data_collector = DataCollatorForLanguageModeling(tokenizer,mlm=False, return_tensors="pt")

In [19]:
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token

'<|eot_id|>'

In [20]:
from peft import LoraConfig, get_peft_model
import copy

model_8bit_clone = copy.deepcopy(model_8bit)

lora_config = LoraConfig(r=32, lora_alpha=32, target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
                          lora_dropout=0.05, bias="none", task_type="CAUSAL_LM")


model_8bit_lora = get_peft_model(model_8bit_clone, lora_config)

model_8bit_lora.print_trainable_parameters()

trainable params: 22,544,384 || all params: 1,258,358,784 || trainable%: 1.7916


In [28]:
base_model = AutoModelForCausalLM.from_pretrained(


                                                  model_name,

                                                  device_map= "auto",
                                                  trust_remote_code =True

                                                  )

In [21]:
model_8bit

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear8bitLt(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear8bitLt(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear8bitLt(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear8bitLt(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear8bitLt(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear8bitLt(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear8bitLt(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMS

In [22]:
model_8bit_lora

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 2048)
        (layers): ModuleList(
          (0-15): 16 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear8bitLt(
                (base_layer): Linear8bitLt(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj):

In [None]:
from transformers import TrainingArguments
from trl import SFTTrainer


training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    eval_steps=10,
    # evaluation_strategy="steps",
    save_steps=20,
    save_strategy="steps",
    # num_train_epochs=1,
    max_steps=60,
    learning_rate=3e-5,
    weight_decay=0.01,
    warmup_steps=5,
    fp16=True,
    gradient_accumulation_steps=4,
    optim="adamw_torch",
    logging_steps=10,
    report_to="none"
)


In [24]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1932
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 483
    })
})

In [25]:
trainer = SFTTrainer(

                     model =model_8bit_lora,
                     train_dataset =tokenized_dataset['train'],
                     eval_dataset = tokenized_dataset['validation'],
                     args = training_args,
                     data_collator= data_collector,
                      processing_class = tokenizer,
)

trainer.train()

Truncating train dataset:   0%|          | 0/1932 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/483 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
  attn_output = torch.nn.functional.scaled_dot_product_attention(


Step,Training Loss
10,4.438
20,2.66
30,1.8954
40,1.4473
50,1.2783
60,1.1975


TrainOutput(global_step=60, training_loss=2.1527602513631185, metrics={'train_runtime': 42.7572, 'train_samples_per_second': 11.226, 'train_steps_per_second': 1.403, 'total_flos': 221401738690560.0, 'train_loss': 2.1527602513631185})

In [None]:
model_8bit_lora

In [None]:
# base_model.push_to_hub("abutair1/Llama3.2-doker-egitim")
# tokenizer.push_to_hub("abutair1/Llama3.2-doker-egitim")

In [26]:
dataset['train'][1]

{'input': 'Please show me the Docker containers that have exited and are related to the mongo image.',
 'output': "docker ps -a --filter 'status=exited' --filter 'ancestor=mongo'",
 'instruction': 'translate this sentence in docker command',
 'text': "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 05 Apr 2025\n\ntranslate this sentence in docker command<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nPlease show me the Docker containers that have exited and are related to the mongo image.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\ndocker ps -a --filter 'status=exited' --filter 'ancestor=mongo'<|eot_id|>"}

In [29]:
import torch
from transformers import pipeline

pipe = pipeline(
    "text-generation",
    model=base_model,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    tokenizer = tokenizer
)
messages = [
    {"role": "system", "content": "translate this sentence in docker command"},
    {"role": "user", "content": "Please show me the Docker containers that have exited and are related to the mongo image."},
]
outputs = pipe(
    messages,
    max_new_tokens=256,
)
print(outputs[0]["generated_text"][-1]['content'])

Device set to use cuda:0


To find Docker containers that have exited and are related to the MongoDB image, you can use the `docker ps` command with the `-a` option to list all containers, and then filter the output to only include containers that have exited. Here's an example:

```bash
docker ps -a | grep mongo
```

This will list all containers that have exited, and you can filter the output to only include containers that are related to the MongoDB image by checking the `Image` field in the output. For example:

```bash
docker ps -a | grep mongo | grep -v 'latest'
```

This will list all containers that are related to the MongoDB image (i.e., they have a different `Image` field than `latest`), and exclude any containers that are the latest image.

Alternatively, you can use the `docker ps` command with the `-f` option to filter the output by a specific field. For example:

```bash
docker ps -a -f image= mongo
```

This will list all containers that have exited, and only include containers that have an `Image

In [30]:
import torch
from transformers import pipeline

pipe = pipeline(
    "text-generation",
    model=base_model,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    tokenizer = tokenizer
)
messages = [
    {"role": "system", "content": "translate this sentence in docker command"},
    {"role": "user", "content": "Please show me the Docker containers that have exited and are related to the mongo image."},
]
outputs = pipe(
    messages,
    max_new_tokens=256,
    temperature = 0.5,
    top_p  = 0.9,
    top_k = 10,
    do_sample = True,
    repetition_penalty = 1.2


)
print(outputs[0]["generated_text"][-1]['content'])

Device set to use cuda:0


To get a list of all running or stopped MongoDB containers, you can use the following Docker commands:

1. To see all running containers:
```bash
docker ps -a
```
This will display a list of all containers currently running on your system.

2. To see only container names (not their status):
```bash
docker ps --all
```

3. To stop a specific container by name:
```bash
docker stop <container_name>
```
Replace `<container_name>` with the actual name of the container you want to stop.

4. To start a new MongoDB instance from scratch:
```bash
docker run -d --name my-mongo-instance \
    -p 27017:27017 \# Run a new MongoDB server
```
Here's what each option does:

* `-d`: runs the container in detached mode, so it won't appear as an item in `docker ps`.
* `--name`: gives the container a unique name.
* `-p` : maps port 27017 inside the container to port 27017 outside the container for access via http://localhost/27018
* `\ # Use a comment here if needed.` 
* `my-mongo-instance`: give a name t