In [None]:
!pip install kagglehub[pandas-datasets]



In [None]:
import kagglehub
from kagglehub import KaggleDatasetAdapter
from google.colab import files
uploaded = files.upload() 
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
dataset="aayush249/nl2cmd" 
file="data.json"             
df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  dataset,
  file
)
print("Dataset loaded successfully!")
print("First 5 records:")
print(df.head())

Saving kaggle.json to kaggle.json


  df = kagglehub.load_dataset(


Downloading from https://www.kaggle.com/api/v1/datasets/download/aayush249/nl2cmd?dataset_version_number=1&file_name=chatGPT_generated_data.json...


100%|██████████| 3.44M/3.44M [00:00<00:00, 7.20MB/s]


Dataset loaded successfully!
First 5 records:
                                                        1      \
invocation  Move the first randomly sorted file in the cur...   
cmd                     mv $(ls -A|sort -R|head -1) ~/$RANDOM   

                                                        2      \
invocation  Find all files and search for the string "stri...   
cmd               find -name '*.*' | xargs grep -l '*string*'   

                                                        3      \
invocation                                      Output:\nKV6X   
cmd         printf "%s" $(sed -n "$(shuf -i 1-4 -n 1)p" /d...   

                                                        4      \
invocation  Take one random word from the /usr/share/dict/...   
cmd         shuf -n 1 /usr/share/dict/words | xargs -I{} e...   

                                                        5      \
invocation  Find all files with the extension .jpg in the ...   
cmd           find . -name .\*.jpg -exec

In [None]:
import logging
import json
import re
import os
!pip install datasets

!pip install evaluate
from tqdm import tqdm
from datasets import Dataset
import evaluate
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    Trainer,
    TrainingArguments,
)
import torch
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)


#Data Preparation

def preprocessInvocation(text: str) -> str:
    logger.info("Preprocessing text")
    text = text.strip().lower()
    text = re.sub(r'[^\w\s-]', '', text)
    return text

def loadData(dataDict: dict) -> Dataset:
    logger.info("Preparing data from loaded dictionary")
    samples = []
    for key, value in tqdm(dataDict.items(), desc="Processing samples"):
        try:
            invocation = value["invocation"]
            command = value["cmd"]
        except KeyError as e:
            logger.error(f"Key error: {e}. Check your JSON file structure.")
            continue
        input = "translate English to Bash: " + preprocessInvocation(invocation)
        samples.append({"input": input, "target_text": command})
    logger.info(f"Prepared {len(samples)} samples")
    return Dataset.from_list(samples)


#Tokenization Function

def tokenize(example, tokenizer, max_input_length=32, max_target_length=32):
    modelInputs = tokenizer(
        example["input"],
        max_length=max_input_length,
        truncation=True,
        padding="max_length"  # ensure uniform length
    )
    labels = tokenizer(
        text_target=example["target_text"],
        max_length=max_target_length,
        truncation=True,
        padding="max_length"
    )
    modelInputs["labels"] = labels["input_ids"]
    return modelInputs


#Evaluation Metrics

def metricComputation(pred):
    bleu = evaluate.load("bleu")
    preds = pred.predictions
    labels = pred.label_ids

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = [pred.strip().split() for pred in decoded_preds]
    decoded_labels = [[ref.strip().split()] for ref in decoded_labels]

    result = bleu.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["bleu"]}


# Main Fine-Tuning Routine

def main():
    seed = 42
    os.environ["PYTHONHASHSEED"] = str(seed)

    #Load dataset using KaggleHub.
    datasetPath = "aayush249/nl2cmd"
    filePath = "data.json"
    logger.info("Loading dataset using KaggleHub...")
    import kagglehub
    from kagglehub import KaggleDatasetAdapter
    df = kagglehub.load_dataset(KaggleDatasetAdapter.PANDAS, datasetPath, filePath)
    logger.info("Dataset loaded from KaggleHub.")

    # Transpose the DataFrame (to get 17450 rows instead of 2)
    df = df.transpose()
    logger.info(f"DataFrame shape after transpose: {df.shape}")

    dataDict = df.to_dict(orient="index")
    dataset = loadData(dataDict)

    logger.info("Splitting dataset (90/10 split)...")
    split = dataset.train_test_split(test_size=0.1, seed=seed)
    train_dataset = split["train"]
    val_dataset = split["test"]
    logger.info(f"Training samples: {len(train_dataset)}, Validation samples: {len(val_dataset)}")

    #Load tokenizer and model.
    modelName = "t5-small"
    global tokenizer
    tokenizer = T5Tokenizer.from_pretrained(modelName)
    model = T5ForConditionalGeneration.from_pretrained(modelName)
    logger.info("Loaded model and tokenizer.")

    # Tokenize datasets.
    logger.info("Tokenizing training data...")
    tokenized_train = train_dataset.map(lambda x: tokenize(x, tokenizer),
                                        batched=True,
                                        desc="Tokenizing training data")
    logger.info("Tokenizing validation data...")
    tokenized_val = val_dataset.map(lambda x: tokenize(x, tokenizer),
                                    batched=True,
                                    desc="Tokenizing validation data")

    training_args = TrainingArguments(
    output_dir="./t5_small_model",
    num_train_epochs=3,
    per_device_train_batch_size=1,         # small batch size
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,         # larger effective batch size
    learning_rate=5e-5,
    logging_steps=50,
    eval_strategy="no",                     
    save_steps=600,                         
    fp16=True,
    load_best_model_at_end=False,           
    seed=42,
    report_to=[],                          
)


    logger.info("Setting up Trainer...")
    trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,  
    tokenizer=tokenizer,
    compute_metrics=metricComputation, 
)


    torch.cuda.empty_cache()

    logger.info("Starting training...")
    trainer.train()
    logger.info("Training complete. Saving model...")

    output_dir = "./t5_small_model_final"
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    logger.info(f"Model saved to '{output_dir}'")
    eval_results = trainer.evaluate()
    print("Evaluation Results:")
    for key, value in eval_results.items():

      print(f"{key}: {value}")


if __name__ == "__main__":
    main()


Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

  df = kagglehub.load_dataset(KaggleDatasetAdapter.PANDAS, dataset_name, file_name)


Downloading from https://www.kaggle.com/api/v1/datasets/download/aayush249/nl2cmd?dataset_version_number=1&file_name=chatgpt_generated_data.json...


100%|██████████| 3.44M/3.44M [00:00<00:00, 7.14MB/s]
Processing samples: 100%|██████████| 17450/17450 [00:00<00:00, 241377.10it/s]
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Tokenizing training data:   0%|          | 0/15705 [00:00<?, ? examples/s]

Tokenizing validation data:   0%|          | 0/1745 [00:00<?, ? examples/s]

  trainer = Trainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
50,4.802
100,3.4252
150,2.8974
200,2.5317
250,2.3038
300,2.1466
350,2.034
400,1.9049
450,1.8467
500,1.7551


Step,Training Loss
50,4.802
100,3.4252
150,2.8974
200,2.5317
250,2.3038
300,2.1466
350,2.034
400,1.9049
450,1.8467
500,1.7551


Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

In [None]:
!zip -r t5_small_model_final.zip ./t5_small_model_final

  adding: t5_small_model_final/ (stored 0%)
  adding: t5_small_model_final/model.safetensors (deflated 10%)
  adding: t5_small_model_final/tokenizer_config.json (deflated 94%)
  adding: t5_small_model_final/added_tokens.json (deflated 83%)
  adding: t5_small_model_final/generation_config.json (deflated 29%)
  adding: t5_small_model_final/config.json (deflated 62%)
  adding: t5_small_model_final/special_tokens_map.json (deflated 85%)
  adding: t5_small_model_final/spiece.model (deflated 48%)


In [None]:
from google.colab import files
files.download("t5_small_model_final.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

modelPath = "./t5_small_model_final"
model = T5ForConditionalGeneration.from_pretrained(modelPath)
tokenizer = T5Tokenizer.from_pretrained(modelPath)

# Example: Using the model for inference
text = "Find all files in the current directory (maxdepth 1) that do not start with a dot (!) and have a name that contains any characters (*), and execute the command \"chmod +x\" on each of them."
inputs = tokenizer.encode(text, return_tensors="pt", truncation=True, padding="max_length", max_length=64)

outputs = model.generate(inputs, max_length=64, num_beams=5, early_stopping=True)
command = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated command:", command)


Generated command: Find. -maxdepth 1 -type f -name "*" -exec "chmod +x"  ;


In [None]:
import random
import pandas as pd
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the saved model and tokenizer
modelPath = "./t5_small_model_final"
model = T5ForConditionalGeneration.from_pretrained(modelPath)
tokenizer = T5Tokenizer.from_pretrained(modelPath)

import kagglehub
from kagglehub import KaggleDatasetAdapter

datasetPath = "aayush249/nl2cmd"              
filePath = "chatgpt_generated_data.json"      
print("Loading dataset from KaggleHub...")
df = kagglehub.load_dataset(KaggleDatasetAdapter.PANDAS, datasetPath, filePath)
print("Dataset loaded.")

df = df.transpose()
print("DataFrame shape after transpose:", df.shape)
print("Sample of original data:")
print(df.head())

# Randomly sample a few examples 
randomSamples = df.sample(n=5, random_state=42)

# Run inference on each random sample
print("\nRunning inference on random samples:\n")
for idx, row in randomSamples.iterrows():
    invocation = row["invocation"]
    input_text = "translate English to Bash: " + invocation

    # Tokenize the input
    inputs = tokenizer.encode(
        input_text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=64
    )

    # Generate output command using the model
    outputs = model.generate(
        inputs,
        max_length=64,
        num_beams=5,
        early_stopping=True
    )

    # Decode the generated tokens to text
    command = tokenizer.decode(outputs[0], skip_special_tokens=True)

    print(f"Sample index: {idx}")
    print("Invocation:", invocation)
    print("Generated Command:", command)
    print("-" * 50)


Loading dataset from KaggleHub...


  df = kagglehub.load_dataset(KaggleDatasetAdapter.PANDAS, dataset_name, file_name)


Dataset loaded.
DataFrame shape after transpose: (17450, 2)
Sample of original data:
                                          invocation  \
1  Move the first randomly sorted file in the cur...   
2  Find all files and search for the string "stri...   
3                                      Output:\nKV6X   
4  Take one random word from the /usr/share/dict/...   
5  Find all files with the extension .jpg in the ...   

                                                 cmd  
1              mv $(ls -A|sort -R|head -1) ~/$RANDOM  
2        find -name '*.*' | xargs grep -l '*string*'  
3  printf "%s" $(sed -n "$(shuf -i 1-4 -n 1)p" /d...  
4  shuf -n 1 /usr/share/dict/words | xargs -I{} e...  
5    find . -name .\*.jpg -exec mv \{\} /tmp/\{\} \;  

Running inference on random samples:

Sample index: 15154
Invocation: Find all files in the home directory (recursively) and create an archive called "archive.tar" containing all of them.
Generated Command: find /home -type f -exec chmod "archive.

In [None]:
import random
import pandas as pd
import evaluate
from transformers import T5ForConditionalGeneration, T5Tokenizer
import kagglehub
from kagglehub import KaggleDatasetAdapter


# Load Model and Tokenizer

modelPath = "./t5_small_model_final"
model = T5ForConditionalGeneration.from_pretrained(modelPath)
tokenizer = T5Tokenizer.from_pretrained(modelPath)



datasetPath = "aayush249/nl2cmd"            
filePath = "chatgpt_generated_data.json"       
print("Loading dataset from KaggleHub...")
df = kagglehub.load_dataset(KaggleDatasetAdapter.PANDAS, datasetPath, filePath)
print("Dataset loaded.")


df = df.transpose()
print("DataFrame shape after transpose:", df.shape)
print("Sample of original data:")
print(df.head())


randomSamples = df.sample(n=200, random_state=42)


#Run Inference and Collect Predictions & References

predictions = []
references = []

for idx, row in randomSamples.iterrows():
    invocation = row["invocation"]
    ground_truth = row["cmd"]
    input_text = "translate English to Bash: " + invocation
    inputs = tokenizer.encode(
        input_text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=64
    )

    outputs = model.generate(
        inputs,
        max_length=64,
        num_beams=5,
        early_stopping=True
    )
    generated_command = tokenizer.decode(outputs[0], skip_special_tokens=True)

    predictions.append(generated_command.strip())
    references.append([ground_truth.strip()])



Loading dataset from KaggleHub...


  df = kagglehub.load_dataset(KaggleDatasetAdapter.PANDAS, dataset_name, file_name)


Dataset loaded.
DataFrame shape after transpose: (17450, 2)
Sample of original data:
                                          invocation  \
1  Move the first randomly sorted file in the cur...   
2  Find all files and search for the string "stri...   
3                                      Output:\nKV6X   
4  Take one random word from the /usr/share/dict/...   
5  Find all files with the extension .jpg in the ...   

                                                 cmd  
1              mv $(ls -A|sort -R|head -1) ~/$RANDOM  
2        find -name '*.*' | xargs grep -l '*string*'  
3  printf "%s" $(sed -n "$(shuf -i 1-4 -n 1)p" /d...  
4  shuf -n 1 /usr/share/dict/words | xargs -I{} e...  
5    find . -name .\*.jpg -exec mv \{\} /tmp/\{\} \;  


In [None]:
# 5. Compute Metrics
# Load BLEU, ROUGE, and METEOR metrics using the evaluate library.
!pip install rouge_score
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")
meteor_metric = evaluate.load("meteor")


bleu_result = bleu_metric.compute(predictions=predictions, references=references)

rouge_result = rouge_metric.compute(predictions=predictions, references=references)

meteor_result = meteor_metric.compute(predictions=predictions, references=references)

print("BLEU Score:", bleu_result["bleu"])
print("\nROUGE Scores:")
for key, value in rouge_result.items():
    print(f"  {key}: {value}")
print("\nMETEOR Score:", meteor_result["meteor"])


Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=b5387b9612cd667ae737f2311142d4e97d7bc779b7cfe6efe0a54e7cd4315a85
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.02k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


BLEU Score: 0.21473919544021228

ROUGE Scores:
  rouge1: 0.5540621220631554
  rouge2: 0.37940470064421306
  rougeL: 0.539885414012069
  rougeLsum: 0.5393880882134745

METEOR Score: 0.3743518629124182


In [None]:
import transformers
print(transformers.__version__)

4.48.3
