In [3]:
# Install necessary libraries
!pip install transformers datasets peft accelerate bitsandbytes sentencepiece
!pip install gradio  # for the demo UI
!pip install kaggle




In [5]:
import os
os.environ['KAGGLE_CONFIG_DIR'] = '/content/'

# Upload kaggle.json
from google.colab import files
files.upload()

# Download your dataset (replace with actual dataset name)
!kaggle datasets download merishnasuwal/aircraft-historical-maintenance-dataset


Saving kaggle.json to kaggle (3).json
Dataset URL: https://www.kaggle.com/datasets/merishnasuwal/aircraft-historical-maintenance-dataset
License(s): DbCL-1.0
aircraft-historical-maintenance-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)


In [6]:
!unzip /content/aircraft-historical-maintenance-dataset.zip -d /content/dataset


Archive:  /content/aircraft-historical-maintenance-dataset.zip
replace /content/dataset/Aircraft_Annotation_DataFile.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace /content/dataset/Aviation_Abbreviation_Dataset.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace /content/dataset/Aviation_Morphosyntactic_Dataset.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace /content/dataset/Aviation_TermBanks_Dataset.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace /content/dataset/Aviation_grammar_Dataset.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


In [7]:
import pandas as pd

df = pd.read_csv('/content/dataset/Aircraft_Annotation_DataFile.csv')
df.head()


Unnamed: 0,IDENT,PROBLEM,ACTION
0,100001,ENGINE IDLE OVERRIDE KILLED ENGINE.,"TRIED TO ADJUST IDLE SEVERAL TIMES, WOULDN'T A..."
1,100002,ENGINE IDLE OVERRIDE KILLED ENGINE.,REMOVED & REPLACED FUEL SERVO
2,100003,ENGINE IDLE OVERRIDE KILLED ENGINE.,"A/C WAS RUN UP, SET IDLE SPEED, MIXTURE OK, NO..."
3,100004,HAD ENGINE CHOKE & BRIEFLY LOSE POWER ON DEPAR...,"PERFORMED ENGINE RUN UP, FOUND CYL 2 LOWER PLU..."
4,100005,#2 & 4 CYL ROCKER COVER GASKETS ARE LEAKING.,REMOVED & REPLACED GASKETS.


In [8]:
data = []
for _, row in df.iterrows():
    instruction = f"Problem: {row['PROBLEM']}. What should the technician do?"
    response = row['ACTION']
    data.append({"instruction": instruction, "response": response})

# Optional: save to JSONL for Hugging Face
import json
with open("tech_data.jsonl", "w") as f:
    for item in data:
        json.dump(item, f)
        f.write("\n")


In [9]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "mosaicml/mpt-7b-instruct"  # can replace with another open-source LLM
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_8bit=True,   # reduces memory usage
    device_map='auto'
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=None,  # Auto-detect
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)




In [11]:
from datasets import load_dataset

dataset = load_dataset("json", data_files="tech_data.jsonl")
dataset = dataset["train"].train_test_split(test_size=0.2)


Generating train split: 0 examples [00:00, ? examples/s]

In [12]:
from datasets import load_dataset


def preprocess(example):
    prompt = f"{example['instruction']}\nAnswer:"
    # tokenize input
    inputs = tokenizer(prompt, truncation=True, max_length=256)
    # tokenize target (response)
    labels = tokenizer(example['response'], truncation=True, max_length=256)
    # assign labels
    inputs["labels"] = labels["input_ids"]
    return inputs

tokenized_dataset = dataset.map(preprocess, remove_columns=["instruction", "response"])


Map:   0%|          | 0/4935 [00:00<?, ? examples/s]

Map:   0%|          | 0/1234 [00:00<?, ? examples/s]

In [13]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # causal LM, not masked LM
)


In [14]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./lora_tech_model",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,
    warmup_steps=10,
    max_steps=200,
    learning_rate=3e-4,
    fp16=True,
    logging_steps=10,
    eval_steps=50,
    save_steps=50,
    save_total_limit=2,
    report_to=[]  # disables wandb
)



In [15]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    tokenizer=tokenizer,
    data_collator=data_collator
)


  trainer = Trainer(


In [17]:
# Make PAD token the same as EOS
tokenizer.pad_token = tokenizer.eos_token


In [18]:
model.config.pad_token_id = tokenizer.pad_token_id


In [19]:
trainer.train()


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 0}.


Step,Training Loss
10,2.8351
20,1.3746
30,1.0531
40,0.9272
50,0.9027
60,0.8506
70,0.9792
80,0.7917
90,0.8174
100,0.7991




TrainOutput(global_step=200, training_loss=0.9550248456001281, metrics={'train_runtime': 1005.3816, 'train_samples_per_second': 1.591, 'train_steps_per_second': 0.199, 'total_flos': 1678040803565568.0, 'train_loss': 0.9550248456001281, 'epoch': 0.3242147922998987})

In [20]:
prompt = "Problem: Engine vibration at takeoff. What should the technician do?"

inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0]))


Problem: Engine vibration at takeoff. What should the technician do?
Answer: The technician should check the engine for proper oil pressure and verify that the engine is running smoothly. If the engine is running smoothly, the technician should check the engine mounts for looseness or damage. If the engine mounts are loose or damaged, the technician should tighten or replace the mounts, as needed. If the engine mounts are tight and undamaged, the technician should check the engine for other possible causes of the vibration, such as a damaged propeller, damaged engine mounts


In [21]:
import gradio as gr

def tech_helper(problem):
    prompt = f"Problem: {problem}. What should the technician do?"
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=100)
    return tokenizer.decode(outputs[0])

iface = gr.Interface(fn=tech_helper, inputs="text", outputs="text")
iface.launch()


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://7fc2e15d8a6b515965.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


