In [1]:
import pandas as pd
from transformers import AutoTokenizer
from pprint import pprint

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
finetuning_dataset_df = pd.read_json("../data/final/true/finetuning_dataset.jsonl", lines = True)
finetuning_dataset_dict = finetuning_dataset_df.to_dict()
pprint(finetuning_dataset_dict)

{'input': {0: '\n'
              'Application Form: If you would like to apply for the lease, '
              'please visit: \n'
              'http: //T-app.com.au/RhConcord\n'
              '\n'
              'Situated in only a block of 6 units, this double brick building '
              'presents in great condition throughout. Conveniently, it is '
              'situated in a well maintained security complex and is located '
              'within walking distance to Meadowbank train station, Rivercat & '
              'Shepherds Bay shopping village.\n'
              '\n'
              'Features include: -2 generously sized bedrooms\n'
              '-Spacious combined lounge & dining room\n'
              '-Good size bathroom\n'
              '-Spacious kitchen with cooktop and plenty of storage space\n'
              '-North facing balcony overlook greens \n'
              '-Undercover car space\n'
              '\n'
              'A perfect opportunity to secure a home today!\n

#### **Loading the loaded data into an array**

In [3]:
prompt_template = """
### Input:
{input_text}

### Output:
"""

In [4]:
num_datapoints = len(finetuning_dataset_dict["input"])
num_datapoints

100

In [5]:
finetuning_dataset = []

for i in range(num_datapoints):
    description = finetuning_dataset_dict["input"][i]
    output = finetuning_dataset_dict["output"][i]
    input_text_prompt = prompt_template.format(input_text = description)

    finetuning_dataset.append({
        "input": input_text_prompt,
        "output": output
    })

In [6]:
pprint(finetuning_dataset[0])

{'input': '\n'
          '### Input:\n'
          '\n'
          'Application Form: If you would like to apply for the lease, please '
          'visit: \n'
          'http: //T-app.com.au/RhConcord\n'
          '\n'
          'Situated in only a block of 6 units, this double brick building '
          'presents in great condition throughout. Conveniently, it is '
          'situated in a well maintained security complex and is located '
          'within walking distance to Meadowbank train station, Rivercat & '
          'Shepherds Bay shopping village.\n'
          '\n'
          'Features include: -2 generously sized bedrooms\n'
          '-Spacious combined lounge & dining room\n'
          '-Good size bathroom\n'
          '-Spacious kitchen with cooktop and plenty of storage space\n'
          '-North facing balcony overlook greens \n'
          '-Undercover car space\n'
          '\n'
          'A perfect opportunity to secure a home today!\n'
          '\n'
          '\n'
    

#### **Tokenizing the entire dataset**

In [7]:
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
def tokenize_datapoint(finetuning_dataset_dict):
    # Concating input and output
    description = finetuning_dataset_dict["input"][0]
    output = finetuning_dataset_dict["output"][0]
    text = description + output

    # A way of saying tokenize with zeros
    tokenizer.pad_token = tokenizer.eos_token

    # Padding the entire prompt
    tokenized_inputs = tokenizer(
        text = text,
        return_tensors = "np",
        padding = True
    )

    # print(tokenized_inputs["input_ids"].shape[1])
    # print(tokenized_inputs["input_ids"])

    # Determining truncation size
    max_length = min(tokenized_inputs["input_ids"].shape[1], 2048)

    #Truncating the entire prompt if more than 2048
    tokenized_inputs = tokenizer(
        text = text,
        return_tensors = "np",
        truncation = True,
        max_length = max_length
    )

    return tokenized_inputs

In [9]:
tokenize_datapoint(finetuning_dataset[0])

{'input_ids': array([[187,  60]]), 'attention_mask': array([[1, 1]])}

### Training

In [10]:
from datasets import load_dataset

In [11]:
finetuning_dataset_hf = load_dataset("json", data_files="../data/final/true/finetuning_dataset.jsonl", split="train")

tokenized_dataset = finetuning_dataset_hf.map(
    tokenize_datapoint,
    batched=True,
    batch_size=1,
    drop_last_batch=True
)

print(tokenized_dataset)

Dataset({
    features: ['input', 'output', 'input_ids', 'attention_mask'],
    num_rows: 100
})


In [12]:
tokenized_dataset = tokenized_dataset.add_column("labels", tokenized_dataset["input_ids"])

In [13]:
print(tokenized_dataset)

Dataset({
    features: ['input', 'output', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 100
})


In [14]:
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, shuffle=True, seed=123)
print(split_dataset)

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 90
    })
    test: Dataset({
        features: ['input', 'output', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 10
    })
})


In [17]:
from transformers import AutoModelForCausalLM, TrainingArguments
import logging
import torch

logger = logging.getLogger(__name__)
global_config = None

In [18]:
model_name = "EleutherAI/pythia-70m"

In [19]:
base_model = AutoModelForCausalLM.from_pretrained(model_name)

In [20]:
train_dataset = split_dataset["train"]
test_dataset = split_dataset["test"]
print(train_dataset)
print(test_dataset)

Dataset({
    features: ['input', 'output', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 90
})
Dataset({
    features: ['input', 'output', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 10
})


In [21]:
device_count = torch.cuda.device_count()
if device_count > 0:
    logger.debug("Select GPU device")
    device = torch.device("cuda")
else:
    logger.debug("Select CPU device")
    device = torch.device("cpu")

In [22]:
base_model.to(device)

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
          (a

In [23]:
def inference(text, model, tokenizer, max_input_tokens = 1000, max_output_tokens = 600):
    input_ids = tokenizer.encode(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_tokens
    )

    device = model.device

    generated_output_tokens = model.generate(
        input_ids = input_ids.to(device),
        max_length = max_output_tokens
    )

    # Decode
    generated_output_text = tokenizer.batch_decode(generated_output_tokens, skip_special_tokens=True)

    # Strip the prompt
    generated_text_answer = generated_output_text[0][len(text):]

    return generated_text_answer

In [24]:
test_text = test_dataset[0]['input']
print("Question input (test):", test_text)
print(f"Correct answer from Lamini docs: {test_dataset[0]['output']}")
print("Model's answer: ")
print(inference(test_text, base_model, tokenizer))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Question input (test): 
Situated on a peaceful street in the tranquil heart of one of Adelaide's most exclusive suburbs, this elegant executive offers a spacious and secure easy-care lifestyle cocooned in light and warmth from the north. 

The gracious frontage embraced by a garden brimming with fragrant roses presents a modern take on classical Georgian architecture, reflected in design elements that include architraves, bespoke cabinetry, paned sash windows and French doors gracing the interior that has been freshly painted and recarpeted. 

A stylish lobby introduces a refined formal living room with pretty garden backdrops and French doors opening to a north facing central courtyard.

Privately situated, the primary suite gazes over the front garden towards the distant hills and includes built-in robes and an ensuite bathroom. Two further bedrooms positioned along a rear gallery are generous in size and share a family bathroom with bath.

Forming the central heart of the home is th

#### Starting training

In [1]:
max_steps = 500

In [26]:
trained_model_name = f"rev_desc_to_tags_{max_steps}_steps"
output_dir = trained_model_name

In [27]:
training_args = TrainingArguments(

  # Learning rate
  learning_rate=1.0e-5,

  # Number of training epochs
  num_train_epochs=1,

  # Max steps to train for (each step is a batch of data)
  # Overrides num_train_epochs, if not -1
  max_steps=max_steps,

  # Batch size for training
  per_device_train_batch_size=1,

  # Directory to save model checkpoints
  output_dir=output_dir,

  # Other arguments
  overwrite_output_dir=False, # Overwrite the content of the output directory
  disable_tqdm=False, # Disable progress bars
  eval_steps=120, # Number of update steps between two evaluations
  save_steps=120, # After # steps model is saved
  warmup_steps=1, # Number of warmup steps for learning rate scheduler
  per_device_eval_batch_size=1, # Batch size for evaluation
  evaluation_strategy="steps",
  logging_strategy="steps",
  logging_steps=1,
  optim="adafactor",
  gradient_accumulation_steps = 4,
  gradient_checkpointing=False,

  # Parameters for early stopping
  load_best_model_at_end=True,
  save_total_limit=1,
  metric_for_best_model="eval_loss",
  greater_is_better=False
)

In [28]:
model_flops = (
  base_model.floating_point_ops(
    {
       "input_ids": torch.zeros(
           (1, 2048)
      )
    }
  )
  * training_args.gradient_accumulation_steps
)

print(base_model)
print("Memory footprint", base_model.get_memory_footprint() / 1e9, "GB")
print("Flops", model_flops / 1e9, "GFLOPs")

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
          (a

In [29]:
from utilities import Trainer

In [30]:
trainer = Trainer(
    model=base_model,
    model_flops=model_flops,
    total_steps=max_steps,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    args=training_args
)

In [None]:
training_output = trainer.train()

In [32]:
save_dir = f'{output_dir}/final'
trainer.save_model(save_dir)
print("Saved model to:", save_dir)

Saved model to: rev_desc_to_tags_500_steps/final


In [33]:
finetuned_slightly_model = AutoModelForCausalLM.from_pretrained(save_dir, local_files_only=True)

In [34]:
finetuned_slightly_model.to(device) 

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
          (a

In [45]:
test_question = test_dataset[0]['input']
print("Question input (test):", test_question)

print("Finetuned slightly model's answer: ")
print(inference(test_question, finetuned_slightly_model, tokenizer))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Question input (test): 
Situated on a peaceful street in the tranquil heart of one of Adelaide's most exclusive suburbs, this elegant executive offers a spacious and secure easy-care lifestyle cocooned in light and warmth from the north. 

The gracious frontage embraced by a garden brimming with fragrant roses presents a modern take on classical Georgian architecture, reflected in design elements that include architraves, bespoke cabinetry, paned sash windows and French doors gracing the interior that has been freshly painted and recarpeted. 

A stylish lobby introduces a refined formal living room with pretty garden backdrops and French doors opening to a north facing central courtyard.

Privately situated, the primary suite gazes over the front garden towards the distant hills and includes built-in robes and an ensuite bathroom. Two further bedrooms positioned along a rear gallery are generous in size and share a family bathroom with bath.

Forming the central heart of the home is th

In [49]:
test_prompt = """
16/110 Reserve Road for Auction
With entry via quiet Milner Road and tucked serenely at the rear of the block, this immaculately presented and beautifully updated townhouse presents a desirable ready-made way of life. The quality double brick property is exceptionally well built and designed, framed in courtyards on both sides with stylish engineered flooring, a modern kitchen, powder room and direct internal access to its over-height and generous double lock up garage. Three bright bedrooms provide options for the family or a home office and include the master suite with a balcony and near new ensuite. The tightly held property enjoys a delightful garden outlook and benefits from ample visitor parking. This much loved address sits across from Thomson Park, steps to Artarmon Public School, the bus, village and station.

- Back of block position away from Reserve Road
- Private entry through a front courtyard, generous balconies
- Floorboards, bright and inviting interiors
- Open plan living and dining, large modern kitchen, dishwasher
- Rear entertainer's terraces and easy care garden
- Stylish powder room with a renovated laundry
- Three upper level bedrooms, contemporary baths
- Master suite enjoys a renovated ensuite and balcony
- Internal access to the over-height double lock up garage, storage
- Gas connection for heating/cooking, central common gardens, ample visitor parking
- Approx. 8 minute walk to Artarmon Village shops and train station
- Approx. 4 minute walk to Artarmon Public
- Moments to Chatswood CBD, shopping and dining district
- Within Artarmon Public and Chatswood High school catchment zone
- Unit size 164 sqm + 37 sqm double lock up garage
- Strata rate approx. $1,470 pq

For more information, please visit www.rwayrealtychatswood.com.au
"""

print("Question input (test):", test_prompt)

print("Finetuned slightly model's answer: ")
print(inference(test_prompt, finetuned_slightly_model, tokenizer))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Question input (test): 
16/110 Reserve Road for Auction
With entry via quiet Milner Road and tucked serenely at the rear of the block, this immaculately presented and beautifully updated townhouse presents a desirable ready-made way of life. The quality double brick property is exceptionally well built and designed, framed in courtyards on both sides with stylish engineered flooring, a modern kitchen, powder room and direct internal access to its over-height and generous double lock up garage. Three bright bedrooms provide options for the family or a home office and include the master suite with a balcony and near new ensuite. The tightly held property enjoys a delightful garden outlook and benefits from ample visitor parking. This much loved address sits across from Thomson Park, steps to Artarmon Public School, the bus, village and station.

- Back of block position away from Reserve Road
- Private entry through a front courtyard, generous balconies
- Floorboards, bright and inviting