In [None]:
import json
from tqdm import tqdm
# standard library imports
import random
import json
import os
# third party imports
from unsloth import FastLanguageModel, is_bfloat16_supported
from trl import SFTTrainer
from transformers import TrainingArguments
from huggingface_hub import login
from datasets import Dataset

import pandas as pd
import numpy as np
import torch


In [None]:
# Loader settings for the base model. `max_seq_length` is reused by the SFTTrainer cell further down.
max_seq_length = 1024
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
#load_in_4bit = False # Use 4bit quantization to reduce memory usage. Can be False.
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.



# Load the base checkpoint and its tokenizer; both names are consumed by later cells
# (`model` by the LoRA wrapping cell, `tokenizer` by the prompt-formatting and trainer cells).
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit", # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

In [None]:
# Attach LoRA adapters to the projection layers named in `target_modules`.
# `model` is rebound to the adapter-wrapped model returned by the call, so re-running this
# cell without reloading the base model wraps an already-wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    use_gradient_checkpointing = "unsloth", # 4x longer contexts auto supported!
    random_state = 7723,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

In [None]:
## Load the data

# Outputs of the larger model; they are turned into fine-tuning examples in a later cell.
path = os.path.expanduser('~/OneDrive/LLM//Llama3-70B-full-output.json')

# JSON is UTF-8 by specification, so do not depend on the platform's locale encoding.
with open(path, 'r', encoding='utf-8') as f:
    llama_results = json.load(f)
# llama_results=llama_results[5:10]
print(len(llama_results))
# The example-building cell accepts a single dict as well as a list of records;
# slicing a dict raises, so only slice when the payload is a list.
print(llama_results[5:10] if isinstance(llama_results, list) else llama_results)

In [None]:
# get the instructions prompt
# NOTE(review): the path is absolute and user-specific; `expanduser` only rewrites a leading `~`,
# so it leaves this path unchanged.
path = os.path.expanduser('/home/yw30f/OneDrive/LLM/code/UBDC_proximity/UBDC_proximity/prompts/instruction_template.txt')

# Read the whole template into `instruction`; the next cell fills in its `<location>` placeholder.
with open(path, 'r') as f:
    instruction= f.read()
print(instruction)

In [None]:
# Substitute the `<location>` placeholder with a fixed location for every training example,
# then display the resulting instruction text.
instruction = instruction.replace('<location>', 'Edinburgh, UK')
instruction

In [None]:
# clean the data to remove any misconstructed entries etc.

In [None]:
def _as_list(value):
    """Coerce one field of a `nearby` record to a list.

    Lists (and tuples) are returned as lists, a non-empty string becomes a
    one-element list, and anything else (None, numbers, dicts, "") becomes [].
    """
    if isinstance(value, (list, tuple)):
        return list(value)
    if isinstance(value, str) and value:
        return [value]
    return []


data = []

# Ensure llama_results is iterable
if isinstance(llama_results, dict):
    llama_results = [llama_results]

for result in tqdm(llama_results):
    out = {
        "instruction": instruction,
        "input": result.get("description", "")
    }

    nearby = result.get("nearby", {})

    # If nearby is a string, try to parse it as JSON
    if isinstance(nearby, str):
        try:
            nearby = json.loads(nearby)
        except json.JSONDecodeError:
            # If parsing fails, treat as empty
            nearby = {}

    # 1. Handle missing or empty nearby entries.
    # Truthiness is used instead of len() so that None values count as empty rather than raising.
    # NOTE(review): this empty response has no "parent_references" key while the populated
    # response below does; confirm which schema the instruction template expects.
    if not isinstance(nearby, dict) or not any(nearby.values()):
        out["response"] = json.dumps([
            {"specific_locations": [], "general_references": []}
        ])
        data.append(out)
        continue

    # 2. Extract lists safely (missing or non-list values become empty lists)
    specific = _as_list(nearby.get("specific_locations"))
    general = _as_list(nearby.get("general_locations"))
    parent = _as_list(nearby.get("parent_locations"))

    # Combine general + parent -> general_references.
    # dict.fromkeys de-duplicates while keeping first-seen order; a set would make the order
    # of the training target depend on string hashing and therefore differ between runs.
    general_refs = list(dict.fromkeys(general + parent))

    # 3. Create structured response
    structured_response = [
        {
            "specific_locations": specific,
            "general_references": general_refs,
            "parent_references": parent
        }
    ]

    # 4. Serialize to JSON string
    out["response"] = json.dumps(structured_response)
    data.append(out)

# Example check
print(data[0:5])
print(len(data))

In [None]:

# Set up the prompt template for model training

# The three `{}` slots are filled positionally with (instruction, input, response) by
# `formatting_prompts_func` below.
# NOTE(review): the template text contains the typo "specfic". It is part of the text the model
# is trained on, so changing it changes the training prompt; decide deliberately before fixing.
ft_prompt = """Below is an instruction that describes a task, paired with an input that provides a specfic example which the task should be applied to. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}
"""

# End-of-sequence token appended to every training text by `formatting_prompts_func`.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    """Render a batch of examples into full training texts.

    args:
        examples : mapping with parallel sequences under the keys
            "instruction", "input" and "response".
    returns:
        dict : {"text": [...]} with one rendered prompt per example, each
            ending in EOS_TOKEN.
    """
    rows = zip(examples["instruction"], examples["input"], examples["response"])
    # The EOS token marks where the answer ends; without it generation never stops.
    rendered = [ft_prompt.format(*row) + EOS_TOKEN for row in rows]
    return {"text": rendered}

# Wrap the examples in a dict and print the first one as a sanity check.
# NOTE(review): `ft_data` is not read again in the cells visible here.
ft_data = {"items":data}
print(ft_data['items'][0])

In [None]:
# Set up the test/train split
random.seed(7723)

N_TRAIN = 6500 #2200
# random.sample raises ValueError when asked for more items than exist, so cap the request.
n_train = min(N_TRAIN, len(data))
if n_train < N_TRAIN:
    print(f"Only {len(data)} examples available; using all of them for training and none for validation.")

trn_idxs = random.sample(range(len(data)), n_train)
# Membership tests against a set are O(1); testing against the list made this step quadratic.
trn_idx_set = set(trn_idxs)
val_idxs = [x for x in range(len(data)) if x not in trn_idx_set]
trn_data = [data[i] for i in trn_idxs]
val_data = [data[i] for i in val_idxs]

trn_dataset = Dataset.from_pandas(pd.DataFrame(trn_data), split="train")
print(trn_dataset.shape)

In [None]:
# format data for input into the model
# Adds a "text" column (the rendered prompt plus EOS token) via `formatting_prompts_func`;
# `trn_dataset` is rebound to the mapped dataset.
trn_dataset = trn_dataset.map(formatting_prompts_func, batched = True,)

In [None]:
## Training

In [None]:

# Pick the mixed-precision mode once: bf16 where the hardware supports it, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

# Optimiser, schedule and logging settings for the fine-tuning run.
training_args = TrainingArguments(
    output_dir = "outputs",
    seed = 3407,
    num_train_epochs = 3,
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    learning_rate = 2e-4,
    lr_scheduler_type = "linear",
    warmup_steps = 5,
    weight_decay = 0.01,
    optim = "adamw_8bit",
    fp16 = not use_bf16,
    bf16 = use_bf16,
    logging_steps = 1,
)

# Supervised fine-tuning on the "text" column produced by the formatting cell.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = trn_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = training_args,
)

In [None]:
def clear_gpu_memory():
    """Run Python garbage collection and release cached CUDA memory.

    The CUDA calls are skipped when no CUDA device is available, so the cell
    can also be run on a CPU-only machine. Only memory that is no longer
    referenced from Python can be released; objects still bound to notebook
    variables keep their memory.

    returns:
        None
    """
    import gc
    import torch

    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
        print("GPU memory cleared.")
    else:
        print("CUDA not available; skipped GPU cache clearing.")

clear_gpu_memory()

In [None]:
#@title Show current memory stats
# Report the device name, its total memory and the peak memory reserved so far, in GiB.
BYTES_PER_GIB = 1024 ** 3

gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / BYTES_PER_GIB, 3)
# Baseline taken before training; the final-stats cell subtracts it from the later peak.
start_gpu_memory = round(torch.cuda.max_memory_reserved() / BYTES_PER_GIB, 3)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
# Show which device the model reports before training starts.
print(model.device)


In [None]:
def recursively_fix_device(module, device):
    """Give `module` and every descendant a `device` attribute if it lacks one.

    A module is updated only when it has no `device` attribute or the
    attribute is None; existing non-None values are left untouched.

    args:
        module : object exposing `children()` (e.g. a torch module).
        device : value to assign as the missing `device`.
    """
    # Explicit stack instead of recursion; children are pushed in reverse so the
    # visit order stays parent-first, left-to-right.
    pending = [module]
    while pending:
        current = pending.pop()
        if getattr(current, "device", None) is None:
            current.device = device
        pending.extend(reversed(list(current.children())))

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
recursively_fix_device(model, device)

# Sanity check: list any sub-module that still has no usable `device` attribute.
bad = [
    name
    for name, submodule in model.named_modules()
    if getattr(submodule, "device", None) is None
]
print("Modules without device:", bad)

In [None]:
# train the model; the returned stats are read by the final memory/time report cell
trainer_stats = trainer.train()

In [None]:
#@title Show final memory and time stats
# Peak reserved memory after training, in GiB, and the share attributable to training
# (peak minus the baseline recorded before training).
used_memory = round(torch.cuda.max_memory_reserved() / (1024 ** 3), 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)

# Both figures as a percentage of the device's total memory.
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

In [None]:
# Read the Hugging Face access token from a local file.
with open('/home/yw30f/OneDrive/LLM/code/UBDC_proximity/UBDC_proximity/huggingface_key.txt', 'r') as f:
    # Strip surrounding whitespace: a trailing newline in the file is not part of the token.
    hf_key = f.read().strip()
# The token is deliberately NOT printed: cell outputs are saved with the notebook and
# would leak the secret to anyone who can read the file.
# log in to huggingface so the model can be saved
login(hf_key)

In [None]:
# save to huggingface
# NOTE(review): only `model` is pushed here; the tokenizer is not uploaded by this cell.
model.push_to_hub("ywang-gla/ProxiLlama-3.1-8b_trn6500")

In [None]:
# standard library imports
from argparse import ArgumentParser
import json
import sys
import logging
logger = logging.getLogger(__name__)

# third party imports
import pandas as pd
from huggingface_hub import login
from unsloth import FastLanguageModel
from tqdm import tqdm

sys.path.append("/home/yw30f/OneDrive/LLM/code/UBDC_proximity/UBDC_proximity/scripts")

from json_utils import validate_json, get_schema #extract_json, 


class LLM:
    """Inference wrapper that prompts a fine-tuned model and returns validated JSON.

    The prompt template must contain three positional `{}` slots (instruction,
    input text, response) and the marker `### Response:`; everything the model
    emits after the first marker is treated as its answer.
    """

    # Text that separates the prompt from the model's answer in the decoded output.
    RESPONSE_MARKER = '### Response:'
    # Extra generation budget granted on each retry, in case the JSON was cut off.
    RETRY_TOKEN_INCREMENT = 256

    # Same logger object as a module-level `logging.getLogger(__name__)`.
    _log = logging.getLogger(__name__)

    def __init__(self,
                 model_name:str,
                 prompt_path:str,
                 instruct_path:str,
                 json_fix_path:str,
                 hf_key_path:str):
        """Logs in to HuggingFace, loads the model and reads the prompt files.

        args:
            model_name (str) : path to the model on HuggingFace.
            prompt_path (str) : path to the prompt template text file.
            instruct_path (str) : path to the instruction text file.
            json_fix_path (str) : path to the JSON-repair instruction text file.
            hf_key_path (str) : path to a text file holding the HuggingFace token.
        raises:
            KeyError : if the HuggingFace login fails.
        """
        # Authenticate before downloading so that private repositories can be loaded
        # and an invalid key fails fast instead of after a long model download.
        # The token is stripped because a trailing newline is not part of it.
        self.hf_key = self.load_text(hf_key_path).strip()
        try:
            login(self.hf_key)
        except Exception as err:
            raise KeyError('Please ensure HuggingFace key is valid') from err

        self.model, self.tokenizer = self.load_model(model_name)

        self.template = self.load_text(prompt_path)
        self.instruction = self.load_text(instruct_path)
        self.json_fix_instruction = self.load_text(json_fix_path)

    def load_model(self, model_name:str):
        """Loads the specified model from Huggingface. Please use a model
        hosted on the Unsloth page.

        args:
            model_name (str) : path to model on huggingface e.g.
            "unsloth/Llama-3.2-3B-Instruct"
        returns:
            FastLanguageModel : unsloth hosted model.
            Tokenizer : corresponding tokenizer for model.
        """
        model, tokenizer = FastLanguageModel.from_pretrained(
            model_name = model_name,
            max_seq_length = 512,
            dtype = None,
            load_in_4bit = True)
        # NOTE(review): assumes for_inference() switches the model in place and that some
        # releases may return None - confirm against the installed unsloth version.
        # Keeping the original object in that case avoids storing None as the model.
        inference_model = FastLanguageModel.for_inference(model)
        if inference_model is not None:
            model = inference_model
        return model, tokenizer

    def load_text(self, path:str):
        """Loads data from a txt file

        args:
            path (str) : path to the text data.
        returns:
            str : the text.
        """
        # Explicit encoding so the prompts read the same regardless of the platform locale.
        with open(path, 'r', encoding='utf-8') as f:
            text = f.read()
        return text

    def _generate(self, prompt:str, max_new_tokens:int)->str:
        """Runs one generation for `prompt` and returns the decoded output,
        which includes the prompt itself."""
        inputs = self.tokenizer([prompt], return_tensors = "pt").to("cuda")
        response = self.model.generate(**inputs, max_new_tokens = max_new_tokens)
        return self.tokenizer.decode(response[0])

    def _response_part(self, output:str):
        """Returns the text between the first and second response marker (or
        the end of the output), or None when the marker is absent."""
        parts = output.split(self.RESPONSE_MARKER)
        return parts[1] if len(parts) > 1 else None

    def get_model_response(self, text:str, location:str, max_tokens=512, max_retries=2, curr_retries=0):
        """Passes the text to the LLM and returns a JSON
        args:
            text (str) : text to be processed.
            location (str) : location relevant to text (e.g. London)
            max_tokens (int) : max output size for model; grows by
                RETRY_TOKEN_INCREMENT on every retry.
            max_retries (int) : max number of times to retry if json is broken
            curr_retries (int) : number of retries already performed.
        returns
            Json formatted list[dict[str, str]]. If every attempt fails, the
            result of `llm_json_fix` on the last output is returned instead.
        """
        prompt = self.template.format(self.instruction, text, "")
        prompt = prompt.replace('<location>', location)

        while True:
            output = self._generate(prompt, max_tokens)
            # process output and check results
            processed = self.process_output(output)

            # An empty list or a list of dicts is a usable result. Every generation is
            # validated before the retry budget is consulted, so no output is discarded unseen.
            if len(processed) == 0 or not isinstance(processed[0], str):
                return processed

            # ['invalid json'] or ['misconstructed json']
            self._log.info(processed[0])

            # Retry budget used up: ask the model to repair its last output.
            if curr_retries >= max_retries:
                self._log.info('Max Retries Exceeded! Attempting LLM fix.')
                processed = self.llm_json_fix(output)
                self._log.info(processed)
                return processed

            curr_retries += 1
            max_tokens += self.RETRY_TOKEN_INCREMENT
            self._log.info(f'Current Retries = {curr_retries}')
            self._log.info(f'Max Tokens = {max_tokens}')

    def process_output(self, output:str)->list[dict[str,str]]:
        """Takes a string output from the LLM, extracts the relevant JSON and
        processes it with reference to the schema defined in 'json_utils'

        args:
            output (str) : a string containing a (possibly misconstructed) json.

        returns:
            list[dict[str,str]] : A valid json in accordance with the schema.

        notes:
            Returns ['misconstructed json'] or ['invalid json'] if the json
            could not be constructed or validated, respectively. A missing
            response marker also yields ['misconstructed json'].
        """
        # get the response part of the output
        response = self._response_part(output)
        if response is None:
            return ['misconstructed json']

        # `extract_json` is not imported at file level, so calling it raised NameError, which the
        # former bare `except` swallowed and reported as 'misconstructed json' for every output.
        # NOTE(review): assumes json_utils still exports extract_json; if it does not, this
        # raises ImportError instead of silently discarding every result.
        from json_utils import extract_json

        # load schema
        schema = get_schema()
        # extract and validate
        try:
            json_out = extract_json(response)
        except Exception:
            return ['misconstructed json']
        try:
            valid_json_out = validate_json(json_out, schema)
        except Exception:
            return ['invalid json']
        return valid_json_out

    def llm_json_fix(self, output):
        """As a last resort, we can ask the LLM to fix a JSON it has produced.

        args:
            output (str) : decoded model output containing the broken json.
        returns:
            Same as `process_output`, applied to the repair attempt.
        """
        json_str = self._response_part(output)
        if json_str is None:
            # No marker to split on: hand the whole output to the repair prompt.
            json_str = output

        prompt = self.template.format(self.json_fix_instruction, json_str, '')
        output = self._generate(prompt, 1024)
        # process output and check results
        processed = self.process_output(output)
        return processed
        


In [None]:
def clear_gpu_memory():
    """Run Python garbage collection and release cached CUDA memory.

    The CUDA calls are skipped when no CUDA device is available, so the cell
    can also be run on a CPU-only machine. Only memory that is no longer
    referenced from Python can be released; objects still bound to notebook
    variables keep their memory.

    returns:
        None
    """
    import gc
    import torch

    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
        print("GPU memory cleared.")
    else:
        print("CUDA not available; skipped GPU cache clearing.")

clear_gpu_memory()


In [None]:
# Build the inference wrapper around the fine-tuned model pushed to the Hub above.
# NOTE(review): this rebinds `model`, which until now referred to the training model; the
# inference cells below call `model.get_model_response`, so the name cannot change in isolation.
# NOTE(review): all paths are absolute and user-specific.
model = LLM(model_name='ywang-gla/ProxiLlama-3.1-8b_trn6500',
            prompt_path='/home/yw30f/OneDrive/LLM/code/UBDC_proximity/UBDC_proximity/prompts/prompt_template.txt',
            instruct_path='/home/yw30f/OneDrive/LLM/code/UBDC_proximity/UBDC_proximity/prompts/instruction_template.txt',
            json_fix_path='/home/yw30f/OneDrive/LLM/code/UBDC_proximity/UBDC_proximity/prompts/json_fix_instruction.txt',
            hf_key_path='/home/yw30f/OneDrive/LLM/code/UBDC_proximity/UBDC_proximity/huggingface_key.txt')
            


In [None]:
from tqdm import tqdm

# Run the fine-tuned model on the first ten held-out examples and collect its answers.
outputs = []
for example in tqdm(val_data[0:10]):
    response = model.get_model_response(
        text=example['input'],
        location='Edinburgh, UK',
        max_tokens=512,
        max_retries=2,
    )
    outputs.append(response)

In [None]:
# Save (dump) to a JSON file
# `ensure_ascii=False` keeps non-ASCII characters readable in the file instead of \u escapes.
with open(r'/home/yw30f/OneDrive/LLM/code/UBDC_proximity/UBDC_proximity/results/test_validation_output_from8b.json', 'w', encoding='utf-8') as f:
    json.dump(outputs, f, indent=2, ensure_ascii=False)

In [None]:
# Display the input text of one held-out example (index `i`).
i=1
val_data[i]['input']

In [None]:
# Model output at index 1, the same index that `i` was set to in the previous cell.
outputs[1]

In [None]:
# try on test set
# The path is relative, so it is resolved against the kernel's working directory.
# The next cell reads the 'description' and 'location' columns of this frame.
test_df = pd.read_csv('large_test_set.csv')

In [None]:
# Run the model over every row of the test set, using each row's own location.
outputs = []
# Size the progress bar from the frame itself; a hard-coded total shows wrong
# progress and ETA whenever the test set does not have exactly that many rows.
for i, row in tqdm(test_df.iterrows(), total=len(test_df)):
  text = row['description']
  loc = row['location']
  tst_outputs = model.get_model_response(text=text,
                                         location=loc,
                                         max_tokens=512,
                                         max_retries=2)
  outputs.append(tst_outputs)

In [None]:
# Pick a test row to inspect; the next cell shows the model output for the same index.
i = 0


print(test_df.iloc[i].description)

In [None]:
# Model output for the test row selected by `i` in the previous cell.
outputs[i]

In [None]:
import json
import re
import pandas as pd
from ast import literal_eval
def _split_list_items(raw_items):
    """Turn the text found between a list's square brackets into a list of strings.

    The text is first parsed as a JSON array so that commas inside quoted names
    ("Royal Mile, Edinburgh") do not split an item. When that fails (single
    quotes, unquoted items, ...) it falls back to splitting on commas and
    stripping quotes and spaces.
    """
    try:
        parsed = json.loads(f"[{raw_items}]")
    except ValueError:
        return [i.strip().strip('"\' ') for i in raw_items.split(",") if i.strip()]
    return [str(item).strip() for item in parsed if item is not None and str(item).strip()]


def _extract_list(block, field):
    """Return the items of the first `"field": [...]` list in `block`, or [] if absent."""
    found = re.search(rf'"{re.escape(field)}"\s*:\s*\[([^\]]*)\]', block)
    if not found:
        return []
    return _split_list_items(found.group(1))


def extract_locations_from_json(json_path, output_csv="extracted_locations.csv"):
    """
    Extract specific_locations, general_locations, and parent_locations from
    the 'nearby' field of model-generated JSON objects.

    Handles multiple field naming variants and delimiters.

    args:
        json_path (str) : path to a JSON file holding a list of objects with
            the keys "key", "description" and "nearby" (raw model output text).
        output_csv (str) : path the extracted rows are written to as CSV.
    returns:
        pandas.DataFrame : one row per input entry with the columns key,
            description, specific_locations, general_locations and
            parent_locations (the last three are lists of strings).
    """

    # Load JSON
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Field-name variants mapped to the names used in the output
    replacements = {
        "specific_references": "specific_locations",
        "general_references": "general_locations",
        "parent_references": "parent_locations"
    }

    rows = []
    for entry in data:
        key = entry.get("key")
        desc = entry.get("description", "")
        nearby = entry.get("nearby", "")

        # The regex below needs text; a missing value or an already-parsed object would raise.
        if not isinstance(nearby, str):
            nearby = "" if nearby is None else json.dumps(nearby)

        # --- Step 1. Find the block after 'Response:' and before 'Reasoning' or '<|eot_id|>'
        match = re.search(
            r'(?:Response:|### Response:|Here is the output in JSON format:)(.*?)(?:Reasoning|<\|eot_id\|>|$)',
            nearby,
            flags=re.S | re.I
        )

        if not match:
            rows.append({
                "key": key,
                "description": desc,
                "specific_locations": [],
                "general_locations": [],
                "parent_locations": []
            })
            continue

        block = match.group(1)

        # --- Step 2. Clean up the block to resemble JSON
        block = block.replace("<br />", "").replace("\n", " ").replace("->", ":")
        # Triple quotes and typographic (curly) double quotes become plain double quotes.
        block = re.sub(r'"""|\u201c|\u201d', '"', block)

        # Normalize field names
        for old, new in replacements.items():
            block = block.replace(old, new)

        # --- Step 3. Extract lists using regex (works even if the block isn't valid JSON)
        specific = _extract_list(block, "specific_locations")
        general = _extract_list(block, "general_locations")
        parent = _extract_list(block, "parent_locations")

        # --- Step 4. Save results
        rows.append({
            "key": key,
            "description": desc,
            "specific_locations": specific,
            "general_locations": general,
            "parent_locations": parent
        })

    # --- Step 5. Export to CSV
    df = pd.DataFrame(rows)
    df.to_csv(output_csv, index=False)
    print(f"✅ Extracted {len(df)} entries to {output_csv}")
    return df

# Convert one saved model-output file to CSV; the returned frame is displayed as the cell output.
# NOTE(review): the input file name says "homeNoTraining" but the output says "roomNoTraining";
# confirm the two paths are meant to differ.
extract_locations_from_json(
    "/home/yw30f/OneDrive/LLM/code/UBDC_proximity/UBDC_proximity/results/llama8b_description_homeNoTraining_output_atmp1.json",
    "/home/yw30f/OneDrive/LLM/code/UBDC_proximity/UBDC_proximity/results/llama8b_description_roomNoTraining_output_atmp1.csv")