In [19]:
import os
import torch
from transformers import ( 
    AutoTokenizer, 
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    )
from pprint import PrettyPrinter
from peft import LoraConfig, PeftModel
import pandas as pd
import psutil
from datetime import timedelta
from tqdm import tqdm
from generation_vis import extract_event_info

In [None]:
from huggingface_hub import login
import os
from dotenv import load_dotenv

# Build the path to the .env file located at <current working directory>/src/.env.
current_dir = os.getcwd()  
dotenv_path = os.path.join(current_dir, 'src', '.env')  
# Load the variables defined in that file so HF_TOKEN can be read from the environment below.
load_dotenv(dotenv_path) 
# Authenticate with the Hugging Face Hub using the HF_TOKEN environment variable.
# NOTE(review): os.getenv returns None when HF_TOKEN is unset — confirm how login() should behave in that case.
login(token=os.getenv("HF_TOKEN"))

In [None]:
def memory_usage():
    """Print the memory footprint of the current process in megabytes.

    Reads the resident set size (RSS) and the virtual memory size (VMS) of the
    running Python process through ``psutil`` and prints each value once,
    formatted with two decimals.

    Returns:
        None. The values are only printed.
    """
    process = psutil.Process()
    mem_info = process.memory_info()
    bytes_per_mb = 1024 ** 2
    print(f"RSS: {mem_info.rss / bytes_per_mb:.2f} MB")  # Resident Set Size
    print(f"VMS: {mem_info.vms / bytes_per_mb:.2f} MB")  # Virtual Memory Size

# Baseline reading taken before the dataset is loaded.
memory_usage()

In [None]:
path_to_parquet = "../data/ft_data/"
# Upper bound on the number of parquet files to read; None reads every file found.
# Defaults to 1, i.e. only the first file (in sorted order) is loaded.
max_parquet_files = 1

# Sorted so the selected file(s) do not depend on the arbitrary order returned by os.listdir.
data_files = sorted(file for file in os.listdir(path_to_parquet) if file.endswith('parquet'))
if max_parquet_files is not None:
    data_files = data_files[:max_parquet_files]
if not data_files:
    # Fail with a clear message instead of an AttributeError on None further down.
    raise FileNotFoundError(f"No parquet files found in {path_to_parquet!r}")

# Read all selected files first and concatenate once, rather than growing the
# frame file by file inside a loop.
frames = [
    pd.read_parquet(os.path.join(path_to_parquet, file), engine='pyarrow')
    for file in data_files
]
pd_dataset = frames[0] if len(frames) == 1 else pd.concat(frames)
del frames  # drop the extra references to the per-file frames

memory_usage()
# Shuffle the rows with a fixed seed so the order is reproducible.
pd_dataset = pd_dataset.sample(frac=1, random_state=42)

In [6]:
# Hugging Face Hub id of the base model; passed to the model and tokenizer loaders in the cells below.
model_name = "mistralai/Mistral-7B-v0.3"
# Name for the fine-tuned model (not referenced by the other visible cells of this notebook).
new_model = "Mistral-7B-sim-qlora"
# Location of the trained adapter that PeftModel.from_pretrained loads on top of the base model.
adapter_path = "../training/adapter"

In [None]:
# 4-bit quantization settings used to load the base model through bitsandbytes.
use_4bit = True
bnb_4bit_compute_dtype = "float16"
bnb_4bit_quant_type = "nf4"
use_nested_quant = False

# Place the whole model on GPU 0.
device_map = {"":0}

# Resolve the dtype name to the actual torch dtype object.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit = use_4bit,
    bnb_4bit_quant_type = bnb_4bit_quant_type,
    bnb_4bit_compute_dtype = compute_dtype,
    bnb_4bit_use_double_quant = use_nested_quant,)

# Checking GPU compatibility with bfloat16 (CUDA compute capability major version >= 8).
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("="*80)
        print("Your GPU supports bfloat16: consider setting bnb_4bit_compute_dtype = 'bfloat16'")
        print("="*80)

# Load the base model with the quantization config defined above.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config = bnb_config,
    device_map = device_map,
)

# This notebook only generates text, so the key/value cache stays enabled;
# disabling it is a training-time setting and would make every generation step
# recompute attention over the whole sequence.
model.config.use_cache = True
model.config.pretraining_tp = 1

In [8]:
# Tokenizer belonging to the base model.
# NOTE(review): trust_remote_code=True allows code shipped in the model repository to run locally — confirm it is actually required for this model.
tokenizer = AutoTokenizer.from_pretrained(model_name,trust_remote_code = True)

# Wrap the quantized base model with the adapter stored at adapter_path.
# NOTE(review): `model` is rebound to the wrapped model, so re-running this cell wraps an already wrapped model — presumably the model-loading cell should be re-run first.
model = PeftModel.from_pretrained(model, adapter_path)

In [None]:
counter = 0
input_text = """This is a sequence of football match events.
Time: 00:00:00.000 | Event: Half Start
"""
# Rolling prompt: the header sentence followed by the most recent events.
# Empty entries are dropped so the trailing newline of the seed text does not
# leave a blank line in the middle of later prompts.
seq_list = [line for line in input_text.split('\n') if line]
match_list = []
for i in tqdm(range(100)):
    # End the prompt with a newline so the model starts a new event line
    # instead of continuing the last one.
    input_text = '\n'.join(seq_list) + '\n'
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)  # Ensure tensors are on the model's device
    output = model.generate(**inputs, max_new_tokens=100, do_sample=True ,temperature=1, pad_token_id=tokenizer.eos_token_id)

    # Decode only the tokens produced after the prompt. Indexing lines of the
    # full decoded text from the end can skip generated events or, when a
    # single line is generated, return a line that belongs to the prompt.
    prompt_length = inputs["input_ids"].shape[1]
    generated_text = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)
    generated_lines = [line.strip() for line in generated_text.split('\n') if line.strip()]
    if not generated_lines:
        # Nothing usable was generated in this step; sample again with the same prompt.
        continue

    # The first generated line is the event that directly follows the prompt.
    # It can still be cut short if max_new_tokens is reached before a newline.
    prediction = generated_lines[0]
    seq_list.append(prediction)
    # Keep the prompt bounded: index 0 is the header, so drop the oldest event.
    if len(seq_list) > 10:
        seq_list.pop(1)
    match_list.append(prediction)
    counter +=1


# Save Data
- First, save the whole sequence as a raw text file

In [None]:
# The two seed lines that opened the prompt, followed by every generated event.
start_events = [
    "This is a sequence of football match events.",
    "Time: 00:00:00.000 | Event: Half Start",
]
simulation = [*start_events, *match_list]

# Write one entry per line, each terminated by a newline.
with open('generated_simulation.txt', 'w') as f:
    f.writelines(f"{line}\n" for line in simulation)

- Then save a version that the simulation visualization script can read

In [3]:
# Parse the raw simulation file line by line into one record per event.
events_list = []
with open('generated_simulation.txt', 'r') as f:
    for event in f:
        # The header sentence is not an event, so it is skipped.
        if event.startswith("This"):
            continue

        event_info = extract_event_info(event)
        event_info['event'] = event_info['event'].strip()
        event_info['time'] = str(event_info['time'])
        # Raw line exactly as read from the file (line terminator included).
        event_info["text"] = event
        # Pair the coordinates into (x, y) tuples for the start and end locations.
        event_info["start_loc"] = (event_info["start_x"], event_info["start_y"])
        event_info["end_loc"] = (event_info["end_x"], event_info["end_y"])
        events_list.append(event_info)

# Build the table from all records at once and write it without the index column.
events_df = pd.DataFrame(events_list)
events_df.to_csv("generated_events.csv", index=False)