In [1]:
# !pip install pandas
# !pip install numpy
# !pip install transformers
# !pip install datasets
# !pip install vllm
# !pip install huggingface_hub
# !pip3 install torch torchvision torchaudio
# !pip install bitsandbytes
# !pip install trl
# !pip install autoawq
# !pip install peft
# !pip install wandb
# !pip install scikit-learn

In [2]:
import gc
import json
import pandas as pd
import vllm
import torch
import bitsandbytes as bnb
from trl import SFTTrainer
from awq import AutoAWQForCausalLM
from peft import LoraConfig, PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, BitsAndBytesConfig
from datasets import Dataset, DatasetDict
from huggingface_hub import login
from sklearn.model_selection import train_test_split
from time import time

import wandb
import os
os.environ["WANDB_PROJECT"] = "ai-xploiter"  # name your W&B project

  from .autonotebook import tqdm as notebook_tqdm
2024-05-26 05:03:59,177	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [3]:

login("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")
base_model = 'mistralai/Mistral-7B-Instruct-v0.3'

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/bapt/.cache/huggingface/token
Login successful


In [4]:
df = pd.read_csv("./dataset_without_empty_measure.csv")

In [5]:
def preprocessing(data:pd.DataFrame, nb_mesures:int=2):
    tablatures = pd.DataFrame(columns = ["Key","Style","Tablatures1","Tablatures2"])
    for i in range(len(data)):
        tab = data.iloc[i]["Tablatures"]
        tab = tab.split("\n")
        tab = [x.split("|") for x in tab]
        
        new_tab1 = []
        new_tab2 = []

        for k in range((len(tab[0])-2)//nb_mesures):
            new_tablatures_1 = ""
            new_tablatures_2 = ""
            for j in range(len(tab)):
                line = tab[j][0]
                new_small_tab = []
                new_small_tab.append(tab[j][1+k*nb_mesures:1+(k+1)*nb_mesures])
                new_tab_text = (["|".join(x) for x in new_small_tab])
                new_tablatures_1 += line+'|'+new_tab_text[0]+'|\n'
                new_tablatures_2 += line+'|'+new_tab_text[0]+'|\n'
            new_tab1.append("[startt]"+new_tablatures_1+"[endt]")
            new_tab2.append("[startt]"+new_tablatures_2+"[endt]")

        for p in range(len(new_tab1) - 1):
            if i < len(data):  # Ensure index i is within the bounds of data
                row = {
                    "Key": data.iloc[i]["Key"],
                    "Style": data.iloc[i]["Style"],
                    "Original_Tablatures1": new_tab2[p],
                    "Tablatures1": new_tab1[p],
                    "Tablatures2": new_tab2[p + 1]
                }
                tablatures = pd.concat([tablatures, pd.DataFrame([row])], ignore_index=True)
            else:
                print(f"Index {p} is out of bounds for the data DataFrame.")
        
    return tablatures
df = preprocessing(df, nb_mesures=4)

In [6]:
def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit #if args.bits == 4 else (bnb.nn.Linear8bitLt if args.bits == 8 else torch.nn.Linear)
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
        if 'lm_head' in lora_module_names: # needed for 16-bit
            lora_module_names.remove('lm_head')
    return list(lora_module_names)

In [7]:
from transformers import PreTrainedTokenizer

# Get tokenizer and configure padding
tokenizer = AutoTokenizer.from_pretrained(base_model)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Add special token for continuation
# new_special_tokens = ["<CONTINUE>"]
# tokenizer.add_special_tokens({'additional_special_tokens': new_special_tokens})



def generate_prompt(row, tokenizer:PreTrainedTokenizer, training: bool = False) -> str:
    messages = [
        {
            "role": "user",
            "content": f"""Give me the guitar four measures following this one in the key {row['Key']} and in the style {row['Style']}. Stop generating after four measures. Here is my guitar measures :
            
{row.get('Tablatures1', '')}"""
        }
    ]

    if training:
        messages += [
            {"role": "assistant", "content": row["Tablatures2"]}
        ]
    chat_template = """{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true and not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\\n\\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don\\'t know the answer to a question, please don\\'t share false information.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + content.strip() }}{% elif message['role'] == 'system' %}{{ '<<SYS>>\\n' + content.strip() + '\\n<</SYS>>\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}"""
    return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=not training)#, chat_template=chat_template)

# Print an example
print("##")
print(generate_prompt(df.sample(n=1).to_dict(orient="records")[0], tokenizer=tokenizer, training=False))
print("##")
# Prepare prompts for SFT
df["prompt"] = df.apply(generate_prompt, tokenizer=tokenizer, training=True, axis=1)

train_df, temp_df = train_test_split(df, test_size=0.5, random_state=42)

# Then, split the remaining data into validation and test sets
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Create the DatasetDict
ds = DatasetDict({
    "train": Dataset.from_pandas(train_df),
    "validation": Dataset.from_pandas(val_df),
    "test": Dataset.from_pandas(test_df)
})

##
<s>[INST] Give me the guitar four measures following this one in the key G and in the style Bossa Nova. Stop generating after four measures. Here is my guitar measures :
            
[startt]e|--------------|--------------|--------------|--------------|
B|--7-----------|--------------|--------------|--------------|
G|--------------|--7-----9-----|-----5--------|-----4--------|
D|--------------|--------------|--------------|--------------|
A|--------------|--------------|--------------|--------------|
E|--------------|--------------|--------------|--------------|
[endt] [/INST]
##


In [8]:
# Load base model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    torch_dtype=torch.float16,
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
    ),
)
model.config.use_cache = False
model.resize_token_embeddings(len(tokenizer))


`low_cpu_mem_usage` was None, now set to True since model is quantized.
Loading checkpoint shards: 100%|██████████| 3/3 [00:04<00:00,  1.47s/it]


Embedding(32768, 4096)

In [9]:


# LoRA configuration
peft_config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=find_all_linear_names(model),
)

# Define generic training configuration
training_args = TrainingArguments(
    per_device_train_batch_size=64,
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": True},
    num_train_epochs=10,
    learning_rate=4e-5,
    lr_scheduler_type="cosine",
    optim="paged_adamw_32bit",
    warmup_steps=20,
    max_steps=200,
    output_dir=f"./wandb/test-{int(time())}",
    run_name = f"r64-length1024-batch128-{base_model}",
    logging_steps=1,
    evaluation_strategy="steps",
    eval_steps=10,
    per_device_eval_batch_size=16,
    eval_accumulation_steps=2,

    report_to="wandb",
    include_tokens_per_second=True,
    include_num_input_tokens_seen=True,

    fp16=True,
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    peft_config=peft_config,

    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
    dataset_text_field="prompt",

    max_seq_length=1024,

    # packing=True,
    args=training_args,
)


adapter_path = "./app/data/adapter"

with torch.backends.cuda.sdp_kernel(enable_flash=False):
    trainer.train()
trainer.save_model(adapter_path)

del model, peft_config, training_args, trainer
gc.collect()
torch.cuda.empty_cache()





Map: 100%|██████████| 1990/1990 [00:00<00:00, 10433.53 examples/s]
Map: 100%|██████████| 995/995 [00:00<00:00, 10677.61 examples/s]
max_steps is given, it will override any value given in num_train_epochs
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m[1mwandb[0m: Currently logged in as: [33myulin-shi[0m ([33mhackathon-ai-xploiter[0m). Use [1m`wandb login --relogin`[0m to force relogin
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set t

Step,Training Loss,Validation Loss,Input Tokens Seen
10,1.6225,1.528547,228160
20,0.8647,0.811556,463296
30,0.5714,0.582265,699392
40,0.5481,0.55708,916470
50,0.5266,0.537168,1146230
60,0.5215,0.518503,1379766
70,0.5156,0.499381,1586088
80,0.4748,0.479498,1817512


In [None]:
# Quantize and save model
merge_path = "./app/data/merge"

model = AutoModelForCausalLM.from_pretrained(base_model)
model.resize_token_embeddings(len(tokenizer))
model = PeftModel.from_pretrained(model, adapter_path)
model = model.merge_and_unload(progressbar=True, safe_merge=True)
model.save_pretrained(merge_path)
tokenizer.save_pretrained(merge_path)

Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.03it/s]
Unloading and merging model: 100%|██████████| 678/678 [00:14<00:00, 46.93it/s]


('./app/data/merge/tokenizer_config.json',
 './app/data/merge/special_tokens_map.json',
 './app/data/merge/tokenizer.model',
 './app/data/merge/added_tokens.json',
 './app/data/merge/tokenizer.json')

In [None]:
# del model
# gc.collect()

# model = AutoAWQForCausalLM.from_pretrained(merge_path)
# quant_config = {
#     "zero_point": True,
#     "q_group_size": 128,
#     "w_bit": 4, 
#     "version": "GEMM",
# }
# model.quantize(tokenizer, quant_config=quant_config)
# model.save_quantized(awq_path)
# tokenizer.save_pretrained(awq_path)


In [None]:
# del model
gc.collect()
torch.cuda.empty_cache()

# Inference 
merge_path = "./app/data/model1"

llm = vllm.LLM(
    model=merge_path,
    max_model_len=1028,
    # tensor_parallel_size=1,
    gpu_memory_utilization=.3,
    disable_log_stats=False,
)



INFO 05-26 04:48:12 llm_engine.py:100] Initializing an LLM engine (v0.4.2) with config: model='./app/data/model1', speculative_config=None, tokenizer='./app/data/model1', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=1028, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0, served_model_name=./app/data/model1)
INFO 05-26 04:48:16 model_runner.py:175] Loading model weights took 13.5005 GB
INFO 05-26 04:48:16 gpu_executor.py:114] # GPU blocks: 5038, # CPU blocks: 2048
INFO 05-26 04:48:18 model_runner.py:937] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enfor

In [None]:
prompts = test_df.apply(generate_prompt, tokenizer=tokenizer, training=False, axis=1)

sampling_params = vllm.SamplingParams(
    n=1,
    temperature=0.,
    top_p=0.95,
    max_tokens=256,
)
outputs = llm.generate(list(prompts), sampling_params)

Processed prompts:   0%|          | 0/996 [00:00<?, ?it/s]

INFO 05-26 04:49:08 metrics.py:334] Avg prompt throughput: 45.1 tokens/s, Avg generation throughput: 0.3 tokens/s, Running: 14 reqs, Swapped: 0 reqs, Pending: 982 reqs, GPU KV cache usage: 2.6%, CPU KV cache usage: 0.0%
INFO 05-26 04:49:13 metrics.py:334] Avg prompt throughput: 6537.6 tokens/s, Avg generation throughput: 5197.3 tokens/s, Running: 256 reqs, Swapped: 0 reqs, Pending: 740 reqs, GPU KV cache usage: 77.6%, CPU KV cache usage: 0.0%


Processed prompts:   1%|          | 7/996 [00:09<09:34,  1.72it/s]  

INFO 05-26 04:49:19 metrics.py:334] Avg prompt throughput: 48.6 tokens/s, Avg generation throughput: 5263.8 tokens/s, Running: 228 reqs, Swapped: 0 reqs, Pending: 761 reqs, GPU KV cache usage: 99.8%, CPU KV cache usage: 0.0%


Processed prompts:  23%|██▎       | 232/996 [00:14<00:13, 55.49it/s] 

INFO 05-26 04:49:24 metrics.py:334] Avg prompt throughput: 9137.0 tokens/s, Avg generation throughput: 4214.3 tokens/s, Running: 256 reqs, Swapped: 0 reqs, Pending: 505 reqs, GPU KV cache usage: 64.1%, CPU KV cache usage: 0.0%


Processed prompts:  26%|██▌       | 255/996 [00:17<00:42, 17.43it/s]

INFO 05-26 04:49:29 metrics.py:334] Avg prompt throughput: 565.3 tokens/s, Avg generation throughput: 5580.1 tokens/s, Running: 256 reqs, Swapped: 0 reqs, Pending: 484 reqs, GPU KV cache usage: 92.1%, CPU KV cache usage: 0.0%


Processed prompts:  28%|██▊       | 276/996 [00:24<01:41,  7.11it/s]

INFO 05-26 04:49:34 metrics.py:334] Avg prompt throughput: 2288.4 tokens/s, Avg generation throughput: 4767.6 tokens/s, Running: 30 reqs, Swapped: 0 reqs, Pending: 489 reqs, GPU KV cache usage: 11.7%, CPU KV cache usage: 0.0%


Processed prompts:  49%|████▉     | 491/996 [00:29<00:18, 26.78it/s] 

INFO 05-26 04:49:39 metrics.py:334] Avg prompt throughput: 6697.9 tokens/s, Avg generation throughput: 4745.5 tokens/s, Running: 256 reqs, Swapped: 0 reqs, Pending: 248 reqs, GPU KV cache usage: 77.7%, CPU KV cache usage: 0.0%


Processed prompts:  52%|█████▏    | 518/996 [00:35<01:41,  4.73it/s]

INFO 05-26 04:49:44 metrics.py:334] Avg prompt throughput: 607.0 tokens/s, Avg generation throughput: 4977.6 tokens/s, Running: 244 reqs, Swapped: 0 reqs, Pending: 234 reqs, GPU KV cache usage: 99.9%, CPU KV cache usage: 0.0%


Processed prompts:  74%|███████▎  | 734/996 [00:39<00:02, 114.22it/s]

INFO 05-26 04:49:49 metrics.py:334] Avg prompt throughput: 8130.3 tokens/s, Avg generation throughput: 4407.8 tokens/s, Running: 256 reqs, Swapped: 0 reqs, Pending: 1 reqs, GPU KV cache usage: 59.6%, CPU KV cache usage: 0.0%


Processed prompts:  75%|███████▌  | 750/996 [00:44<00:13, 18.49it/s] 

INFO 05-26 04:49:54 metrics.py:334] Avg prompt throughput: 27.1 tokens/s, Avg generation throughput: 5637.9 tokens/s, Running: 244 reqs, Swapped: 0 reqs, Pending: 0 reqs, GPU KV cache usage: 88.8%, CPU KV cache usage: 0.0%


Processed prompts:  80%|███████▉  | 792/996 [00:50<00:22,  8.95it/s]

INFO 05-26 04:49:59 metrics.py:334] Avg prompt throughput: 865.7 tokens/s, Avg generation throughput: 4909.3 tokens/s, Running: 7 reqs, Swapped: 0 reqs, Pending: 0 reqs, GPU KV cache usage: 3.3%, CPU KV cache usage: 0.0%


Processed prompts: 100%|██████████| 996/996 [00:51<00:00, 19.44it/s] 


In [None]:
for i in range(5):
    test_df[f"output_{i}"] = [output.outputs[i].text.strip() if i < len(output.outputs) else None for output in outputs]


In [None]:
test_df

Unnamed: 0,Key,Style,Tablatures1,Tablatures2,Original_Tablatures1,prompt,output_0,output_1,output_2,output_3,output_4
3641,C#,Singer-Songwriter,e|--5--4--------|-----6--------|-----2--------...,e|--2-----------|--5--6--------|--------------...,e|--5--4--------|-----6--------|-----2--------...,<s>[INST] Give me the guitar four measures fol...,e|-----1--------|-----2--------|-----2--------...,,,,
3477,F#,Jazz,e|--------------|--------------|--------------...,e|--------------|--------------|--------------...,e|--------------|--------------|--------------...,<s>[INST] Give me the guitar four measures fol...,e|--------------|--------------|--------------...,,,,
3758,A,Rock,e|--------------|--------------|--------------...,e|--------------|--------------|--------------...,e|--------------|--------------|--------------...,<s>[INST] Give me the guitar four measures fol...,e|--------------|--------------|--------------...,,,,
905,C#,Funk,e|-----6--6-----|-----6--------|-----6--8-----...,e|-----8---------|----------------|--8--------...,e|-----6--6-----|-----6--------|-----6--8-----...,<s>[INST] Give me the guitar four measures fol...,e|-----8--8-----|-----8--------|-----8--8-----...,,,,
2288,Bb,Jazz,e|--------------|--------------|--------------...,e|--------------|--------------|--------------...,e|--------------|--------------|--------------...,<s>[INST] Give me the guitar four measures fol...,e|--------------|--------------|--------------...,,,,
...,...,...,...,...,...,...,...,...,...,...,...
2972,F#,Jazz,e|--------------|--------------|--------------...,e|--------------|--------------|--------------...,e|--------------|--------------|--------------...,<s>[INST] Give me the guitar four measures fol...,e|--------------|--------------|--------------...,,,,
2614,Bb,Singer-Songwriter,e|--------------|----------------|------------...,e|---------------|----------------|-----------...,e|--------------|----------------|------------...,<s>[INST] Give me the guitar four measures fol...,e|--------------|----------------|------------...,,,,
1042,Ab,Funk,e|--4-----------|--------------|--------------...,e|--------------|--4-----------|--4-----4-----...,e|--4-----------|--------------|--------------...,<s>[INST] Give me the guitar four measures fol...,e|--------------|--------------|--------------...,,,,
2202,E,Singer-Songwriter,e|-----0--------|--0--0--------|-----0--0-----...,e|-----0--0-----|-----0--0-----|--------0-----...,e|-----0--------|--0--0--------|-----0--0-----...,<s>[INST] Give me the guitar four measures fol...,e|-----0--0-----|--0--0--0-----|--0--0--0-----...,,,,


In [None]:
i=12

print(test_df["Original_Tablatures1"].iloc[i])
print("=====")
print(test_df["output_0"].iloc[i])

e|--------------|----------------|---------------|-----------------|
B|--6--6--------|-----9---9------|-----9---------|--9---9---9------|
G|--6--6--------|-----10--10-----|-----10--------|--10--10--10-----|
D|--8--8--------|-----10--10-----|---------------|--10--10--10-----|
A|--------------|----------------|---------------|-----------------|
E|--------------|----------------|---------------|-----------------|

=====
e|----------------|----------------|----------------|----------------|
B|--9--9--9--9-----|--9--9--9--9-----|--9--9--9--9-----|--9--9--9--9-----|
G|--10--10--10-----|--10--10--10-----|--10--10--10-----|--10--10--10-----|
D|--10--10--10-----|--10--10--10-----|--10--10--10-----|--10--10--10-----|
A|-----------------|-----------------|-----------------|-----------------|
E|-----------------|-----------------|-----------------|-----------------|
 ┆e|----------------|----------------|----------------|----------------|
B|--9--9--9--9-----|--9--9--9--9-----|--9--9--9--9-----|--9-

In [None]:
# !python -m vllm.entrypoints.openai.api_server --model ./app/data/merge --dtype auto --max-model-len 1536

In [None]:
def postprocess_output(input:str, output: str) -> str:
    # find the length of measures
    inputs = input.split("\n")
    inputs = [x.split("|") for x in inputs]
    length_measures = len(inputs[0][1])

    output = output.split("\n")
    output_corrected = input.split("\n")

    for i in range(len(output)//6):
        # check 
        for j in range(6):
            if output[i*6+j][0] != inputs[j][0]:

                return output_corrected
            measures = output[i*6+j].split("|")
            for k in range(1,len(measures)-1):
                if len(measures[k]) != length_measures:
                    return output_corrected
            if len(measures[-1]) != 0:
                return output_corrected
        # add the measures
        for j in range(6):
            output_corrected[j] += output[i*6+j][2:]
    return output_corrected


ind = 0
postprocess_output(input = test_df["Original_Tablatures1"].iloc[ind], output=test_df["output_0"].iloc[ind])

['e|--5--4--------|-----6--------|-----2--------|-----1--------|-----1--------|-----2--------|-----2--------|-----2--------|',
 'B|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|',
 'G|--------------|-----6--------|-----2--------|-----2--------|-----2--------|-----2--------|-----2--------|-----2--------|',
 'D|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|',
 'A|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|',
 'E|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|',
 '']