In [1]:
# Run the project's setup script. Judging by the recorded output it logs in to
# Hugging Face and Weights & Biases.
# NOTE(review): clear this cell's output before sharing; it records local paths.
!sh config-llama3.sh

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /mnt/4TData/vuquang/.cache/huggingface/token
Login successful
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /mnt/4TData/vuquang/.netrc


In [2]:
import re
import pandas as pd
import torch
import numpy as np
from markdown import markdown
from bs4 import BeautifulSoup
from datasets import Dataset
from peft import LoraConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def generate_training_prompt(readme, summary, shots):
    """Build the prompt text for one README/summary pair.

    With an empty ``shots`` the prompt is: instruction, README, summary.
    Otherwise every shot's README and description are inserted as worked
    examples between the instruction and the target README/summary.

    Args:
        readme: README text; leading/trailing whitespace is stripped before use.
        summary: Text placed under the final "### Summary:" header.
        shots: Sequence (may be empty) of mappings with string values under the
            'readme' and 'description' keys.

    Returns:
        The assembled prompt string.
    """
    if len(shots) == 0:
        # NOTE(review): the indentation inside this triple-quoted literal is part
        # of the string, so every line after the first carries 8 leading spaces in
        # the prompt. The trailing .strip() only trims the two ends of the result.
        return f"""### Instruction: Summarize the following README contents with LESS THAN 30 words. Your answer should be based on the provided README contents only.

        ### README contents:
        {readme.strip()}

        ### Summary:
        {summary}
        """.strip()
    else:
        prompt = """### Instruction: Summarize the following README contents with LESS THAN 30 words. Your answer should be based on the provided README contents only.
        ### For examples:
        """
        
        # One example section per shot. These literals are indented by 12 spaces
        # (the target section below uses 8) and that indentation ends up in the prompt.
        for i in range(len(shots)):
            prompt += f""" 
            ### README contents: 
            {shots[i]['readme'].strip()}
            
            ### Summary:
            {shots[i]['description'].strip()}            
            """

        # NOTE(review): .strip() binds to the f-string below, not to the whole
        # prompt, so only this last fragment is trimmed before it is appended.
        prompt += f"""
        ### README contents:
        {readme.strip()}

        ### Summary:
        {summary}
        """.strip()
        return prompt

In [4]:
# Identifier of the checkpoint that gets loaded for fine-tuning.
MODEL_NAME = "meta-llama/Meta-Llama-3-8B"

# Adjust the locations below to match your own setup.
OUTPUT_DIR = "./llama3-8b_readme_summarization"
train_csv_file, val_csv_file, test_csv_file = (
    f"../dataset/{split_name}.csv"
    for split_name in ("train", "validation", "test")
)

In [5]:
# Load the three splits, keeping only the README text and its description.
train_df, val_df, test_df = (
    pd.read_csv(csv_path, usecols=['readme', 'description'])
    for csv_path in (train_csv_file, val_csv_file, test_csv_file)
)

In [6]:
"""
    Return the item and blank out its cells in the frame (set to NaN; the row itself is kept). Raise KeyError if not found.
"""
def pop(df : pd.DataFrame, idx : int):
    """Return the row labelled ``idx`` as a dict and blank it out in ``df``.

    The row is not physically removed: its 'readme' and 'description' cells are
    overwritten with NaN in place, so the frame keeps its length and its index
    labels, and later calls can still address other rows by their original label.

    Args:
        df: Frame with 'readme' and 'description' columns; modified in place.
        idx: Index label of the row to take.

    Returns:
        ``{'readme': ..., 'description': ...}`` holding the values the row had
        before it was blanked.

    Raises:
        KeyError: If ``idx`` is not a label of ``df``.
    """
    # Read both cells first so a missing label raises before anything is modified.
    # .at is a single label-based lookup, unlike the chained df[col][idx].
    result = {
        'readme': df.at[idx, 'readme'],
        'description': df.at[idx, 'description'],
    }
    df.at[idx, 'readme'] = np.nan
    df.at[idx, 'description'] = np.nan
    return result

# Function to remove tags
def format_entry(md_data) :
    """Convert Markdown text to plain text with links and code stripped out.

    The Markdown is rendered to HTML, then every ``<a>`` element that has an
    ``href`` attribute (link text included) and every ``style``, ``script``,
    ``img``, ``pre`` and ``code`` element is removed. The remaining text
    fragments are joined with single spaces.

    Args:
        md_data: Markdown source passed to ``markdown()``.

    Returns:
        The whitespace-normalized text that is left.
    """
    html = markdown(md_data)
    # parse html content
    soup = BeautifulSoup(html, "html.parser")
    # find_all is the supported spelling; findAll is a deprecated alias.
    for link in soup.find_all('a', href=True):
        link.decompose()
    for tag in soup.find_all(['style', 'script', 'img', 'pre', 'code']):
        # Remove the element together with everything nested inside it
        tag.decompose()
    # stripped_strings yields each remaining text node without surrounding whitespace
    return ' '.join(soup.stripped_strings)

def process_description(s):
    """Normalize a description into a single sentence ending with one period.

    A trailing period is removed, every remaining ". " is turned into ", ", and
    a single period is appended.

    Args:
        s: Description text.

    Returns:
        The normalized description.
    """
    if s.endswith('.'):
        s = s[:-1]
    # Applied unconditionally: previously this ran only when the input already
    # ended with a period, so "A. B" kept its inner sentence break while "A. B."
    # did not.
    s = re.sub(r"\. ", ", ", s)
    return s + '.'

def clean_text(text):
    """Apply a fixed sequence of regex substitutions to ``text``.

    In order: remove "http..." runs, remove "@..." tokens, collapse whitespace
    runs to one space, replace runs of '#' with a space, and remove
    '^'-prefixed tokens.
    """
    substitutions = (
        (r"http\S+", ""),
        (r"@[^\s]+", ""),
        (r"\s+", " "),
        (r"#+", " "),
        (r"\^[^ ]+", ""),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

In [7]:
# Replace each raw Markdown README with its plain-text rendering.
# Series.map keeps values aligned with the frame's own index labels, whereas
# pairing enumerate()'s positional counter with label-based .at is only correct
# for a default 0..n-1 index (and would append rows for any label it misses).
for split_df in (train_df, val_df, test_df):
    split_df['readme'] = split_df['readme'].map(format_entry)

In [8]:
# Wrap the training and validation frames as Dataset objects.
train_dataset, val_dataset = map(Dataset.from_pandas, (train_df, val_df))

In [9]:
# How many worked examples to put in each prompt; supported values are 0, 1, 2, 3.
num_of_shots = 0
# Example rows used by the prompt builder; starts empty.
shots = list()

In [10]:
# Test-split rows taken as worked examples for each supported shot count.
# Any other value of num_of_shots selects nothing.
shot_rows_by_count = {
    0: (),
    1: (8,),
    2: (8, 10),
    3: (8, 10, 42),
}
for shot_row in shot_rows_by_count.get(num_of_shots, ()):
    shots.append(pop(test_df, shot_row))

In [11]:
def generate_sample_with_prompt(entry):
    """Turn one dataset row into the fields used for training.

    Args:
        entry: Mapping with 'readme' and 'description' values.

    Returns:
        A dict with the cleaned README ('formatted_readme'), the normalized
        description ('summary') and the full prompt built from both together
        with the module-level ``shots`` ('prompt_text').
    """
    cleaned_readme = clean_text(entry['readme'])
    normalized_summary = process_description(entry['description'])
    prompt = generate_training_prompt(cleaned_readme, normalized_summary, shots)
    return dict(
        formatted_readme=cleaned_readme,
        summary=normalized_summary,
        prompt_text=prompt,
    )

def process_dataset(data: Dataset):
    """Shuffle ``data`` with a fixed seed and add the prompt fields to each row.

    The source 'readme' and 'description' columns are removed from the result.
    """
    shuffled = data.shuffle(seed=42)
    with_prompts = shuffled.map(generate_sample_with_prompt)
    return with_prompts.remove_columns(["readme", "description"])

In [12]:
# Build the prompt-formatted training and validation sets.
processed_train_dataset, processed_val_dataset = map(
    process_dataset, (train_dataset, val_dataset)
)

Map: 100%|██████████| 5831/5831 [00:01<00:00, 4115.50 examples/s]
Map: 100%|██████████| 834/834 [00:00<00:00, 4317.80 examples/s]


In [13]:
# Load the tokenizer for MODEL_NAME.
# NOTE(review): truncation is normally a per-call tokenizer option; confirm that
# passing truncation=True to from_pretrained has any effect.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME, 
    truncation=True
)

# Reuse the end-of-sequence token for padding and pad on the right.
# (presumably because the checkpoint defines no pad token - TODO confirm)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
    
# 4-bit NF4 quantization with double quantization; computation runs in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

# Load the quantized base model, letting device_map="auto" decide placement.
# NOTE(review): trust_remote_code=True permits running code shipped with the
# model repository; confirm it is actually required for this checkpoint.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    use_safetensors=True,
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map="auto"
)
    
# LoRA hyperparameters and the names of the modules that receive adapters.
lora_r = 16
lora_alpha = 64
lora_dropout = 0.1
lora_target_modules = [
    "q_proj",
    "up_proj",
    "o_proj",
    "k_proj",
    "down_proj",
    "gate_proj",
    "v_proj",
]


# Adapter configuration handed to the trainer.
peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules=lora_target_modules,
    bias="none",
    task_type="CAUSAL_LM",
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 4/4 [02:31<00:00, 37.77s/it]


In [14]:
# Settings for the fine-tuning run, grouped by concern.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy`; verify against the installed version before upgrading.
training_arguments = TrainingArguments(
    # Run identity, output location and reporting
    output_dir=OUTPUT_DIR,
    run_name="llama3-8b-readsum",
    report_to="wandb",
    seed=42,
    # Batching
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    group_by_length=True,
    # Optimisation
    optim="paged_adamw_32bit",
    learning_rate=1e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    max_grad_norm=0.3,
    num_train_epochs=4,
    fp16=True,
    # Logging, evaluation and checkpointing
    logging_steps=100,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_safetensors=True,
    push_to_hub=True,
)

In [15]:
# Assemble the supervised fine-tuning trainer: model pieces first, then data.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    peft_config=peft_config,
    args=training_arguments,
    train_dataset=processed_train_dataset,
    eval_dataset=processed_val_dataset,
    # Column holding the full prompt text, and the token limit applied to it.
    dataset_text_field="prompt_text",
    max_seq_length=512,
)

Map: 100%|██████████| 5831/5831 [00:02<00:00, 2358.62 examples/s]
Map: 100%|██████████| 834/834 [00:00<00:00, 2623.02 examples/s]


In [16]:
# Run fine-tuning with the arguments configured above (reports to "wandb").
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mbunbohue1906[0m ([33mlocseo[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
0,1.9292,1.914178
2,0.9958,1.741249


In [None]:
# Save the trained model (presumably into OUTPUT_DIR, the trainer's output_dir - confirm).
trainer.save_model()

In [None]:
# Upload the result to the Hugging Face Hub.
# NOTE(review): push_to_hub=True is also set in TrainingArguments; confirm this
# explicit call is still needed.
trainer.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/bunbohue/llama3-8b_readme_summarization/commit/2770da36791f84dcad17516f41523e77042859b2', commit_message='End of training', commit_description='', oid='2770da36791f84dcad17516f41523e77042859b2', pr_url=None, pr_revision=None, pr_num=None)