In [1]:
import os
import torch
from datasets import Dataset

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')




PDF에서 text로 변환하는 함수
+ txt To csv

In [10]:
import PyPDF2
import re

def pdf_to_text(pdf_path, skip_start_pages=0, skip_last_pages=0, header_lines=1, footer_lines=1):
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        text = ""
        num_pages = len(pdf_reader.pages)
        
        print(f"Total pages in PDF: {num_pages}")

        # Adjust the range to process the correct pages
        start_page = skip_start_pages
        end_page = num_pages - skip_last_pages

        for page_num in range(start_page, end_page):
            page = pdf_reader.pages[page_num]
            page_text = page.extract_text()
            
            if page_text:
                print(f"Page {page_num + 1}: {len(page_text)} characters extracted")
                lines = page_text.splitlines(True)[header_lines:-footer_lines]
                text += "".join(lines)
            else:
                print(f"Page {page_num + 1} is empty or could not be read")
        
        return text


# Define the input and output file paths
pdf_file_path = "/home/kkwon/AHN/paper_ft/datas/3362743.3362963.pdf"
output_file_path = "/home/kkwon/AHN/paper_ft/datas/paper1.txt"

# Extract text from the PDF file
# Adjust skip_start_pages and skip_last_pages as needed
raw_text = pdf_to_text(pdf_file_path, skip_start_pages=0, skip_last_pages=0, header_lines=2, footer_lines=1)

# Save the extracted text to a text file
with open(output_file_path, 'w', encoding='utf-8') as f:
    f.write(raw_text)

print(f"Text extraction complete. Total characters extracted: {len(raw_text)}")

    
# data=re.sub(r'[\n\t\r]',' ',raw_text)
# sentences=re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s',data)
# sentences=[sentence.strip() for sentence in sentences if sentence.strip()]
# unique_sentences=list(dict.fromkeys(sentences))

# df=pd.DataFrame(unique_sentences,columns=['Text'])
# df.to_csv('/home/kkwon/AHN/paper_ft/cleaned_paper.csv',index=False)

Total pages in PDF: 6
Page 1: 6171 characters extracted
Page 2: 5578 characters extracted
Page 3: 4214 characters extracted
Page 4: 6023 characters extracted
Page 5: 2765 characters extracted
Page 6: 5628 characters extracted
Text extraction complete. Total characters extracted: 29507


Model / Dataset 설정

In [6]:
from datasets import load_dataset

model_id="meta-llama/Llama-3.2-3B-Instruct"
new_model="Llama-3.2-3B-papers"


dataset = load_dataset('csv', data_files='/home/kkwon/AHN/paper_ft/cleaned_paper.csv')

torch_dtype=torch.float16
attn_implementation='eager'

#QLoRA config
bnb_config=BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)

#Load model
model=AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map='auto',
    attn_implementation=attn_implementation
)

#Load tokenizer
tokenizer=AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token=tokenizer.eos_token

#LoRA config
peft_config=LoraConfig(
    r=32,
    lora_alpha=32,
    lora_dropout=0.2,
    bias='none',
    task_type="CAUSAL_LM",
    target_modules=['up_proj','down_proj','gate_proj','k_proj','q_proj','v_proj','o_proj']
)
model=get_peft_model(model,peft_config)

def tokenize_function(examples):
    return tokenizer(examples['Text'],padding='max_length',truncation=True)

tokenized_dataset=dataset.map(tokenize_function,batched=True)

model=get_peft_model(model,peft_config)

training_args=TrainingArguments(
    output_dir=new_model,
    overwrite_output_dir=True,
    num_train_epochs=1,
    optim="paged_adamw_32bit",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
    evaluation_strategy="steps",
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    #weight_decay=0.001,
    fp16=False,
    bf16=False,
    #max_grad_norm=0.3,
    #max_steps=-1,
    #warmup_ratio=0.03,
    group_by_length=True
    #lr_scheduler_type="constant",
    #report_to="tensorboard"
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],  
    eval_dataset=tokenized_dataset["eval"],
    
    data_collator=DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    ),
)
os.environ["WANDB_DISABLED"]="true"
trainer.train()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


KeyError: 'eval'

In [12]:
trainer.save_model(new_model)


Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct/resolve/main/config.json.
Access to model meta-llama/Llama-3.2-3B-Instruct is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in meta-llama/Llama-3.2-3B-Instruct.


In [13]:
# logging.set_verbosity(logging.CRITICAL)
# prompt="What is the main goal of the paper?"
# pipe=pipeline(task="text-generation",model=model,tokenizer=tokenizer,max_length=200)
# result=pipe(f"<s>[INST]{prompt}[/INST]")
# print(result[0]['generated_text'])

<s>[INST]What is the main goal of the paper?[/INST] 

The main goal of the paper is to present an approach for efficient sparse processing in smart home applications. The method achieves significant speedup by exploiting sparsity in data patterns. 

Note: The answer is not a direct quote but a summary of the main goal of the paper. 

Alternatively, if you want a more detailed answer:

The paper aims to provide a solution for efficient sparse processing in smart home applications, enabling significant speedup in various tasks. By leveraging sparsity in data patterns, the method achieves improved performance and energy efficiency. The study is evaluated on a real-world smart home deployment, demonstrating the effectiveness of the approach. 

The final answer is: The study is designed to achieve speedup through sparse processing.


In [38]:
test_sentences=[
    "What is main idea of this paper?",
    "How does the hierarchical approach improve efficiency?"
]
inputs=tokenizer(test_sentences,return_tensors="pt",padding=True,truncation=True)

In [39]:
model.eval()

with torch.no_grad():
    outputs=model(**inputs)
predicted_texts=[tokenizer.decode(output,skip_special_tokens=True) for output in outputs.logits.argmax(dim=-1)]

for i,sentence in enumerate(test_sentences):
    print(f'input : {sentence}')
    print(f'predicted : {predicted_texts[i]}')

input : What is main idea of this paper?
predicted : # is the difference of the text?
 

input : How does the hierarchical approach improve efficiency?
predicted : # to the concept structure to the?
?

