In [9]:
import torch
import re
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM, BitsAndBytesConfig
from peft import PeftModel
from markdown import markdown
from bs4 import BeautifulSoup

In [11]:
def generate_testing_prompt(readme, shots):
    """Assemble the inference prompt that asks the model to summarize ``readme``.

    Args:
        readme: README text to summarize. Surrounding whitespace is stripped
            before the text is placed in the prompt.
        shots: List of worked examples, each a mapping with ``'readme'`` and
            ``'description'`` entries. An empty list gives a zero-shot prompt.

    Returns:
        The prompt string. It always ends with the ``### Summary:`` header so
        that the model's continuation is the summary itself.

    Note:
        The section headers carry fixed leading indentation (8 spaces around
        the query, 12 spaces inside each example) and some trailing spaces.
        That whitespace is part of the emitted prompt, so it is spelled out
        explicitly here rather than left implicit in an indented literal.
    """
    instruction = (
        "### Instruction: You are a helpful assistant. "
        "You need to summarize the following README contents. "
        "A good answer should be based on the provided README contents only "
        "and LESS THAN 20 words."
    )
    pad8, pad12 = " " * 8, " " * 12

    if len(shots) > 0:
        # One fragment per worked example, concatenated in the given order.
        examples = "".join(
            f" \n{pad12}### README contents: \n{pad12}{shot['readme'].strip()}\n"
            f"{pad12}\n{pad12}### Summary:\n{pad12}{shot['description'].strip()}{pad12}\n{pad12}"
            for shot in shots
        )
        preamble = f"{instruction}\n{pad8}### For examples:\n{pad8}{examples}"
    else:
        preamble = f"{instruction}\n\n{pad8}"

    # The query section is identical in both modes and starts right where the preamble ends.
    return f"{preamble}### README contents:\n{pad8}{readme.strip()}\n\n{pad8}### Summary:"

def format_entry(md_data):
    """Render Markdown to HTML and return only the remaining visible text.

    Hyperlinks (``<a>`` tags that have an ``href``, together with their link
    text) and all ``style``, ``script``, ``img``, ``pre`` and ``code`` elements
    are removed from the rendered document first.

    Args:
        md_data: Markdown source text.

    Returns:
        The surviving text fragments, each stripped and joined by single spaces.
    """
    html = markdown(md_data)
    soup = BeautifulSoup(html, "html.parser")

    # `find_all` is the supported method name; `findAll` is a deprecated alias.
    for anchor in soup.find_all('a', href=True):
        anchor.decompose()

    # Searched only after the anchors are gone, so nothing nested in a removed link is revisited.
    for element in soup.find_all(['style', 'script', 'img', 'pre', 'code']):
        element.decompose()

    return ' '.join(soup.stripped_strings)

def clean_text(text):
    """Apply a fixed sequence of regex clean-ups to ``text`` and return the result.

    The substitutions run in the order listed below. Because runs of ``#`` are
    replaced by a space *after* whitespace has been collapsed, and ``^`` tokens
    are deleted last, the result may still contain consecutive spaces.

    Args:
        text: Input string.

    Returns:
        The cleaned string.
    """
    substitutions = (
        (r"http\S+", ""),   # "http" and every non-whitespace character that follows it
        (r"@[^\s]+", ""),   # "@" and the non-whitespace run after it
        (r"\s+", " "),      # any whitespace run -> one space
        (r"#+", " "),       # runs of "#" -> one space
        (r"\^[^ ]+", ""),   # "^" and the run of non-space characters after it
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

def process_description(s: str) -> str:
    """Normalize a description into one sentence that ends with a single period.

    A trailing ``'.'`` is dropped, every interior ``". "`` is rewritten to
    ``", "``, and one ``'.'`` is appended.

    The interior rewrite used to run only when the input already ended with a
    period, so ``"A. B"`` and ``"A. B."`` were normalized differently. Both now
    yield ``"A, B."``.

    Args:
        s: Raw description text.

    Returns:
        The normalized description.
    """
    if s.endswith('.'):
        s = s[:-1]
    # The match is a literal ". ", so plain substring replacement is enough.
    return s.replace(". ", ", ") + '.'

In [34]:
# Use the first GPU when one is visible; otherwise fall back to CPU so the cell still runs.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
checkpoint = "bunbohue/bart-large-xsum_readme_summarization"

# Load the seq2seq checkpoint with its tokenizer and place the model on the chosen device.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint).to(device)

In [35]:
# Sample README text used as the summarization input in the next cell.
# The text looks like the repr of a bytes object: the b'...' wrapper is part of the string itself.
# This is a regular (non-raw) literal, so every \n escape below becomes a real newline.
readme = """
b'KNOWLEDGE\n=========\nStudying goals, curated lists of awesome information, life achievements and more. \n\nMotivation\n----------\n"If you cant even clean up your own room, who the hell are you to give advice to the world?" - Jordan Peterson\n\nSoftware Engineering\n====================\n\n\nData Structures\n---------------\n  Linked List\n  Queue\n  Stack\n  Graph\n  Hash Map\n  Tree Map\n  Hash Set\n  Tree\n  Array\n  AVL Tree\n  Red Black Tree\n  LRU Cache\n  Bloom Filter\n\nDesign Patterns\n---------------\n  Singleton\n  Factory\n  Strategy\n  Observer\n  Builder\n  Adapter\n  State\n  Bridge\n  Visitor\n  Abstract Factory\n\nAlgorithms\n----------\n  Bubble Sort\n  Insertion Sort\n  Selection Sort\n  Quick Sort\n  Merge Sort\n\nPrinciples\n----------\n\n\nLanguages\n---------\n\n  C++\n\n\n\n\n  Java\n  Kotlin\n\n\n  Python\n  Javascript\n\n\n\n\n  Go\n\n\n  Functional Programming\n  Haskell\n  Scala\n  Elm\n\nShaders\n-------\n\n\nDevOps\n======\n  Kubernetes\n  Jenkins\n  Docker\n\nCrypto\n======\n  Bitcoin\n  Etherium\n\n\n  Smart Contracts\n\n\nMachine Learning\n================\n\nMathematics\n===========\n\nHardware\n========\n\nMicrocontrollers\n----------------\n  Raspberry PI\n  Arduino\n  ESP8266\n\nGraphic Design\n=================\n  Illustrator\n  Photoshop\n  Figma\n\nLearning Techniques\n===================\n  Pomodoro Technique\n1. Choose a task to be accomplished.\n2. Set the Pomodoro to 25 minutes (the Pomodoro is the timer).\n3. Work on the task until the Pomodoro rings, then put a check on your sheet of paper.\n4. Take a short break (5 minutes is OK).\n5. Every 4 Pomodoros take a longer break.\n'
"""

In [36]:
# Summarize the sample README with the seq2seq model using deterministic (non-sampling) decoding.
prefix = "summarize: "
inputs = tokenizer(
    prefix + readme,
    truncation=True,
    return_tensors="pt",
)["input_ids"].to(device)
outputs = model.generate(
    inputs,
    do_sample=False,
    max_new_tokens=128,
)
# Only the first (and only) sequence of the batch is decoded.
prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(prediction)

Studying goals, curated lists of awesome information, life achievements and more. 


## Inference with LLM (Llama 2)

In [27]:
# Hub id of the base causal language model.
MODEL_NAME = "meta-llama/Llama-2-7b-hf"
# First CUDA device; the tokenized prompt is moved here before generation.
DEVICE = torch.device("cuda", 0)
# Directory passed to PeftModel.from_pretrained below, i.e. where the adapter is read from.
OUTPUT_DIR = "./zero-shot-prompting-llama-2-7b_readsum_29-6-2024"

In [28]:
# Read the test split, keeping only the 'readme' and 'description' columns.
test_df = pd.read_csv(
    '../dataset/updated_test.csv',
    usecols=['readme', 'description'],
)

In [29]:
# Second argument handed to `pop` for each in-context example, in the order the examples are added.
SHOT_ROW_IDS = [8, 10]

shots = []
num_of_shots = 0

# Fail loudly on an unsupported count instead of silently falling back to zero-shot.
if not 0 <= num_of_shots <= len(SHOT_ROW_IDS):
    raise ValueError(
        f"num_of_shots must be between 0 and {len(SHOT_ROW_IDS)}, got {num_of_shots}"
    )

# NOTE(review): `pop` is not defined in any visible cell. Presumably it removes the given row from
# test_df and returns it as a mapping with 'readme'/'description' keys — confirm it is defined
# before setting num_of_shots above 0.
for row_id in SHOT_ROW_IDS[:num_of_shots]:
    shots.append(pop(test_df, row_id))

In [30]:
# 4-bit NF4 quantization with double quantization; compute runs in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

# NOTE(review): `truncation` is normally a call-time tokenizer argument (it is also passed when the
# prompt is tokenized further down); confirm whether passing it at load time has any effect.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    truncation=True
)
# Reuse the end-of-sequence token as the padding token and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# `trust_remote_code` is intentionally not enabled: Llama-2 is a built-in transformers
# architecture, so there is no reason to allow execution of code shipped in a Hub repository.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    use_safetensors=True,
    quantization_config=bnb_config,
    device_map="auto"
)

# Wrap the quantized base model with the PEFT adapter stored in OUTPUT_DIR.
model = PeftModel.from_pretrained(model, OUTPUT_DIR)

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.16s/it]


In [32]:
# Sample README text for the LLM run. The b'...' wrapper is part of the string itself, and because
# this is a regular (non-raw) literal every \n escape below becomes a real newline.
readme = """b'KNOWLEDGE\n=========\nStudying goals, curated lists of awesome information, life achievements and more. \n\nMotivation\n----------\n"If you cant even clean up your own room, who the hell are you to give advice to the world?" - Jordan Peterson\n\nSoftware Engineering\n====================\n\n\nData Structures\n---------------\n  Linked List\n  Queue\n  Stack\n  Graph\n  Hash Map\n  Tree Map\n  Hash Set\n  Tree\n  Array\n  AVL Tree\n  Red Black Tree\n  LRU Cache\n  Bloom Filter\n\nDesign Patterns\n---------------\n  Singleton\n  Factory\n  Strategy\n  Observer\n  Builder\n  Adapter\n  State\n  Bridge\n  Visitor\n  Abstract Factory\n\nAlgorithms\n----------\n  Bubble Sort\n  Insertion Sort\n  Selection Sort\n  Quick Sort\n  Merge Sort\n\nPrinciples\n----------\n\n\nLanguages\n---------\n\n  C++\n\n\n\n\n  Java\n  Kotlin\n\n\n  Python\n  Javascript\n\n\n\n\n  Go\n\n\n  Functional Programming\n  Haskell\n  Scala\n  Elm\n\nShaders\n-------\n\n\nDevOps\n======\n  Kubernetes\n  Jenkins\n  Docker\n\nCrypto\n======\n  Bitcoin\n  Etherium\n\n\n  Smart Contracts\n\n\nMachine Learning\n================\n\nMathematics\n===========\n\nHardware\n========\n\nMicrocontrollers\n----------------\n  Raspberry PI\n  Arduino\n  ESP8266\n\nGraphic Design\n=================\n  Illustrator\n  Photoshop\n  Figma\n\nLearning Techniques\n===================\n  Pomodoro Technique\n1. Choose a task to be accomplished.\n2. Set the Pomodoro to 25 minutes (the Pomodoro is the timer).\n3. Work on the task until the Pomodoro rings, then put a check on your sheet of paper.\n4. Take a short break (5 minutes is OK).\n5. Every 4 Pomodoros take a longer break.\n'"""
# Build the prompt for this README, using the examples collected in `shots` (empty means zero-shot).
prompt = generate_testing_prompt(readme, shots)

In [33]:
# Tokenize the prompt (truncated to at most 4096 tokens) and move the tensors to DEVICE.
inputs = tokenizer(prompt, max_length=4096, truncation=True, return_tensors="pt").to(DEVICE)
inputs_length = len(inputs["input_ids"][0])
with torch.inference_mode():
    # Greedy decoding requested explicitly. The earlier `temperature=0.0001` only approximated it,
    # and a near-zero temperature is fragile when sampling in half precision.
    outputs = model.generate(**inputs, max_new_tokens=100, do_sample=False)
# Skip the first `inputs_length` positions (the prompt) so only the continuation is decoded.
prediction = tokenizer.decode(outputs[0][inputs_length:], skip_special_tokens=True)
prediction

'\n        Studying goals, curated lists of awesome information, life achievements and more. . 📚 . 🎓 . 📖 . 🎮 . 🎨 . 📱 . 💻 . 📦 . 📕 . 📑 . 📓 . 📙 . 📜 . �'