# Final Project of the NLP 2024 Course

Slides: https://docs.google.com/presentation/d/1NbH4E2HKVHQlaW_ivKCyjpWuEJFvmz3bSKsX8fs67tA/edit#slide=id.g2d17364e0e4_0_34


## Environment Setup

Get your own Hugging Face access token at
https://huggingface.co/settings/tokens

Then store it as a Colab secret named `HF_TOKEN`.

In [1]:
!pip install transformers accelerate

Collecting accelerate
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.w

## Using the pre-trained model

In [2]:
from google.colab import drive
# Mount Google Drive under /content/drive; the checkpoint path used later lives below this mount point.
drive.mount('/content/drive')



Mounted at /content/drive


In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from google.colab import userdata

# Directory on the mounted Drive that holds the fine-tuned checkpoint.
driveDir = '/content/drive/MyDrive/1122自然語言處理/自然語言處理期末報告/Model/fine-tuned-model'

# Load the tokenizer and the model once at notebook level so later cells can
# reuse the same objects instead of reloading them on every call.
tokenizer = AutoTokenizer.from_pretrained(
    driveDir,
    trust_remote_code=True,
    token=userdata.get('HF_TOKEN'),
)
model = AutoModelForCausalLM.from_pretrained(
    driveDir,
    trust_remote_code=True,
)

In [4]:
# Sanity check: `generate` treats string arguments as paths to load and anything
# else as ready-to-use objects. Print 111 for a string and 222 for an object,
# first for the model, then for the tokenizer.
print(
    *(111 if isinstance(candidate, str) else 222 for candidate in (model, tokenizer)),
    sep='\n',
)

222
222


In [5]:
"""Module to generate OpenELM output given a model and an input prompt."""
import argparse
import logging
import os
import time
from typing import Optional, Tuple, Union

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

from google.colab import userdata


# The following function is revised from https://huggingface.co/apple/OpenELM/blob/main/generate_openelm.py
def generate(
    prompt: str,
    model: Union[str, AutoModelForCausalLM],
    hf_access_token: Optional[str] = None,
    tokenizer: Union[str, AutoTokenizer] = 'meta-llama/Llama-2-7b-hf',
    device: Optional[str] = None,
    max_length: int = 1024,
    assistant_model: Optional[Union[str, AutoModelForCausalLM]] = None,
    generate_kwargs: Optional[dict] = None,
) -> Tuple[str, float]:
    """ Generates output given a prompt.
    Args:
        prompt: The string prompt.
        model: The LLM Model. If a string is passed, it should be the path to
            the hf converted checkpoint and the model is loaded from it.
        hf_access_token: Hugging face access token. Only required when
            `tokenizer` is given as a string, because that is the only place
            where the token is used.
        tokenizer: Tokenizer instance, or a string naming the tokenizer to
            load with `hf_access_token`.
        device: String representation of device to run the model on. If None
            and cuda available it would be set to cuda:0 else cpu.
        max_length: Maximum length of tokens, input prompt + generated tokens.
        assistant_model: If set, this model will be used for
            speculative generation. If a string is passed, it should be the
            path to the hf converted checkpoint.
        generate_kwargs: Extra kwargs passed to the hf generate function.
            Entries here take precedence over the defaults this function
            sets (`max_length`, `pad_token_id`, `assistant_model`).
    Returns:
        A tuple `(output_text, generation_time)`:
        output_text: the newly generated text (prompt tokens are cut off).
        generation_time: generation time in seconds.
    Raises:
        ValueError: If device is set to CUDA but no CUDA device is detected.
        ValueError: If tokenizer is not set.
        ValueError: If tokenizer is a string and hf_access_token is not
            specified.
    """
    if not device:
        if torch.cuda.is_available() and torch.cuda.device_count():
            device = "cuda:0"
            logging.warning(
                'inference device is not set, using cuda:0, %s',
                torch.cuda.get_device_name(0)
            )
        else:
            device = 'cpu'
            logging.warning(
                (
                    'No CUDA device detected, using cpu, '
                    'expect slower speeds.'
                )
            )

    if 'cuda' in device and not torch.cuda.is_available():
        raise ValueError('CUDA device requested but no CUDA device detected.')

    if not tokenizer:
        raise ValueError('Tokenizer is not set in the generate function.')

    # The token is only consumed when the tokenizer has to be loaded by name;
    # an already-instantiated tokenizer needs no credentials.
    if isinstance(tokenizer, str) and not hf_access_token:
        raise ValueError((
            'Hugging face access token needs to be specified. '
            'Please refer to https://huggingface.co/docs/hub/security-tokens'
            ' to obtain one.'
            )
        )

    if isinstance(model, str):
        model = AutoModelForCausalLM.from_pretrained(
            model,
            trust_remote_code=True
        )
    model.to(device).eval()

    if isinstance(tokenizer, str):
        tokenizer = AutoTokenizer.from_pretrained(
            tokenizer,
            token=hf_access_token,
        )

    # Speculative mode
    draft_model = None
    if assistant_model:
        draft_model = assistant_model
        if isinstance(assistant_model, str):
            draft_model = AutoModelForCausalLM.from_pretrained(
                assistant_model,
                trust_remote_code=True
            )
        draft_model.to(device).eval()

    # Prepare the prompt as a batch holding a single sequence.
    tokenized_prompt = torch.tensor(
        tokenizer(prompt)['input_ids'],
        device=device
    ).unsqueeze(0)

    # Merge the defaults with the caller's extras in one dict so that an
    # overlapping key overrides the default instead of raising
    # "got multiple values for keyword argument".
    generation_args = {
        'max_length': max_length,
        'pad_token_id': 0,
        'assistant_model': draft_model,
    }
    if generate_kwargs:
        generation_args.update(generate_kwargs)

    # Generate
    stime = time.perf_counter()
    output_ids = model.generate(tokenized_prompt, **generation_args)
    generation_time = time.perf_counter() - stime

    # Decode only the tokens that follow the prompt.
    output_text = tokenizer.decode(
        output_ids[0][tokenized_prompt.shape[1]:].tolist(),
        skip_special_tokens=True
    )

    return output_text, generation_time

## Implement your main function here
The input `abstract` is a `str` containing the abstract of a research paper.
Your function will be called to return the **sentence(s)** from the `abstract` that describe the **research methodology**.

In [6]:
def extract_sentence(abstract: str) -> str:
    """Ask the loaded model for the methodology sentences of ``abstract``.

    The abstract is wrapped in a triple-backtick fence and appended to a fixed
    instruction. The text returned by ``generate`` is handed back unchanged;
    the generation time it also reports is discarded.
    """
    instruction = "From the following abstract, extract the sentences that shows the methods of the research. Only the sentences from the abstract, no other information."
    prompt = f"{instruction}\n\n\n```{abstract}```"

    # Reuse the notebook-level `model` and `tokenizer` objects on every call.
    # A checkpoint name such as "apple/OpenELM-1_1B-Instruct" could be passed
    # as `model` instead, since `generate` also accepts a string.
    answer, _elapsed = generate(
        prompt=prompt,
        model=model,
        tokenizer=tokenizer,
        hf_access_token=userdata.get('HF_TOKEN'),
    )
    return answer

Your function is expected to be used as follows.

In [7]:
# Sample abstract used to demonstrate the expected call; each line holds one sentence.
abstract = """The reliability of self-labeled data is an important issue when the data are regarded as ground-truth for training and testing learning-based models.
This paper addresses the issue of false-alarm hashtags in the self-labeled data for irony detection.
We analyze the ambiguity of hashtag usages and propose a novel neural network-based model, which incorporates linguistic information from different aspects, to disambiguate the usage of three hashtags that are widely used to collect the training data for irony detection.
Furthermore, we apply our model to prune the self-labeled training data.
Experimental results show that the irony detection model trained on the less but cleaner training instances outperforms the models trained on all data."""

# Run the extractor once on the sample and print what the model returned.
predicted = extract_sentence(abstract)
print(predicted)






```The reliability of self-labeled data is an important issue when the data are regarded as ground-truth for training and testing learning-based models.
This paper addresses the issue of false-alarm hashtags in the self-labeled data for irony detection.
We analyze the ambiguity of hashtag usages and propose a novel neural network-based model, which incorporates linguistic information from different aspects, to disambiguate the usage of three hashtags that are widely used to collect the training data for irony detection.
Furthermore, we apply our model to prune the self-labeled training data.
Experimental results show that the irony detection model trained on the less but cleaner training instances outperforms the models trained on all data.```


```The reliability of self-labeled data is an important issue when the data are regarded as ground-truth for training and testing learning-based models.
This paper addresses the issue of false-alarm hashtags in the self-labeled data for iron

## Evaluation

We will evaluate your module with a closed test set.
The sentence returned by your function will be compared with a golden reference.
The evaluation metric is `ROUGE-L`, which measures the overlap ratio between a predicted output and a reference. The details will be introduced in class.

In [8]:
!pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=bb3a41994a037f5cbefb78af158ac4ac29dc4a59698937395d6a12c68dee9eca
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [9]:
from rouge_score import rouge_scorer
# Scorer restricted to ROUGE-L, the metric named in the evaluation description.
scorer = rouge_scorer.RougeScorer(['rougeL'])

In [10]:
# Gold methodology sentences for the sample abstract.
reference = """We analyze the ambiguity of hashtag usages and propose a novel neural network-based model, which incorporates linguistic information from different aspects, to disambiguate the usage of three hashtags that are widely used to collect the training data for irony detection. Furthermore, we apply our model to prune the self-labeled training data."""

# The reference is passed first and the prediction second; print the ROUGE-L F-measure.
print(scorer.score(reference, predicted)['rougeL'].fmeasure)

0.16987179487179488


In [11]:
# Show the raw string (repr) so newlines and backtick fences in the model output are visible.
predicted

'\n\n\n```The reliability of self-labeled data is an important issue when the data are regarded as ground-truth for training and testing learning-based models.\nThis paper addresses the issue of false-alarm hashtags in the self-labeled data for irony detection.\nWe analyze the ambiguity of hashtag usages and propose a novel neural network-based model, which incorporates linguistic information from different aspects, to disambiguate the usage of three hashtags that are widely used to collect the training data for irony detection.\nFurthermore, we apply our model to prune the self-labeled training data.\nExperimental results show that the irony detection model trained on the less but cleaner training instances outperforms the models trained on all data.```\n\n\n```The reliability of self-labeled data is an important issue when the data are regarded as ground-truth for training and testing learning-based models.\nThis paper addresses the issue of false-alarm hashtags in the self-labeled d

In [12]:
def evaluate(foo):
    """Score an extraction function on the course test set with ROUGE-L.

    The test inputs and the gold references are downloaded and read in
    parallel, one line per test case. Each input line is decoded as UTF-8 and
    handed to `foo` exactly as read (the trailing newline is not stripped),
    and the returned text is compared with the matching gold line.

    Args:
        foo: Callable that takes one abstract as a `str` and returns the
            predicted sentence(s) as a `str`.
    Returns:
        The mean ROUGE-L F-measure over all test cases.
    Raises:
        ValueError: If the downloaded test set contains no test case.
    """
    import urllib.request
    test = "https://www.cs.nccu.edu.tw/~hhhuang/courses/nlp2024/test2024.in"
    gold = "https://www.cs.nccu.edu.tw/~hhhuang/courses/nlp2024/test2024.gold"

    from rouge_score import rouge_scorer
    scorer = rouge_scorer.RougeScorer(['rougeL'])

    total = 0
    cnt = 0
    # Separate names for the response objects: the URL strings stay intact
    # and the builtin `input` is not shadowed.
    with urllib.request.urlopen(test) as test_stream, \
         urllib.request.urlopen(gold) as gold_stream:
        for raw_abstract, raw_ref in zip(test_stream, gold_stream):
            abstract_text = raw_abstract.decode("utf-8")
            ref = raw_ref.decode("utf-8")
            output = foo(abstract_text)
            score = scorer.score(ref, output)['rougeL'].fmeasure
            cnt += 1
            total += score
            print("Test case %d: %g" % (cnt, score))

    # Fail with an explicit message instead of a bare ZeroDivisionError.
    if cnt == 0:
        raise ValueError('The test set is empty; no test case was evaluated.')

    overall = total / cnt
    print("Overall: %g" % overall)
    return overall

# Your working function is `extract_sentence`, so evaluate it with the following statement.
evaluate(extract_sentence)



Test case 1: 0.278164




Test case 2: 0.319574




Test case 3: 0.312012




Test case 4: 0.200318




Test case 5: 0.123779




Test case 6: 0.165079




Test case 7: 0.111801




Test case 8: 0.0381944




Test case 9: 0.0429338




Test case 10: 0.122271




Test case 11: 0.238994




Test case 12: 0.287443




Test case 13: 0.353553




Test case 14: 0.142248




Test case 15: 0.356164




Test case 16: 0.0675676




Test case 17: 0.274882




Test case 18: 0.384831




Test case 19: 0.288828




Test case 20: 0.088685




Test case 21: 0.166667




Test case 22: 0.142596




Test case 23: 0.112805




Test case 24: 0.14




Test case 25: 0.0834725




Test case 26: 0.346749




Test case 27: 0.296089




Test case 28: 0.323484




Test case 29: 0.197461




Test case 30: 0.649231




Test case 31: 0.369281




Test case 32: 0.0325926




Test case 33: 0.209106




Test case 34: 0.173776




Test case 35: 0.141667




Test case 36: 0.0875203




Test case 37: 0.0378788




Test case 38: 0.105622




Test case 39: 0.00884956




Test case 40: 0.177253




Test case 41: 0.142114




Test case 42: 0.258359




Test case 43: 0.304478




Test case 44: 0.382979




Test case 45: 0.0250896




Test case 46: 0.0362438




Test case 47: 0.0868217




Test case 48: 0.177858




Test case 49: 0.00648298




Test case 50: 0.045977




Test case 51: 0.287051




Test case 52: 0.288499




Test case 53: 0.111929




Test case 54: 0.0573888




Test case 55: 0.234206




Test case 56: 0.144231




Test case 57: 0.033264




Test case 58: 0.259375




Test case 59: 0.285714




Test case 60: 0.369327




Test case 61: 0.197183




Test case 62: 0.0361664




Test case 63: 0.255537




Test case 64: 0.266118




Test case 65: 0.0488599




Test case 66: 0.116




Test case 67: 0.067086




Test case 68: 0.0743034




Test case 69: 0.251429




Test case 70: 0.309963




Test case 71: 0.119097




Test case 72: 0.204633




Test case 73: 0.224852




Test case 74: 0.120623




Test case 75: 0.10327




Test case 76: 0.046133




Test case 77: 0.166008




Test case 78: 0.219653




Test case 79: 0.0662824




Test case 80: 0.204204




Test case 81: 0.137285




Test case 82: 0.106061




Test case 83: 0.218409




Test case 84: 0




Test case 85: 0.211356




Test case 86: 0.0542857




Test case 87: 0.105691




Test case 88: 0.0983607




Test case 89: 0.0634921




Test case 90: 0.389776




Test case 91: 0.189474




Test case 92: 0.18887




Test case 93: 0.296435




Test case 94: 0




Test case 95: 0




Test case 96: 0.412331




Test case 97: 0.33271




Test case 98: 0.0645161




Test case 99: 0.0820189




Test case 100: 0.422665
Overall: 0.180079


0.1800794876756535