In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import login
from datasets import Dataset
import pandas as pd
import torch
from pathlib import Path
from dotenv import load_dotenv
import os

load_dotenv()


def _env_path(var_name):
    """Return the path stored in environment variable ``var_name`` as an absolute Path.

    ``~`` is expanded before resolving, so all three root directories are
    handled the same way.

    Raises:
        RuntimeError: if the variable is not set. Without this check,
            ``Path(None)`` fails with a TypeError that does not name the
            missing variable.
    """
    raw_value = os.getenv(var_name)
    if raw_value is None:
        raise RuntimeError(
            f"Environment variable {var_name!r} is not set; "
            "define it in the .env file or in the shell before running this notebook."
        )
    return Path(raw_value).expanduser().resolve()


PROJECT_ROOT = _env_path('PROJECT_ROOT')
MODEL_ROOT = _env_path('MODEL_ROOT')
DATA_ROOT = _env_path('DATA_ROOT')

# Authenticate against the Hugging Face Hub with the token read from the environment.
login(token=os.getenv("HF_TOKEN"))

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [4]:
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Checkpoint loaded by this notebook. An alternative path is kept for reference:
# model_path = Path(str(os.getenv("MODEL_ROOT"))).resolve() / 'sft' / 'TinyLlama' / 'TinyLlama-1.1B-Chat-v1.0'
model_path = str(MODEL_ROOT.joinpath("experiment2", "cyclic", "checkpoint-1000"))

# Tokenizer and causal-LM weights are both read from the same checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)
model = model.to(device)
# Replace the model's `warnings_issued` attribute with an empty dict.
model.warnings_issued = {}

Some weights of the model checkpoint at /src/gs25009/LLM_DAG_ALLIGN/models/experiment2/cyclic/checkpoint-1000 were not used when initializing LlamaForCausalLM: ['v_head.summary.bias', 'v_head.summary.weight']
- This IS expected if you are initializing LlamaForCausalLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LlamaForCausalLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
# Read test.csv from the 'cnn_dailymail' directory under DATA_ROOT with pandas,
# then wrap the resulting frame in a `datasets.Dataset`.
dataset_path = DATA_ROOT.joinpath('cnn_dailymail')
df = pd.read_csv(str(dataset_path.joinpath('test.csv')))
dataset = Dataset.from_pandas(df)

In [6]:
# Show row 1942: its 'article' field, then its 'highlights' field,
# followed by the total number of rows in the dataset.
example = dataset[1942]
for column in ('article', 'highlights'):
    print(example[column])
print(len(dataset))

Binge drinking as a teenager can cause long-lasting damage to the brain well into adulthood, a new study has warned. This is because drinking excessive amounts of alcohol when young can damage the brain and cause permanent changes to DNA. This, in turn, can put teenagers at risk of anxiety disorders and alcoholism, researchers found. Alcohol changes genes in brain cells, which stop the cells developing connections between them, altering behaviour, they said. However, they discovered a cancer drug may reverse the damage. Binge drinking as a teenager can cause long-lasting damage into adulthood, putting young people at risk of anxiety disorders and alcoholism . Professor Subhash Pandey, of the University of Illinois College of Medicine, explained that binge drinking during the teenage years disrupts the brain's normal development. He said: 'Our study provides a mechanism for how binge-drinking during adolescence may lead to lasting changes that result in increased anxiety and alcoholism 

In [8]:
# test: generate a continuation for a single article and print the full decoded
# sequence (prompt included).
article = dataset[1941]['article']
messages = [
    {
        "role": "user",
        "content": f"{article}\n\nTL;DR:"
    }
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
# One unpadded sequence, so every position is a real token.
attention_mask = torch.ones_like(input_ids)

# attention_mask and pad_token_id are passed explicitly, matching the
# get_summary* helpers in this notebook, so generate() does not have to infer them.
output_ids = model.generate(
    input_ids,
    attention_mask=attention_mask,
    max_new_tokens=400,
    pad_token_id=tokenizer.eos_token_id,
)
output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print(output_text)

<|user|>
Christian Benteke rescued Aston Villa from the drop zone with a brilliant late free-kick to seal his hat-trick and salvage a point against relegation rivals Queens Park Rangers. Tim Sherwood’s team were set to drop into the bottom three when Charlie Austin scored his 17th Premier League goal of the season 12 minutes from the end, before Benteke’s intervention five minutes later to cap off a pulsating game. The point lifts QPR above Burnley to 18th on goal difference, while Villa move above Hull to 16th. Tim Sherwood was disappointed that Aston Villa's domination of QPR did not result in three points for his side . Sherwood did concede that the point taken could be vital and praised Christian Benteke for his hat-trick . Sherwood said: ‘I’m disappointed with a point when we’ve dominated the match like that. The referee should have stopped it at half-time. Anyone who has seen that football match knows it shouldn’t have been close. ‘But it’s not about what you deserve, it’s about 

In [7]:
def get_summary_no_prompt(content, model, max_new_tokens=500, temperature=0.7, top_p=0.95, top_k=50):
    """Sample one reply from ``model`` for ``content`` sent verbatim as the user message.

    ``content`` is wrapped in a single user turn, rendered with the tokenizer's
    chat template (generation prompt appended), and a continuation is sampled.
    Uses the notebook-level ``tokenizer`` and ``device``.

    Args:
        content: Text placed in the user message without any added instruction.
        model: Model whose ``generate`` method is called.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.
        top_k: Number of highest-probability tokens kept for sampling.

    Returns:
        A ``(text, n_new_tokens)`` tuple: the stripped, decoded reply and the
        number of tokens generated after the prompt.
    """
    message = [{"role": "user", "content": content}]
    prompt = tokenizer.apply_chat_template(message, tokenize=False, add_generation_prompt=True)

    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
    # One unpadded sequence, so every position is a real token.
    attention_mask = torch.ones_like(input_ids)

    output_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode only the tokens after the prompt. Splitting the full decoded text on a
    # literal "<|assistant|>" breaks when the article itself contains that tag, and
    # returns prompt + reply when the chat template uses a different tag.
    prompt_length = input_ids.shape[-1]
    generated_ids = output_ids[0, prompt_length:]
    output_text = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

    return output_text, generated_ids.shape[-1]

In [10]:
def get_summary(content, model, max_new_tokens=500, temperature=1.3, top_p=0.95, top_k=50):
    """Sample one reply from ``model`` for ``content`` prefixed with a "TL;DR:" cue.

    The user message is ``"TL;DR: \\n\\n{content}\\n"``. It is rendered with the
    tokenizer's chat template (generation prompt appended) and a continuation is
    sampled. Uses the notebook-level ``tokenizer`` and ``device``.

    Args:
        content: Article text to summarise.
        model: Model whose ``generate`` method is called.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.
        top_k: Number of highest-probability tokens kept for sampling.

    Returns:
        A ``(text, n_new_tokens)`` tuple: the stripped, decoded reply and the
        number of tokens generated after the prompt.
    """
    message = [
        {
            "role": "user",
            # Alternative instruction kept for reference:
            # "content": f"Summarize the following text in a TL;DR style in one sentence\n\n{content}\n",
            "content": f"TL;DR: \n\n{content}\n"
        }
    ]
    prompt = tokenizer.apply_chat_template(message, tokenize=False, add_generation_prompt=True)

    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
    # One unpadded sequence, so every position is a real token.
    attention_mask = torch.ones_like(input_ids)

    output_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode only the tokens after the prompt. Splitting the full decoded text on a
    # literal "<|assistant|>" breaks when the article itself contains that tag, and
    # returns prompt + reply when the chat template uses a different tag.
    prompt_length = input_ids.shape[-1]
    generated_ids = output_ids[0, prompt_length:]
    output_text = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

    return output_text, generated_ids.shape[-1]

In [None]:
# Average tokenized length of the 'article' field over the first 1000 rows, or
# over the whole dataset when it has fewer rows than that.
# NOTE(review): the count is whatever tokenizer.encode returns with its default
# arguments, so it presumably includes any special tokens it adds — confirm.
sample_size = min(1000, len(dataset))
if sample_size == 0:
    print("Average length of articles: n/a (dataset is empty)")
else:
    total_length = sum(
        len(tokenizer.encode(dataset[i]['article']))
        for i in range(sample_size)
    )
    print(f"Average length of articles: {total_length / sample_size}")

In [11]:
# For two test articles, print the article and then five independently sampled
# summaries, each followed by its generated-token count.
for k in [2, 3]:
    source_article = dataset[k]['article']  # fetch once per article
    print("prompt", k)
    print(source_article)
    for i in range(1, 6):
        print(f"summary {i}")
        # The count gets a real name: `_` is IPython's last-output variable and
        # assigning to it shadows that feature for the rest of the session.
        summary, n_new_tokens = get_summary(source_article, model)
        print(summary, "\n", n_new_tokens)

prompt 2
Dougie Freedman is on the verge of agreeing a new two-year deal to remain at Nottingham Forest. Freedman has stabilised Forest since he replaced cult hero Stuart Pearce and the club's owners are pleased with the job he has done at the City Ground. Dougie Freedman is set to sign a new deal at Nottingham Forest . Freedman has impressed at the City Ground since replacing Stuart Pearce in February . They made an audacious attempt on the play-off places when Freedman replaced Pearce but have tailed off in recent weeks. That has not prevented Forest's ownership making moves to secure Freedman on a contract for the next two seasons.
summary 1
Fweek-long freebone has been at Nottingham Forest since December .
That is because of the job he has done with the club as well as the owners .
His future should be in the hands of the new two-year deal . 
 55
summary 2
Freedman is set to sign new deal at Nottingham Forest .
The former Man United man has been at the East East London club since 2