#### Prerequisites

In [2]:
%%capture 

!pip install transformers==4.18.0
!pip install datasets==2.4.0

### Imports 

In [3]:
from transformers import GPT2LMHeadModel
from transformers import GPT2Tokenizer
from transformers import pipeline
from datasets import load_metric 
import transformers
import pandas as pd
import datasets
import logging
import tarfile
import os

In [4]:
# Show up to 100 characters per cell when a DataFrame is rendered.
pd.set_option('display.max_colwidth', 100)

##### Setup logging

In [5]:
# Use the named 'sagemaker' logger and let everything from DEBUG upwards through.
logger = logging.getLogger('sagemaker')
logger.setLevel(logging.DEBUG)
# logging.getLogger returns the same logger object on every call, so re-running
# this cell must not attach a second StreamHandler (each message would then be
# printed twice).
if not any(type(handler) is logging.StreamHandler for handler in logger.handlers):
    logger.addHandler(logging.StreamHandler())

##### Log versions of dependencies 

In [6]:
# Record the library versions this run actually imported, transformers first.
transformers_message = f'[Using transformers: {transformers.__version__}]'
datasets_message = f'[Using datasets: {datasets.__version__}]'
logger.info(transformers_message)
logger.info(datasets_message)

[Using transformers: 4.18.0]
[Using datasets: 2.4.0]


### Copy candidate models from S3 to local for evaluation 

In [7]:
# Recursively copy everything under the S3 prefix model/custom/ into ./models/pretrained-from-scratch/.
!aws s3 cp s3://sagemaker-us-east-1-119174016168/model/custom/ ./models/pretrained-from-scratch/ --recursive

download: s3://sagemaker-us-east-1-119174016168/model/custom/config.json to models/pretrained-from-scratch/config.json
download: s3://sagemaker-us-east-1-119174016168/model/custom/special_tokens_map.json to models/pretrained-from-scratch/special_tokens_map.json
download: s3://sagemaker-us-east-1-119174016168/model/custom/training_args.bin to models/pretrained-from-scratch/training_args.bin
download: s3://sagemaker-us-east-1-119174016168/model/custom/tokenizer_config.json to models/pretrained-from-scratch/tokenizer_config.json
download: s3://sagemaker-us-east-1-119174016168/model/custom/merges.txt to models/pretrained-from-scratch/merges.txt
download: s3://sagemaker-us-east-1-119174016168/model/custom/vocab.json to models/pretrained-from-scratch/vocab.json
download: s3://sagemaker-us-east-1-119174016168/model/custom/tokenizer.json to models/pretrained-from-scratch/tokenizer.json
download: s3://sagemaker-us-east-1-119174016168/model/custom/pytorch_model.bin to models/pretrained-from-scra

In [8]:
# Recursively copy everything under the S3 prefix model/finetuned/ into ./models/oob-finetuned/.
!aws s3 cp s3://sagemaker-us-east-1-119174016168/model/finetuned/ ./models/oob-finetuned/ --recursive

download: s3://sagemaker-us-east-1-119174016168/model/finetuned/special_tokens_map.json to models/oob-finetuned/special_tokens_map.json
download: s3://sagemaker-us-east-1-119174016168/model/finetuned/training_args.bin to models/oob-finetuned/training_args.bin
download: s3://sagemaker-us-east-1-119174016168/model/finetuned/tokenizer_config.json to models/oob-finetuned/tokenizer_config.json
download: s3://sagemaker-us-east-1-119174016168/model/finetuned/config.json to models/oob-finetuned/config.json
download: s3://sagemaker-us-east-1-119174016168/model/finetuned/merges.txt to models/oob-finetuned/merges.txt
download: s3://sagemaker-us-east-1-119174016168/model/finetuned/tokenizer.json to models/oob-finetuned/tokenizer.json
download: s3://sagemaker-us-east-1-119174016168/model/finetuned/vocab.json to models/oob-finetuned/vocab.json
download: s3://sagemaker-us-east-1-119174016168/model/finetuned/pytorch_model.bin to models/oob-finetuned/pytorch_model.bin


### Load candidate models for evaluation 

##### Load Out-of-the-box (OOB) GPT2

In [9]:
# Text-generation pipeline built from the 'gpt2' model identifier.
oob_gpt2 = pipeline(task='text-generation', model='gpt2')

##### Load OOB GPT2 fine-tuned on covid news articles 

In [10]:
# Text-generation pipeline built from the artifacts in the local ./models/oob-finetuned/ directory.
oob_gpt2_finetuned = pipeline(task='text-generation', model='./models/oob-finetuned/')

##### Load custom GPT2 further pre-trained (trained from scratch) on covid news articles 

In [11]:
# Text-generation pipeline built from the artifacts in the local ./models/pretrained-from-scratch/ directory.
custom_gpt2 = pipeline(task='text-generation', model='./models/pretrained-from-scratch/')

### Evaluate reference articles against the candidate models

In [12]:
# Column names are supplied here rather than taken from the file.
reference_columns = ['reference_article', 'prompt']
ref_df = pd.read_csv('./data/test_articles.csv', names=reference_columns)
ref_df

Unnamed: 0,reference_article,prompt
0,"On Tuesday, Dr. Fauci and other health officials testified before the U.S. House Energy and Comm...","On Tuesday, Dr. Fauci and other health officials"
1,Pfizer Inc. on Wednesday reported results from two late-stage studies ahead of schedule as it pu...,Pfizer Inc. on Wednesday reported results
2,President Donald Trump said the U.S. has the outbreak of the coronavirus under control and has b...,President Donald Trump said the U.S. has the outbreak
3,"President Joe Biden has signed a flurry of executive orders, actions and memorandums aimed at ra...",President Joe Biden has signed a flurry of executive orders
4,"China is effectively in a lockdown. From big cities to little villages, almost every community i...",China is effectively in a lockdown.
5,"Australian biotech Mesoblast has been riding high on expectations for its COVID-19 treatment, li...",Australian biotech Mesoblast has been riding high on expectations for its
6,"The December, 2019 coronavirus disease outbreak has seen many countries ask people who have pote...","The December, 2019 coronavirus disease outbreak has seen many countries ask people who have pote..."
7,The first confirmed case of coronavirus in India was reported today ( Jan. 30) in the southern s...,The first confirmed case of coronavirus in India was


In [13]:
# Decoding settings applied identically to every candidate model.
generation_settings = dict(num_return_sequences=1, max_length=300, repetition_penalty=10.0, top_k=1, top_p=1.0)


def generate_text(generator, prompt_text):
    """Return the 'generated_text' of the first sequence `generator` produces for `prompt_text`."""
    return generator(prompt_text, **generation_settings)[0]['generated_text']


separator = '-' * 200
for ref_article, prompt in ref_df.itertuples(index=False, name=None):
    # Models are queried in the same order in which their responses are printed.
    custom_gpt2_response = generate_text(custom_gpt2, prompt)
    oob_gpt2_finetuned_response = generate_text(oob_gpt2_finetuned, prompt)
    oob_gpt2_response = generate_text(oob_gpt2, prompt)
    # One blank line between sections, then a ruler closing the record.
    print(
        f'Prompt: {prompt}',
        f'Ref article: {ref_article}',
        f'Custom GPT2 Response: {custom_gpt2_response}',
        f'OOB GPT2 Finetuned Response: {oob_gpt2_finetuned_response}',
        f'OOB GPT2 Response: {oob_gpt2_response}',
        sep='\n\n',
    )
    print(separator)

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


[2023-01-26 19:15:42.742 pytorch-1-8-gpu-py3-ml-g4dn-xlarge-60bd0d07a83be181dcf7335baae2:4936 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None
[2023-01-26 19:15:42.775 pytorch-1-8-gpu-py3-ml-g4dn-xlarge-60bd0d07a83be181dcf7335baae2:4936 INFO profiler_config_parser.py:102] Unable to find config at /opt/ml/input/config/profilerconfig.json. Profiler is disabled.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Prompt: On Tuesday, Dr. Fauci and other health officials

Ref article: On Tuesday, Dr. Fauci and other health officials testified before the U.S. House Energy and Commerce Committee to discuss how the administration has been handling the coronavirus outbreak. Yahoo Finance’s Anjalee Khemlani breaks down the latest news about the coronavirus on The Final Round.

Custom GPT2 Response: On Tuesday, Dr. Fauci and other health officials have been urging the government to take steps like banning alcohol sales in supermarkets as a way of preventing covid-19 transmission among staff at public hospitals that are already struggling with staffing shortages due partly or entirely from coronavirus patients being discharged into care homes for older people who need it most ( see chart). but they say this is not enough: “ we must ensure all healthcare workers can safely continue their work without fear endangering themselves by going out on sick leave ”. some doctors believe there should be no restric

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Prompt: Pfizer Inc. on Wednesday reported results

Ref article: Pfizer Inc. on Wednesday reported results from two late-stage studies ahead of schedule as it put off its March 31 investor day amid the coronavirus outbreak. The drugmaker said its experimental treatment, abrocitinib, was effective in treating atopic dermatitis in combination with topical therapies in a late-stage study. In addition, it also reported positive top-line results from another late-stage study testing its pneumococcal conjugate vaccine candidate in adults 18 years of age or older not previously vaccinated against pneumococcal disease, a type of bacterial infection.

Custom GPT2 Response: Pfizer Inc. on Wednesday reported results of the first phase iii study, which is being conducted in collaboration with astrazeneca and oxford university’ s jenner institute ( dsi), showing that a single dose given to patients who had been infected by sars-cov2 was safe for upto six months after their second shot; this time it 

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Prompt: President Donald Trump said the U.S. has the outbreak

Ref article: President Donald Trump said the U.S. has the outbreak of the coronavirus under control and has been briefed by the Centers for Disease Control and Prevention. Speaking to CNBC, Trump said he wasn't worried it would turn into a pandemic and said the only person infected had flown in from China. He repeated his view that the impeachment is a hoax. Trump batted away a question on whether the Fed's balance sheet was the prime reason for the stock-market SPX, +2.64% gains. He said Fed interest rates should still go lower because the dollar DXY, -0.19% is strong.

Custom GPT2 Response: President Donald Trump said the U.S. has the outbreak of coronavirus been contained? `` we have not seen a significant impact on our economy, '' he added in an interview with reuters via telephone from rome's la repubblica newspaper last week when asked about his view that china is likely to be hit by covid-19 first and then later than

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Prompt: President Joe Biden has signed a flurry of executive orders

Ref article: President Joe Biden has signed a flurry of executive orders, actions and memorandums aimed at rapidly addressing the coronavirus pandemic and dismantling many of President Donald Trump's policies. The 30 executive actions Biden has taken in the first days of his administration include halting funding for the construction of Trump's border wall, reversing Trump's travel ban targeting largely Muslim countries, imposing a mask mandate on federal property, ramping up vaccination supplies and requiring international travelers to provide proof of a negative Covid-19 test prior to traveling to the US

Custom GPT2 Response: President Joe Biden has signed a flurry of executive orders to promote the use and availability, including an order that would allow for “ emergency medical assistance ” in cases where there is no available alternative. [ 1 ] this includes providing support through subsidies or other forms suc

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Prompt: China is effectively in a lockdown.

Ref article: China is effectively in a lockdown. From big cities to little villages, almost every community is under quarantine to a varying degree, or at least faces some travel restrictions. There is little information on how long this will last. One thing for sure is that the government is willing to keep the country in lockdown until the virus outbreak comes under control. A government mobilisation on this scale is unprecedented.

Custom GPT2 Response: China is effectively in a lockdown. the only way to get out of this nightmare, and keep everyone safe from covid-19 infection for as long or longer than necessary was by getting vaccinated against it first before going on holiday abroad ( which has been banned since march 2020). but there are other ways you can help: if your loved one gets sick with coronavirus then they should be able access care at home without worrying about being hospitalised; even though their condition may not improv

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Prompt: Australian biotech Mesoblast has been riding high on expectations for its

Ref article: Australian biotech Mesoblast has been riding high on expectations for its COVID-19 treatment, licensed to Novartis, but fell back to Earth after it said a phase 3 trial of the cell therapy was a bust. Shares in the stem cell specialist on the ASX lost more than a third of their value after data experts said the study of remestemcel-L in ventilator-dependent patients with moderate to severe acute respiratory distress syndrome ( ARDS) due to COVID-19 was unlikely to show a benefit. Mesoblast said the trial could have been affected by improvements in the care of COVID-19 patients over the last few months, as doctors gathered experience in treating the disease. That included the use of experimental drugs like dexamethasone and Gilead’ s antiviral Veklury (remdesivir).

Custom GPT2 Response: Australian biotech Mesoblast has been riding high on expectations for its covid-19 vaccine candidate, whic

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Prompt: The December, 2019 coronavirus disease outbreak has seen many countries ask people who have potentially come into contact with the infection to isolate themselves

Ref article: The December, 2019 coronavirus disease outbreak has seen many countries ask people who have potentially come into contact with the infection to isolate themselves at home or in a dedicated quarantine facility. Decisions on how to apply quarantine should be based on the best available evidence. This review of the psychological impact of quarantine using three electronic databases. Of 3166 papers found, 24 are included in this Review. Most reviewed studies reported negative psychological effects including post-traumatic stress symptoms, confusion, and anger. Stressors included longer quarantine duration, infection fears, frustration, boredom, inadequate supplies, inadequate information, financial loss, and stigma.

Custom GPT2 Response: The December, 2019 coronavirus disease outbreak has seen many countrie

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Prompt: The first confirmed case of coronavirus in India was

Ref article: The first confirmed case of coronavirus in India was reported today ( Jan. 30) in the southern state of Kerala. The patient, a female student at Wuhan University in China, tested positive for the novel coronavirus after returning to Kerala. Kerala health minister KK Shailaja has called an emergency meeting at 3pm.

Custom GPT2 Response: The first confirmed case of coronavirus in India was diagnosed with covid-19 on march 11th, 2020. he had been working from home since then and his wife has also tested positive for the virus but is asymptomatic so far no symptoms have shown up at work or school as a result ( see chart).asked by an email to staff last week: “ i am not aware that this happened before my return flight back into australia ”.the company’ s chief executive officer said it would be impossible without him being able access jobkeeper payments via their employer if they were unable do otherwise because the

### Save custom GPT2 pipeline 

In [14]:
# Write the custom GPT2 pipeline to a local directory so it can be archived afterwards.
pipeline_dir = './models/pipeline/'
custom_gpt2.save_pretrained(pipeline_dir)

##### Archive the saved artifacts 

In [15]:
def get_file_paths(directory: str) -> list:
    """Collect the path of every file found below `directory`.

    The tree is traversed with `os.walk`; each returned path is the walked
    folder joined with the file name, in the order the walk yields them.
    Sub-directories themselves are not listed, only the files inside them.
    """
    return [
        os.path.join(folder, name)
        for folder, _subfolders, names in os.walk(directory)
        for name in names
    ]

In [16]:
def tar_artifacts(local_artifacts_path: str, tar_save_path: str, tar_name: str) -> None:
    """Bundle every file under `local_artifacts_path` into a gzip-compressed tarball.

    The archive is written to `tar_save_path/tar_name`. The directory layout is
    flattened: each file is stored under its base name only, so files from
    different sub-directories that share a name become duplicate members.

    Args:
        local_artifacts_path: Directory that is searched recursively for files.
        tar_save_path: Directory the archive is written to; created if missing.
            An empty string means the current working directory.
        tar_name: File name of the archive.

    A file that raises OSError while being added is skipped and reported with a
    warning; any other exception propagates after the archive has been closed.
    """
    if tar_save_path:
        os.makedirs(tar_save_path, exist_ok=True)
    archive_path = os.path.join(tar_save_path, tar_name)
    archive_abspath = os.path.abspath(archive_path)
    # List the inputs before the archive file is created, and drop a stale copy
    # of the archive from an earlier run, so the tarball never includes itself
    # when source and destination are the same directory.
    file_paths = [
        file_path
        for file_path in get_file_paths(local_artifacts_path)
        if os.path.abspath(file_path) != archive_abspath
    ]
    # The context manager closes the archive even when adding a file fails unexpectedly.
    with tarfile.open(archive_path, 'w:gz') as tar:
        for file_path in file_paths:
            try:
                tar.add(file_path, arcname=os.path.basename(file_path))
            except OSError as error:
                # Best effort: keep going, but say which file was left out and why.
                logger.warning(f'Skipping {file_path} during tar creation: {error}')

In [17]:
# The archive is written into the same directory whose files it packages.
tar_artifacts(
    local_artifacts_path='./models/pipeline/',
    tar_save_path='./models/pipeline/',
    tar_name='custom-gpt2-pipeline.tar.gz',
)

##### Copy archive from local to S3 

In [18]:
# Upload the local archive; note that the object is stored under a different name (pipeline.tar.gz).
!aws s3 cp ./models/pipeline/custom-gpt2-pipeline.tar.gz s3://sagemaker-us-east-1-119174016168/model/pipelines/pipeline.tar.gz

upload: models/pipeline/custom-gpt2-pipeline.tar.gz to s3://sagemaker-us-east-1-119174016168/model/pipelines/pipeline.tar.gz
