In [None]:
import boto3 
import json 

from contextlib import contextmanager 
from io import BytesIO 
from tempfile import NamedTemporaryFile 
from transformers import PretrainedConfig, AutoModelForSequenceClassification, AutoTokenizer
  
@contextmanager
def s3_fileobj(bucket, key):
    """
    Yields an in-memory file object holding the contents of the S3 object at {bucket}/{key}.

    The whole object is read into memory before it is yielded, so the yielded
    buffer does not depend on the S3 response stream staying open.

    Args:
        bucket (str): Name of the S3 bucket where your model is stored
        key (str): Relative path from the base of your bucket, including the filename and extension of the object to be retrieved.

    Yields:
        io.BytesIO: Buffer positioned at the start of the object's bytes.
    """
    s3 = boto3.client("s3")
    body = s3.get_object(Bucket=bucket, Key=key)["Body"]
    try:
        payload = body.read()
    finally:
        # Release the underlying response stream even if the read fails;
        # the bytes we need are already copied into `payload`.
        body.close()
    yield BytesIO(payload)
 
def load_model(bucket, path_to_model, model_name='pytorch_model'):
    """
    Load a sequence-classification model stored under the given S3 path.

    It is assumed that your model weights are stored at the key:

        '{path_to_model}/{model_name}.bin'

    and that a config has also been generated at the same path named:

        '{path_to_model}/config.json'

    Trailing slashes on ``path_to_model`` are ignored, so 'run/output' and
    'run/output/' resolve to the same keys.

    Args:
        bucket (str): Name of the S3 bucket holding the model artifacts.
        path_to_model (str): Key prefix (folder) containing the weights and config.
        model_name (str): Base name of the weights file, without the '.bin' extension.

    Returns:
        The model object returned by
        ``AutoModelForSequenceClassification.from_pretrained``.
    """
    # Imported here so this function carries its own dependency; the name is
    # exported by the same `transformers` package the notebook already uses.
    from transformers import CONFIG_MAPPING

    # Avoid keys like 'output//config.json' when the caller passes 'output/'.
    prefix = path_to_model.rstrip('/')

    with s3_fileobj(bucket, f'{prefix}/config.json') as f:
        dict_data = json.load(f)

    # The Auto* factory selects the architecture from the concrete config
    # class, so build the model-specific config when `model_type` is known and
    # only fall back to the generic base class otherwise.
    model_type = dict_data.get('model_type')
    if model_type in CONFIG_MAPPING:
        config = CONFIG_MAPPING[model_type].from_dict(dict_data)
    else:
        config = PretrainedConfig.from_dict(dict_data)

    # The temp file is removed when the `with` block exits, so the model must
    # be fully loaded inside it.
    # NOTE(review): reopening an open NamedTemporaryFile by name is not
    # supported on Windows — fine on Linux hosts, confirm if that changes.
    with NamedTemporaryFile() as weights_file:
        with s3_fileobj(bucket, f'{prefix}/{model_name}.bin') as f:
            weights_file.write(f.read())
        # from_pretrained opens the file by name; flush so the buffered tail
        # of the checkpoint is actually on disk before it is read back.
        weights_file.flush()
        model = AutoModelForSequenceClassification.from_pretrained(weights_file.name, config=config)
    return model
     
# NOTE(review): `sagemaker_session_bucket` is assigned in the SageMaker setup
# cell further down, so that cell must have been run before this one.
# The prefix is given without a trailing slash: load_model joins it to the file
# names with '/', and 'output//pytorch_model.bin' would be a different S3 key.
model = load_model(sagemaker_session_bucket, 'huggingface-pytorch-training-2023-01-13-17-26-31-869/output')

In [None]:
import sagemaker
import json

# Initial SageMaker session; it is used below to look up the default bucket.
session = sagemaker.Session()

# ARN of the execution role; its short name is the last "/"-separated segment.
role = sagemaker.get_execution_role()
role_name = role.rsplit('/', 1)[-1]

# Bucket used by the following cells. To pin a specific bucket instead of the
# session default, assign its name here (e.g. "walkthrough-bucket-hf-aws").
sagemaker_session_bucket = session.default_bucket()

In [None]:
# Rebuild the session so it is explicitly bound to the chosen bucket.
session = sagemaker.Session(default_bucket=sagemaker_session_bucket)

# Echo the effective role, bucket and region so a wrong setup is visible early.
print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {session.default_bucket()}")
print(f"sagemaker session region: {session.boto_region_name}")

In [None]:
# Identifier passed to AutoTokenizer.from_pretrained in the next cell.
tokenizer_name = "distilbert-base-uncased"

# presumably the name of the dataset to load — TODO confirm against the cell that uses it
dataset_name = "imdb"

# presumably the S3 key prefix for the processed dataset — TODO confirm against the cell that uses it
s3_prefix = "samples/datasets/imdb"

In [None]:
# Load the tokenizer named by `tokenizer_name`; AutoTokenizer is imported from
# `transformers` in the model-loading cell at the top of the notebook.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)