In [37]:
pip install "sagemaker>=2.140.0" "transformers==4.26.1" "datasets[s3]==2.10.1" --upgrade

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Note: you may need to restart the kernel to use updated packages.


### SageMaker config

In [38]:
import sagemaker

# Bootstrap session, used here to look up the default bucket.
sess = sagemaker.Session()

# Leave as None to fall back to the session's default bucket;
# put a bucket name here to override it.
sagemaker_session_bucket = None
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# Execution role that is later handed to the training job.
role = sagemaker.get_execution_role()

# Final session, bound to the bucket selected above.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


### Data loading

In [39]:
import boto3
import pandas as pd
from datasets import Dataset


def score_sentiment_multiclass(x):
    """
    Map a sentiment label to its integer class code.

    "positive" -> 0, "negative" -> 1, "neutral" -> 2.
    Any other value falls through to 0.
    """
    label_codes = (("positive", 0), ("negative", 1), ("neutral", 2))
    for label, code in label_codes:
        if x == label:
            return code

    # Unrecognised input gets the same code as "positive".
    return 0

# S3 location of the labelled sentiment CSV.
bucket = 'my-nlp-datalake'
data_key = 'pos_labeled_data.csv'
data_location = f's3://{bucket}/{data_key}'
df_pred = pd.read_csv(data_location)

# Encode the text sentiment labels as integer class codes.
df_pred["target"] = df_pred["sentiment"].apply(score_sentiment_multiclass)

# Upper bound on the number of rows taken from the top of the frame.
n_samples = 10000

# Fixed seed so the train/test split is identical on every run.
split_seed = 42

# Build a Dataset from the selected DataFrame columns and split it
# into "train" and "test" with the library's default proportions.
dataset = Dataset.from_pandas(df_pred[["text", "target", "sentiment", "id"]].iloc[:n_samples])
ds = dataset.train_test_split(seed=split_seed)

In [62]:
from transformers import AutoTokenizer

# Pretrained checkpoint; the same name is passed on to the training job.
model_checkpoint = "distilbert-base-uncased"

# Tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize(batch):
    """
    Tokenize the "text" column of a batch of examples.

    Intended for `Dataset.map(..., batched=True)`: `batch` maps column
    names to lists of values. The dataset is built from the columns
    "text", "target", "sentiment" and "id", so the raw messages live
    under "text".

    Returns the tokenizer output for the batch, padded to the model's
    maximum length and truncated where necessary.
    """
    return tokenizer(batch["text"], padding="max_length", truncation=True)

# Tokenize both splits in batches, drop the raw columns that are no longer
# needed, and expose the integer target under the name "labels".
tokenized_dataset = (
    ds.map(tokenize, batched=True)
    .remove_columns(["text", "sentiment", "id"])
    .rename_column("target", "labels")
)

# Return torch-formatted values for the listed columns only.
tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# Pull out the two splits.
train_dataset, test_dataset = tokenized_dataset["train"], tokenized_dataset["test"]

Map:   0%|          | 0/7500 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

In [63]:
# Inspect the training split (feature names and row count).
train_dataset

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 7500
})

In [64]:
# Check the output format currently set on the test split.
test_dataset.format

{'type': 'torch',
 'format_kwargs': {},
 'columns': ['input_ids', 'attention_mask', 'labels'],
 'output_all_columns': False}

In [65]:
# Show the default bucket of the SageMaker session.
sess.default_bucket()

'sagemaker-eu-central-1-464909088200'

In [66]:
# Both splits are stored under one common prefix in the session's default bucket.
s3_prefix = "datasets/telegram_sentiment"
dataset_root = f's3://{sess.default_bucket()}/{s3_prefix}'

training_input_path = f'{dataset_root}/train'
test_input_path = f'{dataset_root}/test'

# Write the training split first, then the test split.
train_dataset.save_to_disk(training_input_path)
test_dataset.save_to_disk(test_input_path)

Saving the dataset (0/1 shards):   0%|          | 0/7500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2500 [00:00<?, ? examples/s]

In [71]:
# training job

from sagemaker.huggingface import HuggingFace

# Hyperparameters handed to the training job.
hyperparameters = dict(
    epochs=1,                       # number of training epochs
    train_batch_size=32,            # training batch size
    model_name=model_checkpoint,    # name of pretrained model
)

huggingface_estimator = HuggingFace(
    # what to run
    entry_point="train.py",             # fine-tuning script
    source_dir="./scripts",             # directory that contains the script
    hyperparameters=hyperparameters,
    # where to run it
    instance_type="ml.p3.2xlarge",
    instance_count=1,
    role=role,                          # IAM role used by the training job
    # framework versions
    transformers_version="4.26",
    pytorch_version="1.13",
    py_version="py39",
)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [72]:
# Start the training job with the S3 locations of the two saved splits.
input_channels = {"train": training_input_path, "test": test_input_path}
huggingface_estimator.fit(inputs=input_channels)

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: huggingface-pytorch-training-2023-09-11-16-52-44-390


Using provided s3_resource
2023-09-11 16:52:44 Starting - Starting the training job...
2023-09-11 16:53:11 Starting - Preparing the instances for training......
2023-09-11 16:54:09 Downloading - Downloading input data...
2023-09-11 16:54:29 Training - Downloading the training image........................
2023-09-11 16:58:20 Training - Training image download completed. Training in progress.[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2023-09-11 16:58:38,125 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2023-09-11 16:58:38,147 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2023-09-11 16:58:38,160 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2023-09-11 16:58:38,163 sagemaker_pytorch_container.training INFO     Invoking user training script.[0