# Hf_Emptions Predictor

In [2]:
!pip install "sagemaker>=2.48.0" "transformers==4.6.1" "datasets[s3]==1.6.2" --upgrade

Collecting datasets[s3]==1.6.2
  Using cached datasets-1.6.2-py3-none-any.whl (221 kB)
Collecting botocore==1.19.52
  Using cached botocore-1.19.52-py2.py3-none-any.whl (7.2 MB)
Collecting boto3==1.16.43
  Using cached boto3-1.16.43-py2.py3-none-any.whl (130 kB)
Collecting s3transfer<0.4.0,>=0.3.0
  Using cached s3transfer-0.3.7-py2.py3-none-any.whl (73 kB)
Collecting sagemaker>=2.48.0
  Using cached sagemaker-2.77.1-py2.py3-none-any.whl
  Using cached sagemaker-2.77.0.tar.gz (513 kB)
  Preparing metadata (setup.py) ... [?25ldone
  Using cached sagemaker-2.76.0.tar.gz (512 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25h  Using cached sagemaker-2.75.1.tar.gz (511 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25h  Using cached sagemaker-2.75.0.tar.gz (511 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25h  Using cached sagemaker-2.74.0.tar.gz (481 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25h  Using cached sagemaker-2.73.0.tar.gz (481 kB)
  Preparing

In [3]:
import sagemaker.huggingface

In [4]:
import sagemaker

sess = sagemaker.Session()
# sagemaker session bucket -> used for uploading data, models and logs
# sagemaker will automatically create this bucket if it not exists
sagemaker_session_bucket=None
if sagemaker_session_bucket is None and sess is not None:
    # set to default bucket if a bucket name is not given
    sagemaker_session_bucket = sess.default_bucket()

role = sagemaker.get_execution_role()
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

sagemaker role arn: arn:aws:iam::647333706880:role/service-role/AmazonSageMaker-ExecutionRole-20210125T093214
sagemaker bucket: sagemaker-us-east-1-647333706880
sagemaker session region: us-east-1


## get model

In [5]:
from sagemaker.huggingface import HuggingFaceModel
import sagemaker 


In [6]:
# model location comes from end of `Hf_EmotionsEstimator` notebook
# but we must add a second '/' to 's3:/''
model_loc = 's3://sagemaker-us-east-1-647333706880/huggingface-pytorch-training-2022-02-28-18-47-41-609/output/model.tar.gz'

In [7]:
# create Hugging Face Model Class

huggingface_model = HuggingFaceModel(
   model_data=model_loc,
   role=role, # iam role with permissions to create an Endpoint
   transformers_version="4.6", # transformers version used
   pytorch_version="1.7", # pytorch version used
   py_version="py36", # python version of the DLC
)

In [None]:
# deploy model to SageMaker Inference
predictor = huggingface_model.deploy(
   initial_instance_count=1,
   instance_type="ml.m5.xlarge"
)

In [None]:
# quick predictor test
sentiment_input= {"inputs":"I love using the new Inference DLC."}

predictor.predict(sentiment_input)

### delete endpoint

In [None]:
# delete when finished
predictor.delete_endpoint()

## get test,train, validation data
Paths created and data stored in `Hf_EmotionEstimator`

In [8]:
from datasets import load_from_disk
import botocore
from datasets.filesystems import S3FileSystem

s3 = S3FileSystem()  

In [9]:
[training_input_path, test_input_path,validation_input_path] = ['s3://sagemaker-us-east-1-647333706880/samples/datasets/emotion/train',
 's3://sagemaker-us-east-1-647333706880/samples/datasets/emotion/test',
 's3://sagemaker-us-east-1-647333706880/samples/datasets/emotion/validation']

In [10]:
train_dataset = load_from_disk(training_input_path,fs=s3)
test_dataset = load_from_disk(test_input_path,fs=s3)
validation_dataset = load_from_disk(validation_input_path,fs=s3)


## Hugging face batch transformer
* [youtube](https://bit.ly/3K2YmIi)
* [notebook](https://bit.ly/3pvF4n1)

In [11]:
import csv
import json
import sagemaker
from sagemaker.s3 import S3Uploader,s3_path_join


In [12]:
import pandas as pd
import io

In [13]:
import boto3,os
s3_prefix = 'samples/datasets/emotion'
s3_transform_prefix = s3_prefix + "/transforms"
[sess.default_bucket(),s3_prefix,s3_transform_prefix]

['sagemaker-us-east-1-647333706880',
 'samples/datasets/emotion',
 'samples/datasets/emotion/transforms']

In [14]:
sagemaker_session_bucket = sess.default_bucket()

In [15]:
# datset files
dataset_csv_file="validation.csv"
dataset_jsonl_file="validation.jsonl"


In [22]:
validation_dataset.set_format(type="pandas")
x = validation_dataset["text"].to_csv(index=False,header=False)
x="inputs\n" + x
buffer = io.StringIO(x)

xdf = pd.read_csv(filepath_or_buffer = buffer, header = 0)
xdf.to_csv(dataset_csv_file,index=False,header=1)

In [23]:
with open(dataset_csv_file, "r+") as infile, open(dataset_jsonl_file, "w+") as outfile:
    reader = csv.DictReader(infile)
    for row in reader:
        # remove @
        #row["inputs"] = row["inputs"].replace("@","")
        json.dump(row, outfile)
        outfile.write('\n')


In [24]:
# uploads a given file to S3.
input_s3_path = s3_path_join("s3://",sess.default_bucket(), s3_transform_prefix)
output_s3_path = s3_path_join("s3://",sess.default_bucket(), s3_transform_prefix)
s3_file_uri = S3Uploader.upload(dataset_jsonl_file,input_s3_path)


In [25]:
print(f"{dataset_jsonl_file} uploaded to {s3_file_uri}")

validation.jsonl uploaded to s3://sagemaker-us-east-1-647333706880/samples/datasets/emotion/transforms/validation.jsonl


In [26]:
# create Transformer to run our batch job
# ss this run
batch_job = huggingface_model.transformer(
    instance_count=1,
    instance_type='ml.p3.2xlarge',
    output_path=output_s3_path, # we are using the same s3 path to save the output with the input
    strategy='SingleRecord')

In [27]:
# starts batch transform job and uses s3 data as input
batch_job.transform(
    data=s3_file_uri,
    content_type='application/json',    
    split_type='Line')

...........................................[32m2022-03-02T19:43:56.557:[sagemaker logs]: MaxConcurrentTransforms=1, MaxPayloadInMB=6, BatchStrategy=SINGLE_RECORD[0m
[34m2022-03-02 19:43:55,429 [INFO ] main com.amazonaws.ml.mms.ModelServer - [0m
[34mMMS Home: /opt/conda/lib/python3.6/site-packages[0m
[34mCurrent directory: /[0m
[34mTemp directory: /home/model-server/tmp[0m
[34mNumber of GPUs: 1[0m
[34mNumber of CPUs: 8[0m
[34mMax heap size: 12949 M[0m
[35m2022-03-02 19:43:55,429 [INFO ] main com.amazonaws.ml.mms.ModelServer - [0m
[35mMMS Home: /opt/conda/lib/python3.6/site-packages[0m
[35mCurrent directory: /[0m
[35mTemp directory: /home/model-server/tmp[0m
[35mNumber of GPUs: 1[0m
[35mNumber of CPUs: 8[0m
[35mMax heap size: 12949 M[0m
[34mPython executable: /opt/conda/bin/python3.6[0m
[34mConfig file: /etc/sagemaker-mms.properties[0m
[34mInference address: http://0.0.0.0:8080[0m
[34mManagement address: http://0.0.0.0:8080[0m
[34mModel Store: /.sag

In [28]:
import json
from sagemaker.s3 import S3Downloader
from ast import literal_eval
# creating s3 uri for result file -> input file + .out
output_file = f"{dataset_jsonl_file}.out"
output_path = s3_path_join(output_s3_path,output_file)

# download file
S3Downloader.download(output_path,'.')

batch_transform_result = []
with open(output_file) as f:
    for line in f:
        # converts jsonline array to normal array
        line = "[" + line.replace("[","").replace("]",",") + "]"
        batch_transform_result = literal_eval(line) 
        
# print results 
print(batch_transform_result[:50])

[{'label': 'LABEL_1', 'score': 0.9994065761566162}, {'label': 'LABEL_1', 'score': 0.9992707371711731}, {'label': 'LABEL_2', 'score': 0.5559582114219666}, {'label': 'LABEL_2', 'score': 0.9951319098472595}, {'label': 'LABEL_2', 'score': 0.9961485266685486}, {'label': 'LABEL_5', 'score': 0.6463429927825928}, {'label': 'LABEL_4', 'score': 0.9978957176208496}, {'label': 'LABEL_1', 'score': 0.9989014863967896}, {'label': 'LABEL_1', 'score': 0.9991486668586731}, {'label': 'LABEL_4', 'score': 0.9977312684059143}, {'label': 'LABEL_3', 'score': 0.9983696937561035}, {'label': 'LABEL_1', 'score': 0.9993244409561157}, {'label': 'LABEL_0', 'score': 0.9992727637290955}, {'label': 'LABEL_4', 'score': 0.788913905620575}, {'label': 'LABEL_1', 'score': 0.9991980195045471}, {'label': 'LABEL_3', 'score': 0.998214066028595}, {'label': 'LABEL_5', 'score': 0.9915120005607605}, {'label': 'LABEL_4', 'score': 0.4989328384399414}, {'label': 'LABEL_0', 'score': 0.9991216063499451}, {'label': 'LABEL_1', 'score': 0.

## End Hugging face batch transformer


## look at transform results

In [87]:
from sklearn import metrics

In [77]:
def val_label_int2str(row):
    return validation_dataset.features["labels"].int2str(row)


In [78]:
validation_dataset.set_format(type="pandas")
val_df = validation_dataset[:]

# get lable class name that belongs to number
val_df["label_name"] = val_df["labels"].apply(val_label_int2str)

In [79]:
bt_df = pd.DataFrame(batch_transform_result)

# convert label_ pred to number
bt_df['pred_label_num']=bt_df["label"].apply(lambda x: int(x[-1]))

# get lable class name that belongs to number
bt_df['pred_label_label'] = bt_df['pred_label_num'].apply(val_label_int2str)

In [80]:
#merge the correct labels with the predicted
bt_df['val_label_num'] = val_df.labels
bt_df['val_label_label']= val_df.label_name

In [83]:
confusion_matrix = pd.crosstab(
    bt_df['val_label_label'], 
    bt_df['pred_label_label'], rownames=['Actual'], colnames=['Predicted'])
print (confusion_matrix)

Predicted  anger  fear  joy  love  sadness  surprise
Actual                                              
anger        254     8    2     0       11         0
fear           5   208    0     0        5         6
joy            4     1  653    35        0         2
love           1     0   16   142        0         0
sadness       11     5    2     0      563         0
surprise       0    15    5     0        1        45


In [104]:
print(f"accuracy: {metrics.accuracy_score(bt_df['val_label_label'], bt_df['pred_label_label'])}")


print(f"precision: {metrics.precision_score(bt_df['val_label_label'], bt_df['pred_label_label'],average='weighted')}")

print(f"recall: {metrics.recall_score(bt_df['val_label_label'], bt_df['pred_label_label'],average='weighted')}")

print(f"f1 score: {metrics.f1_score(bt_df['val_label_label'], bt_df['pred_label_label'],average='weighted')}")

accuracy: 0.9325
precision: 0.9337658105899442
recall: 0.9325
f1 score: 0.9325067277066255


## end look at transform results

## start test

In [85]:
!pip install pandas_ml


Collecting pandas_ml
  Downloading pandas_ml-0.6.1-py3-none-any.whl (100 kB)
     |████████████████████████████████| 100 kB 1.4 MB/s            
[?25hCollecting enum34
  Downloading enum34-1.1.10-py3-none-any.whl (11 kB)
Installing collected packages: enum34, pandas-ml
Successfully installed enum34-1.1.10 pandas-ml-0.6.1


In [86]:
#old does not work
from pandas_ml import ConfusionMatrix

Confusion_Matrix = ConfusionMatrix(
    bt_df['val_label_label'], 
    bt_df['pred_label_label'])
Confusion_Matrix.print_stats()


AttributeError: module 'sklearn.metrics' has no attribute 'jaccard_similarity_score'

In [82]:
validation_dataset.features

{'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'labels': ClassLabel(num_classes=6, names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'], names_file=None, id=None),
 'text': Value(dtype='string', id=None)}

In [81]:
bt_df

Unnamed: 0,label,score,pred_label_num,pred_label_label,val_label_num,val_label_label
0,LABEL_1,0.999407,1,joy,1,joy
1,LABEL_1,0.999271,1,joy,1,joy
2,LABEL_2,0.555958,2,love,1,joy
3,LABEL_2,0.995132,2,love,2,love
4,LABEL_2,0.996149,2,love,2,love
...,...,...,...,...,...,...
1995,LABEL_3,0.998339,3,anger,3,anger
1996,LABEL_4,0.997159,4,fear,4,fear
1997,LABEL_3,0.998501,3,anger,3,anger
1998,LABEL_0,0.998711,0,sadness,0,sadness


In [256]:
def val_label_int2str(row):
    return validation_dataset.features["labels"].int2str(row)


In [72]:
bt_df.head()

Unnamed: 0,label,score,pred_label_num,pred_label_label
0,LABEL_1,0.999407,1,joy
1,LABEL_1,0.999271,1,joy
2,LABEL_2,0.555958,2,love
3,LABEL_2,0.995132,2,love
4,LABEL_2,0.996149,2,love


In [39]:
bt_df['pred_label_num']=bt_df["label"].apply(pred_int)

In [66]:
bt_df

Unnamed: 0,label,score,pred_label_num
0,LABEL_1,0.999407,1
1,LABEL_1,0.999271,1
2,LABEL_2,0.555958,2
3,LABEL_2,0.995132,2
4,LABEL_2,0.996149,2
...,...,...,...
1995,LABEL_3,0.998339,3
1996,LABEL_4,0.997159,4
1997,LABEL_3,0.998501,3
1998,LABEL_0,0.998711,0


In [48]:
bt_df["label"]

0       LABEL_1
1       LABEL_1
2       LABEL_2
3       LABEL_2
4       LABEL_2
         ...   
1995    LABEL_3
1996    LABEL_4
1997    LABEL_3
1998    LABEL_0
1999    LABEL_4
Name: label, Length: 2000, dtype: object

In [45]:
int(bt_df["label"][-1])

KeyError: -1

In [33]:
val_df.head()

Unnamed: 0,attention_mask,input_ids,labels,text
0,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 1045, 2123, 2102, 2113, 2021, 1045, 2514...",1,i dont know but i feel virtuous so i accept th...
1,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 1045, 5791, 2514, 2045, 1055, 2070, 6179...",1,i definitely feel there s some useful informat...
2,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 1045, 2514, 2004, 2295, 1045, 2031, 6414...",1,i feel as though i have merely accepted what h...
3,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 1045, 2514, 2008, 2002, 2003, 16448, 203...",2,i feel that he is gazing me and giving a naugh...
4,"[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[101, 1045, 2074, 2514, 8616, 102, 0, 0, 0, 0,...",2,i just feel tender


## end test

## does not work as is: 
start aws: create an awstransformer to run batches of input throught the model
### use a dataset (test or validation) depending on fit to test the model

see 
* [here](https://go.aws/3tieUp7)
* [Increasing the timeout for InvokeEndpoint · Issue #1119 · aws/sagemaker-python-sdk](https://bit.ly/3vA6jAK)

Status:
csv file doesnt seem formatted correctly. 
error is: 
* `2022-03-01 23:15:39,271 [INFO ] W-model-2-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle -     inputs = data.pop("inputs", data)`
* `2022-03-01T23:15:39.284:[sagemaker logs]: sagemaker-us-east-1-647333706880/samples/datasets/emotion/transforms/validation.csv:   "message": "\u0027numpy.ndarray\u0027 object has no attribute \u0027pop\u0027"`

In [None]:
x = validation_dataset["text"].to_csv('validation.csv',index=False,header=False)

In [None]:
import boto3,os
s3_prefix = 'samples/datasets/emotion'
s3_transform_prefix = s3_prefix + "/transforms"
[sess.default_bucket(),s3_prefix,s3_transform_prefix]

In [None]:
# Load file to s3 for transformer
boto3.Session().resource('s3').Bucket(sess.default_bucket()).Object(
    os.path.join(s3_transform_prefix, 'validation.csv')).upload_file('validation.csv')

In [None]:
# The location of the test dataset
batch_input = 's3://{}/{}'.format(sess.default_bucket(), s3_transform_prefix)

# The location to store the results of the batch transform job
batch_output = 's3://{}/{}/batch-prediction'.format(sess.default_bucket(), s3_transform_prefix)
[batch_input,batch_output]

In [None]:
transformer = huggingface_model.transformer(
    instance_count=1, 
    instance_type='ml.m4.xlarge', 
    output_path=batch_output
)

In [None]:
transformer.transform(
    data=batch_input, 
    data_type='S3Prefix',
    content_type='text/csv', 
    split_type='Line'
)
transformer.wait()

## end aws: create an awstransformer to run batches of input throught the model

## testing

In [None]:
# create Transformer to run our batch job
#ss try multirecord --not yet
batch_job = huggingface_model.transformer(
    instance_count=1,
    instance_type='ml.p3.2xlarge',
    output_path=output_s3_path, # we are using the same s3 path to save the output with the input
    strategy='MultiRecord')

In [None]:
x = validation_dataset["text"].to_csv(index=False,header=False)
#x="inputs\n" + x
buffer = io.StringIO(x)

xdf = pd.read_csv(filepath_or_buffer = buffer, header = 0)
xdf.to_csv("test.csv",index=False,header=1)

In [None]:
with open("test.csv", "r+") as infile, open("test.jsonl", "w+") as outfile:
    reader = csv.reader(infile)
    ro = '{"inputs": ['
    for row in reader:
        # remove @
        #row["inputs"] = row["inputs"].replace("@","")
        ro =ro +'"'+row[0]+'"'+"\n"
    
    ro = ro + "]}"
with  open("test.jsonl", "w+") as outfile:
    json.dump(ro, outfile)
    outfile.write('\n')


In [None]:
# we used train and test to fit so use validation
def dataset_label_int2str(row):
    return validation_dataset.features["labels"].int2str(row)


In [None]:
validation_dataset.set_format(type="pandas")
df = validation_dataset[:]


df["label_name"] = df["labels"].apply(dataset_label_int2str)
df.head()

In [None]:
input_list = validation_dataset["text"].to_list()

In [None]:
# too much input results in error: https://bit.ly/3hrdWBu
input_list = validation_dataset["text"].to_list()
#inputs= {"inputs":"this is bullshit"}
inputs= {"inputs":input_list[:1500]}
preds =predictor.predict(inputs)
preds