In [1]:
import os, boto3, sagemaker
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sagemaker.session import s3_input, Session
from sagemaker.inputs import TrainingInput
from sagemaker.amazon.amazon_estimator import get_image_uri 
from sagemaker.image_uris import retrieve
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner
from sagemaker.predictor import csv_serializer

In [2]:
bucket_name = 'cropsuggestionsystem'
my_region = 'us-east-2'

s3 = boto3.resource('s3')
try:
    s3.create_bucket(Bucket=bucket_name, CreateBucketConfiguration={'LocationConstraint': my_region})
    print('Success: {} bucket has been created'.format((bucket_name)))
except Exception as e:
    print('Error: ',e)

prefix = 'xgboost'
output_path ='s3://{}/{}/output'.format(bucket_name, prefix)
print(output_path)

Error:  An error occurred (BucketAlreadyOwnedByYou) when calling the CreateBucket operation: Your previous request to create the named bucket succeeded and you already own it.
s3://cropsuggestionsystem/xgboost/output


In [3]:
!wget https://raw.githubusercontent.com/Gladiator07/Harvestify/master/Data-processed/crop_recommendation.csv

try:
    model_data = pd.read_csv('./crop_recommendation.csv', header= 0, index_col= False)
    model_data['istomato'] = model_data['label'].apply(lambda x: 1 if x == 'lentil' else 0)
    model_data = pd.concat([model_data['istomato'], model_data.drop(['istomato', 'label'], axis=1)], axis=1)
    model_data.reset_index(drop=True, inplace=True)
    model_data = shuffle(model_data)
    print('Success: Dataframe created.')
except Exception as e:
    print('Error: ',e)

train_data, test_data = np.split(model_data.sample(frac=1, random_state=1729), [int(0.8 * len(model_data))])
print(train_data.shape, test_data.shape)

train_data.to_csv('train.csv', index=False, header=False)
boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, 'train/train.csv')).upload_file('train.csv')
s3_input_train = sagemaker.TrainingInput(s3_data='s3://{}/{}/train'.format(bucket_name, prefix), content_type='csv')

test_data.to_csv('test.csv', index=False, header=False)
boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, 'test/test.csv')).upload_file('test.csv')
s3_input_test = sagemaker.TrainingInput(s3_data='s3://{}/{}/test'.format(bucket_name, prefix), content_type='csv')

--2021-09-30 23:31:25--  https://raw.githubusercontent.com/Gladiator07/Harvestify/master/Data-processed/crop_recommendation.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 147833 (144K) [text/plain]
Saving to: ‘crop_recommendation.csv.1’


2021-09-30 23:31:25 (5.81 MB/s) - ‘crop_recommendation.csv.1’ saved [147833/147833]

Success: Dataframe created.
(1760, 8) (440, 8)


In [4]:
sess = sagemaker.Session()
role=sagemaker.get_execution_role()
container = retrieve("xgboost", my_region, "latest")

estimator = sagemaker.estimator.Estimator(container,
                                          role=role,
                                          sagemaker_session=sess,
                                          instance_count=1, 
                                          instance_type='ml.m4.xlarge', 
                                          volume_size=5,
                                          output_path=output_path,
                                          use_spot_instances=True,
                                          max_run=300,
                                          max_wait=600)

estimator.set_hyperparameters(
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.8,
    eval_metric="auc",
    objective="binary:logistic",
    num_round=10,
    rate_drop=0.3,
    tweedie_variance_power=1.4,
)


estimator.fit({'train': s3_input_train,'validation': s3_input_test})

2021-09-30 23:31:26 Starting - Starting the training job...
2021-09-30 23:31:48 Starting - Launching requested ML instancesProfilerReport-1633044686: InProgress
...
2021-09-30 23:32:14 Starting - Preparing the instances for training............
2021-09-30 23:34:22 Downloading - Downloading input data
2021-09-30 23:34:22 Training - Downloading the training image...
2021-09-30 23:34:54 Uploading - Uploading generated training model[34mArguments: train[0m
[34m[2021-09-30:23:34:44:INFO] Running standalone xgboost training.[0m
[34m[2021-09-30:23:34:44:INFO] File size need to be processed in the node: 0.13mb. Available memory size in the node: 8391.17mb[0m
[34m[2021-09-30:23:34:44:INFO] Determined delimiter of CSV input is ','[0m
[34m[23:34:44] S3DistributionType set as FullyReplicated[0m
[34m[23:34:44] 1760x7 matrix with 12320 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2021-09-30:23:34:44:INFO] Determined delimiter of CSV input is

In [5]:
xgb_predictor = estimator.deploy(initial_instance_count=1,instance_type='ml.m4.xlarge')

-----!

In [6]:
test_data_array = test_data.drop(['istomato'], axis=1).values #load the data into an array
# xgb_predictor.content_type = 'text/csv' # set the data type for an inference
xgb_predictor.serializer = csv_serializer # set the serializer type
predictions = xgb_predictor.predict(test_data_array).decode('utf-8') # predict!
predictions_array = np.fromstring(predictions[1:], sep=',') # and turn the prediction into an array
print(predictions_array.shape)
print(predictions_array)

cm = pd.crosstab(index=test_data['istomato'], columns=np.round(predictions_array), rownames=['Observed'], colnames=['Predicted'])
tn = cm.iloc[0,0]; fn = cm.iloc[1,0]; tp = cm.iloc[1,1]; fp = cm.iloc[0,1]; p = (tp+tn)/(tp+tn+fp+fn)*100
print("\n{0:<20}{1:<4.1f}%\n".format("Overall Classification Rate: ", p))
print("{0:<15}{1:<15}{2:>8}".format("Predicted", "NotTomato", "Tomato"))
print("Observed")
print("{0:<15}{1:<2.0f}% ({2:<}){3:>6.0f}% ({4:<})".format("NotTomato", tn/(tn+fn)*100,tn, fp/(tp+fp)*100, fp))
print("{0:<16}{1:<1.0f}% ({2:<}){3:>7.0f}% ({4:<}) \n".format("Tomato", fn/(tn+fn)*100,fn, tp/(tp+fp)*100, tp))

The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(440,)
[0.06419537 0.06722168 0.06419537 0.06419537 0.06419537 0.06419537
 0.06419537 0.06419537 0.06419537 0.06419537 0.34863633 0.06419537
 0.06419537 0.06419537 0.06419537 0.06419537 0.06419537 0.06419537
 0.06419537 0.06419537 0.06419537 0.10059974 0.06419537 0.89532226
 0.06722168 0.06419537 0.06419537 0.06419537 0.06419537 0.06419537
 0.06419537 0.06419537 0.06419537 0.06419537 0.06419537 0.06722168
 0.91964841 0.06419537 0.06722168 0.06419537 0.06419537 0.06419537
 0.06722168 0.06419537 0.06419537 0.06419537 0.06722168 0.90180528
 0.06419537 0.06419537 0.06419537 0.06419537 0.06419537 0.06419537
 0.06419537 0.06419537 0.06419537 0.06419537 0.06419537 0.06419537
 0.06419537 0.06419537 0.06419537 0.06419537 0.06419537 0.06419537
 0.06419537 0.06419537 0.06419537 0.06419537 0.8130849  0.06419537
 0.90180528 0.06419537 0.06419537 0.06419537 0.06419537 0.06419537
 0.06419537 0.06419537 0.06419537 0.06419537 0.06419537 0.06419537
 0.06419537 0.89532226 0.06419537 0.50797719 0.06722168

In [7]:
sagemaker.Session().delete_endpoint(xgb_predictor.endpoint)
bucket_to_delete = boto3.resource('s3').Bucket(bucket_name)
bucket_to_delete.objects.all().delete()

The endpoint attribute has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


[{'ResponseMetadata': {'RequestId': 'TFAT7TSKHESE88FF',
   'HostId': 'F2ulcplcl0iBczg64klbGPtu6BMiv39nmAQd6S8TzechqUIbnFxyfGSd6Dy59BSlQKKlfZkYCn4=',
   'HTTPStatusCode': 200,
   'HTTPHeaders': {'x-amz-id-2': 'F2ulcplcl0iBczg64klbGPtu6BMiv39nmAQd6S8TzechqUIbnFxyfGSd6Dy59BSlQKKlfZkYCn4=',
    'x-amz-request-id': 'TFAT7TSKHESE88FF',
    'date': 'Thu, 30 Sep 2021 23:38:11 GMT',
    'content-type': 'application/xml',
    'transfer-encoding': 'chunked',
    'server': 'AmazonS3',
    'connection': 'close'},
   'RetryAttempts': 0},
  'Deleted': [{'Key': 'xgboost/output/xgboost-2021-09-30-23-31-26-067/profiler-output/system/training_job_end.ts'},
   {'Key': 'xgboost/test/test.csv'},
   {'Key': 'xgboost/output/xgboost-2021-09-30-23-31-26-067/output/model.tar.gz'},
   {'Key': 'xgboost/train/train.csv'},
   {'Key': 'xgboost/output/xgboost-2021-09-30-23-31-26-067/profiler-output/system/incremental/2021093023/1633044840.algo-1.json'},
   {'Key': 'xgboost/output/xgboost-2021-09-30-23-31-26-067/profil