In [1]:
import pandas as pd
import boto3
import sagemaker

In [2]:
# Session object used for the SageMaker/S3 calls in the rest of the notebook
sagemaker_session = sagemaker.Session()

# Name of the session's default S3 bucket; the data is uploaded here later
bucket = sagemaker_session.default_bucket()

# Execution role that is later handed to the estimator
role = sagemaker.get_execution_role()

In [3]:
# Show which bucket name the session resolved to
print(bucket)

sagemaker-us-east-2-212613453703


In [4]:
# Local folder to upload, and the S3 key prefix to store it under
data_dir = 'plagiarism_data'
prefix = 'plagiarism-data'

# Upload the folder and keep the returned S3 location for training
input_data = sagemaker_session.upload_data(
    path=data_dir,
    bucket=bucket,
    key_prefix=prefix,
)
print(input_data)

s3://sagemaker-us-east-2-212613453703/plagiarism-data


#### Test cell


In [5]:
# Gather the key of every object currently in the bucket, then echo each one
s3_bucket = boto3.resource('s3').Bucket(bucket)
empty_check = [obj.key for obj in s3_bucket.objects.all()]
for key in empty_check:
    print(key)

# At least one object must be present for the check to pass
assert empty_check, 'S3 bucket is empty.'
print('Test passed!')

plagiarism-data/test.csv
plagiarism-data/train.csv
Test passed!


In [6]:
# Shell out to pygmentize to print the training script with syntax colouring
!pygmentize source_sklearn/train.py

[34mfrom[39;49;00m [04m[36m__future__[39;49;00m [34mimport[39;49;00m print_function

[34mimport[39;49;00m [04m[36margparse[39;49;00m
[34mimport[39;49;00m [04m[36mos[39;49;00m
[34mimport[39;49;00m [04m[36mpandas[39;49;00m [34mas[39;49;00m [04m[36mpd[39;49;00m

[34mfrom[39;49;00m [04m[36msklearn.externals[39;49;00m [34mimport[39;49;00m joblib

[37m## TODO: Import any additional libraries you need to define a model[39;49;00m


[37m# Provided model load function[39;49;00m
[34mdef[39;49;00m [32mmodel_fn[39;49;00m(model_dir):
    [33m"""Load model from the model_dir. This is the same model that is saved[39;49;00m
[33m    in the main if statement.[39;49;00m
[33m    """[39;49;00m
    [34mprint[39;49;00m([33m"[39;49;00m[33mLoading model.[39;49;00m[33m"[39;49;00m)
    
    [37m# load using joblib[39;49;00m
    model = joblib.load(os.path.join(model_dir, [33m"[39;49;00m[33mmodel.joblib[39;49;00m[33m"[39;49;00m))
    [34mprint[39;49

In [6]:
from sagemaker.sklearn.estimator import SKLearn

# Scikit-learn estimator whose entry point is train.py inside source_sklearn/.
# NOTE(review): newer SageMaker SDK releases rename the train_instance_*
# arguments -- confirm against the installed SDK version before upgrading.
estimator = SKLearn(
    entry_point='train.py',
    source_dir='source_sklearn',
    role=role,
    train_instance_type='ml.c4.xlarge',
    train_instance_count=1,
)

In [7]:
%%time
# Start training, passing the uploaded S3 data location under the 'train' key
estimator.fit({'train': input_data})

2019-12-02 19:07:07 Starting - Starting the training job...
2019-12-02 19:07:08 Starting - Launching requested ML instances...
2019-12-02 19:08:06 Starting - Preparing the instances for training......
2019-12-02 19:09:01 Downloading - Downloading input data
2019-12-02 19:09:01 Training - Downloading the training image..[31m2019-12-02 19:09:16,454 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[31m2019-12-02 19:09:16,456 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[31m2019-12-02 19:09:16,466 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[31m2019-12-02 19:09:16,742 sagemaker-containers INFO     Module train does not provide a setup.py. [0m
[31mGenerating setup.py[0m
[31m2019-12-02 19:09:16,743 sagemaker-containers INFO     Generating setup.cfg[0m
[31m2019-12-02 19:09:16,743 sagemaker-containers INFO     Generating MANIFEST.in[0m
[31m2019-12-02 19:09:16,743 sagema

In [8]:
%%time
# Deploy the trained estimator on a single ml.t2.medium instance; the returned
# predictor is used below to request predictions
predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.t2.medium')

-------------------------------------------------------------------------------------------------!CPU times: user 519 ms, sys: 28.8 ms, total: 548 ms
Wall time: 8min 9s


In [9]:
import os

# test.csv is read without a header row: column 0 becomes the label vector,
# every remaining column becomes the feature matrix
test_csv_path = os.path.join(data_dir, "test.csv")
test_data = pd.read_csv(test_csv_path, header=None, names=None)
test_y, test_x = test_data.iloc[:, 0], test_data.iloc[:, 1:]

In [10]:
# Request predictions for the test features from the deployed predictor
test_y_preds = predictor.predict(test_x)

# There must be exactly one prediction per true label
n_preds, n_labels = len(test_y_preds), len(test_y)
assert n_preds == n_labels, 'Unexpected number of predictions.'
print('Test passed!')

Test passed!


In [11]:
from sklearn.metrics import accuracy_score

# Accuracy of the predictions against the true test labels
accuracy = accuracy_score(test_y, test_y_preds)
print(accuracy)

# Print both label vectors, one after the other, for a visual comparison
for heading, labels in (
    ('\nPredicted class labels: ', test_y_preds),
    ('\nTrue class labels: ', test_y.values),
):
    print(heading)
    print(labels)

0.96

Predicted class labels: 
[1 1 1 1 1 1 0 0 0 0 0 0 1 1 1 1 1 1 0 1 1 1 1 0 0]

True class labels: 
[1 1 1 1 1 1 0 0 0 0 0 0 1 1 1 1 1 1 0 1 0 1 1 0 0]


In [15]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of true vs. predicted labels, shown as the cell's output
confusion_matrix(test_y, test_y_preds)

array([[ 9,  1],
       [ 0, 15]])

In [19]:
# Flatten the confusion matrix with ravel() and unpack the four cells as
# tn, fp, fn, tp, then report the two error counts
tn, fp, fn, tp = confusion_matrix(test_y, test_y_preds).ravel()
print('False Positives: {}'.format(fp))
print('False Negatives: {}'.format(fn))

False Postives: 1
False Negatives: 0


In [25]:
# Put the predictions in a one-column frame and attach it to the right of the
# test data, then give every column a readable name
pred_column = pd.DataFrame(test_y_preds)
test_pred_df = pd.concat([test_data, pred_column], axis=1)
test_pred_df.columns = ['true_label', 'c_1', 'c_5', 'lcs_word', 'pred_label']

In [26]:
# Display true labels, feature columns, and predicted labels side by side
test_pred_df

Unnamed: 0,true_label,c_1,c_5,lcs_word,pred_label
0,1,1.0,0.92228,0.820755,1
1,1,0.765306,0.589655,0.621711,1
2,1,0.884444,0.180995,0.597458,1
3,1,0.619048,0.043243,0.427835,1
4,1,0.92,0.394366,0.775,1
5,1,0.992674,0.973978,0.993056,1
6,0,0.412698,0.0,0.346667,0
7,0,0.462687,0.0,0.18932,0
8,0,0.581152,0.0,0.247423,0
9,0,0.584211,0.0,0.294416,0


In [27]:
# Remove the deployed endpoint now that all predictions have been collected
predictor.delete_endpoint()

In [28]:
# Empty the bucket: delete every object stored in it.
# Note this removes ALL objects in the bucket, and the bucket itself is not deleted.
bucket_to_delete = boto3.resource('s3').Bucket(bucket)
bucket_to_delete.objects.all().delete()

[{'ResponseMetadata': {'RequestId': '2D71F4D056FD7A94',
   'HostId': 'nN4vcgjy6pRxCFZ+IgsvMUUllXrlOAZ8xDQKxZ/ro28W8hf5VE8wlHSrGkHj6NFeASLbrVmtPXo=',
   'HTTPStatusCode': 200,
   'HTTPHeaders': {'x-amz-id-2': 'nN4vcgjy6pRxCFZ+IgsvMUUllXrlOAZ8xDQKxZ/ro28W8hf5VE8wlHSrGkHj6NFeASLbrVmtPXo=',
    'x-amz-request-id': '2D71F4D056FD7A94',
    'date': 'Mon, 02 Dec 2019 20:57:45 GMT',
    'connection': 'close',
    'content-type': 'application/xml',
    'transfer-encoding': 'chunked',
    'server': 'AmazonS3'},
   'RetryAttempts': 0},
  'Deleted': [{'Key': 'plagiarism-data/test.csv'},
   {'Key': 'plagiarism-data/train.csv'},
   {'Key': 'sagemaker-scikit-learn-2019-12-02-19-07-07-001/output/model.tar.gz'},
   {'Key': 'sagemaker-scikit-learn-2019-12-02-19-07-07-001/source/sourcedir.tar.gz'}]}]