In [15]:
import pandas as pd
import numpy as np
import os
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import CountVectorizer
import pickle
from sklearn.model_selection import train_test_split

In [None]:
# %mkdir ./data-sentiment
# !wget -O ./data-sentiment/aclImdb_v1.tar.gz http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
# !tar -zxf ./data-sentiment/aclImdb_v1.tar.gz -C ./data-sentiment

In [None]:
def prep_data(path, random_state=None):
    """Read the labelled reviews under ``path`` and return them shuffled.

    Parameters
    ----------
    path : str
        Directory containing a ``pos`` and a ``neg`` sub-directory, each
        holding one review per file. A trailing separator is optional.
    random_state : int, RandomState instance or None, default=None
        Forwarded to ``sklearn.utils.shuffle``. Pass an int to get the same
        ordering on every run; ``None`` keeps the original unseeded shuffle.

    Returns
    -------
    X : sequence of str
        Review texts, positives and negatives shuffled together.
    y : sequence of int
        Labels aligned with ``X``: 1 for ``pos``, 0 for ``neg``.
    """
    data, labels = {}, {}

    for sentiment, label in (('pos', 1), ('neg', 0)):
        # os.path.join (instead of string concatenation) works whether or not
        # `path` ends with a separator.
        folder = os.path.join(path, sentiment)
        reviews = []
        for filename in os.listdir(folder):
            # Explicit encoding so decoding does not depend on the platform
            # default. NOTE(review): assumes the review files are UTF-8 - confirm.
            with open(os.path.join(folder, filename), encoding='utf-8') as f:
                reviews.append(f.read())
        data[sentiment] = reviews
        labels[sentiment] = [label] * len(reviews)

    # Shuffle texts and labels in unison so they stay aligned.
    X, y = shuffle(data['pos'] + data['neg'],
                   labels['pos'] + labels['neg'],
                   random_state=random_state)
    return X, y

In [None]:
# Load the raw review texts and 0/1 labels for each split.
train_X, train_y = prep_data('data-sentiment/aclImdb/train/')
test_X, test_y = prep_data('data-sentiment/aclImdb/test/')

In [None]:
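# Preprocessing plan:
# 1. Strip HTML and replace non-alphanumeric characters (regex)
# 2. Remove stopwords and stem the remaining words
# 3. Build bag-of-words features with CountVectorizer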

In [None]:
import nltk
from nltk.corpus import stopwords
# Explicit import instead of `from nltk.stem.porter import *`, so it is clear
# where PorterStemmer comes from and no other names leak into the notebook.
from nltk.stem.porter import PorterStemmer

# Download the corpus that backs nltk.corpus.stopwords.
nltk.download("stopwords")
stemmer = PorterStemmer()

In [None]:
import re
from bs4 import BeautifulSoup

# Filled on first use so the stopword corpus is read once, not once per word.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, loading them only once."""
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE["english"]


def review_to_words(review):
    """Turn one raw review into a list of stemmed, stopword-free tokens.

    Parameters
    ----------
    review : str
        Raw review text, possibly containing HTML markup.

    Returns
    -------
    list of str
        Lower-cased alphanumeric tokens with English stopwords removed and
        the Porter stemmer applied, in their original order.
    """
    text = BeautifulSoup(review, "html.parser").get_text()  # Remove HTML tags
    # Lower-case, then replace every non-alphanumeric character with a space.
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    stop_words = _english_stopwords()  # set membership instead of a list scan per word
    porter = PorterStemmer()  # one stemmer per review instead of one per word
    return [porter.stem(w) for w in text.split() if w not in stop_words]

In [None]:
# Tokenise every review of both splits; the labels are carried along unchanged.
processed_data = {
    'train': {
        'data': [review_to_words(text) for text in train_X],
        'labels': train_y,
    },
    'test': {
        'data': [review_to_words(text) for text in test_X],
        'labels': test_y,
    },
}

In [None]:
# Cache the tokenised reviews on disk so the preprocessing can be skipped later.
with open('data-sentiment/data-sentiment-processed.pkl', 'wb') as cache_file:
    pickle.dump(processed_data, cache_file)

In [16]:
# Reload the cached tokenised reviews.
# NOTE: pickle.load can execute arbitrary code; only load a file this notebook wrote.
with open('data-sentiment/data-sentiment-processed.pkl', 'rb') as cache_file:
    processed_data = pickle.load(cache_file)

In [None]:
# Bag-of-words vectorizer capped at 5000 features (max_features=5000).
vectorizer=CountVectorizer(max_features=5000,lowercase=True)

In [None]:
# CountVectorizer expects strings, so each token list is re-joined with spaces.
# The vocabulary is fitted on the training split only and reused for the test split.
train_X = vectorizer.fit_transform(
    [' '.join(tokens) for tokens in processed_data['train']['data']]
).toarray()
test_X = vectorizer.transform(
    [' '.join(tokens) for tokens in processed_data['test']['data']]
).toarray()
train_y = np.array(processed_data['train']['labels'])
test_y = np.array(processed_data['test']['labels'])
train_X.shape, train_y.shape

In [None]:
# !mkdir data-sentiment/sagemakerready
# !mkdir data-sentiment/sagemakerready/data

In [None]:
# Write the splits as header-less CSV files for the SageMaker XGBoost container:
# train/validation carry the label in the first column, test holds features only.
SAGEMAKER_DATA_DIR = 'data-sentiment/sagemakerready/data'
# The mkdir cell is commented out, so create the directory here; otherwise
# to_csv fails on a fresh checkout.
os.makedirs(SAGEMAKER_DATA_DIR, exist_ok=True)

# Fixed seed so the train/validation split is the same on every run.
X_train, X_val, y_train, y_val = train_test_split(
    train_X, train_y, test_size=0.33, random_state=42
)
X_test = test_X

pd.DataFrame(np.concatenate([y_train.reshape(-1, 1), X_train], axis=1)).to_csv(
    os.path.join(SAGEMAKER_DATA_DIR, 'train.csv'), header=False, index=False
)
pd.DataFrame(np.concatenate([y_val.reshape(-1, 1), X_val], axis=1)).to_csv(
    os.path.join(SAGEMAKER_DATA_DIR, 'validation.csv'), header=False, index=False
)
pd.DataFrame(X_test).to_csv(
    os.path.join(SAGEMAKER_DATA_DIR, 'test.csv'), header=False, index=False
)

In [1]:
import sagemaker

In [2]:
# This notebook uses the SageMaker Python SDK v1 API (pinned to 1.72.0 below).
# To switch an environment to that version (`-y` skips pip's confirmation prompt):
# !pip uninstall -y sagemaker
# !pip install sagemaker==1.72.0
sagemaker.__version__

'1.72.0'

In [3]:
# SageMaker session used below for the S3 upload.
session=sagemaker.Session()


In [4]:
# S3 key prefix under which the prepared CSV files are stored.
key_prefix='sentimentanalysis'
# Upload the whole local data directory; no bucket is passed, so the session
# chooses it (the recorded output shows the resulting s3:// location).
session.upload_data('data-sentiment/sagemakerready/data',key_prefix=key_prefix)

's3://sagemaker-us-east-1-032934527328/sentimentanalysis'

In [5]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.predictor import csv_serializer
# Fresh session plus the execution role, which is passed to the estimator below.
session=sagemaker.Session()
role=get_execution_role()

In [6]:
# Built-in XGBoost estimator trained on a single spot instance.
# output_path is derived from the session's default bucket and `key_prefix`
# rather than a hard-coded, account-specific bucket name, so the notebook
# also runs in other accounts and regions.
xgb = sagemaker.estimator.Estimator(
    image_name=get_image_uri(session.boto_region_name, 'xgboost'),
    role=role,
    train_instance_count=1,
    train_instance_type='ml.c5.9xlarge',
    train_max_run=3600,
    train_max_wait=3600,  # required with spot instances; must be >= train_max_run
    train_use_spot_instances=True,
    output_path='s3://{}/{}/output'.format(session.default_bucket(), key_prefix),
    sagemaker_session=session,
)

'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.
There is a more up to date SageMaker XGBoost image. To use the newer image, please set 'repo_version'='1.0-1'. For example:
	get_image_uri(region, 'xgboost', '1.0-1').
Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.


In [7]:
# Hyperparameters for the binary sentiment classifier, kept in one dict.
xgb_hyperparameters = dict(
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.8,
    objective='binary:logistic',
    early_stopping_rounds=10,
    num_round=200,
)
xgb.set_hyperparameters(**xgb_hyperparameters)

In [8]:
# Train on the uploaded CSV channels. The URIs are built from the session's
# default bucket and `key_prefix` (the location used by the upload cell)
# instead of a hard-coded, account-specific bucket name.
training_data_uri = 's3://{}/{}'.format(session.default_bucket(), key_prefix)
xgb.fit(
    {
        'train': sagemaker.s3_input(training_data_uri + '/train.csv', content_type='csv'),
        'validation': sagemaker.s3_input(training_data_uri + '/validation.csv', content_type='csv'),
    }
)

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


2021-04-12 16:38:27 Starting - Starting the training job...
2021-04-12 16:38:29 Starting - Launching requested ML instances......
2021-04-12 16:39:45 Starting - Preparing the instances for training......
2021-04-12 16:40:56 Downloading - Downloading input data
2021-04-12 16:40:56 Training - Downloading the training image...
2021-04-12 16:41:12 Training - Training image download completed. Training in progress.[34mArguments: train[0m
[34m[2021-04-12:16:41:13:INFO] Running standalone xgboost training.[0m
[34m[2021-04-12:16:41:13:INFO] File size need to be processed in the node: 238.47mb. Available memory size in the node: 62025.11mb[0m
[34m[2021-04-12:16:41:13:INFO] Determined delimiter of CSV input is ','[0m
[34m[16:41:13] S3DistributionType set as FullyReplicated[0m
[34m[16:41:14] 16750x5000 matrix with 83750000 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2021-04-12:16:41:14:INFO] Determined delimiter of CSV input is ','[0m


[34m[16:41:23] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 8 pruned nodes, max_depth=5[0m
[34m[57]#011train-error:0.140478#011validation-error:0.172242[0m
[34m[16:41:23] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 28 extra nodes, 2 pruned nodes, max_depth=5[0m
[34m[58]#011train-error:0.139701#011validation-error:0.17103[0m
[34m[16:41:23] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 22 extra nodes, 10 pruned nodes, max_depth=5[0m
[34m[59]#011train-error:0.138866#011validation-error:0.170303[0m
[34m[16:41:23] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 26 extra nodes, 12 pruned nodes, max_depth=5[0m
[34m[60]#011train-error:0.137612#011validation-error:0.169697[0m
[34m[16:41:23] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 10 pruned nodes, max_depth=5[0m
[34m[61]#011train-error:0.137254#011validation-error:0.170061[0m
[34m[16:41:24] src/tree/updater_prune.cc:74: tree pruning


2021-04-12 16:41:40 Uploading - Uploading generated training model
2021-04-12 16:41:40 Completed - Training job completed
Training seconds: 55
Billable seconds: 21
Managed Spot Training savings: 61.8%


In [9]:
# Create a batch-transform job definition from the trained estimator.
xgb_transformer=xgb.transformer(instance_count=1,instance_type='ml.m5.large')

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


In [10]:
# Run batch inference on the uploaded test features, one CSV line per record.
# The URI is built from the session's default bucket and `key_prefix` instead
# of a hard-coded, account-specific bucket name.
xgb_transformer.transform(
    's3://{}/{}/test.csv'.format(session.default_bucket(), key_prefix),
    content_type='text/csv',
    split_type='Line',
)

In [11]:
# Block until the batch-transform job has finished.
xgb_transformer.wait()

.........................[34mArguments: serve[0m
[34m[2021-04-12 16:48:21 +0000] [1] [INFO] Starting gunicorn 19.9.0[0m
[34m[2021-04-12 16:48:21 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)[0m
[34m[2021-04-12 16:48:21 +0000] [1] [INFO] Using worker: gevent[0m
[34m[2021-04-12 16:48:21 +0000] [20] [INFO] Booting worker with pid: 20[0m
[34m[2021-04-12 16:48:21 +0000] [21] [INFO] Booting worker with pid: 21[0m
  monkey.patch_all(subprocess=True)[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2021-04-12:16:48:21:INFO] Model loaded successfully for worker : 20[0m
[34m[2021-04-12:16:48:21:INFO] Model loaded successfully for worker : 21[0m
[35mArguments: serve[0m
[35m[2021-04-12 16:48:21 +0000] [1] [INFO] Starting gunicorn 19.9.0[0m
[35m[2021-04-12 16:48:21 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)[0m
[35m[2021-04-12 16:48:21 +0000] [1] [INFO] Using worker: gevent[0m
[35m[2021-04-12 16:48:21 +0000] [20] [INFO] Booting worker with pid: 20[0m


[34m[2021-04-12:16:49:19:INFO] Sniff delimiter as ','[0m
[34m[2021-04-12:16:49:19:INFO] Determined delimiter of CSV input is ','[0m
[34m[2021-04-12:16:49:19:INFO] Sniff delimiter as ','[0m
[34m[2021-04-12:16:49:19:INFO] Determined delimiter of CSV input is ','[0m
[35m[2021-04-12:16:49:19:INFO] Sniff delimiter as ','[0m
[35m[2021-04-12:16:49:19:INFO] Determined delimiter of CSV input is ','[0m
[35m[2021-04-12:16:49:19:INFO] Sniff delimiter as ','[0m
[35m[2021-04-12:16:49:19:INFO] Determined delimiter of CSV input is ','[0m
[34m[2021-04-12:16:49:22:INFO] Sniff delimiter as ','[0m
[34m[2021-04-12:16:49:22:INFO] Determined delimiter of CSV input is ','[0m
[34m[2021-04-12:16:49:22:INFO] Sniff delimiter as ','[0m
[34m[2021-04-12:16:49:22:INFO] Determined delimiter of CSV input is ','[0m
[35m[2021-04-12:16:49:22:INFO] Sniff delimiter as ','[0m
[35m[2021-04-12:16:49:22:INFO] Determined delimiter of CSV input is ','[0m
[35m[2021-04-12:16:49:22:INFO] Sniff delimiter 

In [12]:
# Copy everything under the transformer's S3 output path into data-sentiment/.
!aws s3 cp --recursive $xgb_transformer.output_path data-sentiment/

download: s3://sagemaker-us-east-1-032934527328/xgboost-2021-04-12-16-44-23-375/test.csv.out to data-sentiment/test.csv.out


In [17]:
y_test = processed_data['test']['labels']
y_pred = np.loadtxt('data-sentiment/test.csv.out', delimiter=',')
# Threshold the loaded scores at 0.5. Both masks are computed before any
# write, so the second assignment cannot touch values set by the first.
is_positive = y_pred > 0.5
is_negative = y_pred <= 0.5
y_pred[is_positive] = 1
y_pred[is_negative] = 0

In [None]:
# Alternative to the numpy cell: read the scores with pandas and round each one.
y_test = processed_data['test']['labels']
raw_scores = pd.read_csv('data-sentiment/test.csv.out', header=None).squeeze().values
predictions = [round(score) for score in raw_scores]

In [None]:
# Accuracy: fraction of thresholded predictions that equal the true labels.
np.mean(y_pred==y_test)

In [None]:
from sklearn.metrics import accuracy_score
# Same metric via scikit-learn, computed on the rounded `predictions` list.
accuracy_score(y_test, predictions)