In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import CountVectorizer
import pickle
from sklearn.model_selection import train_test_split

In [None]:
# %mkdir ./data-sentiment
# !wget -O ./data-sentiment/aclImdb_v1.tar.gz http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
# !tar -zxf ./data-sentiment/aclImdb_v1.tar.gz -C ./data-sentiment

In [2]:
def prep_data(path):
    data,labels={},{}
    
    ## Positive Reviews
    data['pos'],labels['pos']=[],[]
    for filename in os.listdir(path + 'pos'):
        filepath=os.path.join(path,'pos',filename)
        with open(filepath) as f:
            data['pos'].append(f.read())
    labels['pos']=[1]*len(data['pos'])
    
    # Negative Reviews
    data['neg'],labels['neg']=[],[]
    for filename in os.listdir(path + 'neg'):
        filepath=os.path.join(path,'neg',filename)
        with open(filepath) as f:
            data['neg'].append(f.read())
    labels['neg']=[0]*len(data['neg'])
    
    X,y=shuffle(data['pos']+data['neg'],labels['pos']+labels['neg'])
    return X,y

In [3]:
train_X,train_y=prep_data('data-sentiment/aclImdb/train/')
test_X,test_y=prep_data('data-sentiment/aclImdb/test/')

In [None]:
# Remove Regex
# Remove Stopwords
# Count Vectorizer

In [4]:
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.stem.porter import *
stemmer = PorterStemmer()

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
import re
from bs4 import BeautifulSoup

def review_to_words(review):
    text = BeautifulSoup(review, "html.parser").get_text() # Remove HTML tags
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower()) # Convert to lower case
    words = text.split() # Split string into words
    words = [w for w in words if w not in stopwords.words("english")] # Remove stopwords
    words = [PorterStemmer().stem(w) for w in words] # stem
    
    return words

In [6]:
processed_data={}
processed_data['train']={}
processed_data['test']={}
processed_data['train']['data']=[review_to_words(review) for review in train_X]
processed_data['train']['labels']=train_y
processed_data['test']['data']=[review_to_words(review) for review in test_X]
processed_data['test']['labels']=test_y

In [7]:
with open('data-sentiment/data-sentiment-processed.pkl','wb') as f:
    pickle.dump(processed_data,f)

In [2]:
with open('data-sentiment/data-sentiment-processed.pkl','rb') as f:
    processed_data=pickle.load(f)

In [9]:
vectorizer=CountVectorizer(max_features=5000,lowercase=True)

In [10]:
train_X=vectorizer.fit_transform([' '.join(review) for review in processed_data['train']['data']]).toarray()
test_X=vectorizer.fit_transform([' '.join(review) for review in processed_data['test']['data']]).toarray()
train_y,test_y=np.array(processed_data['train']['labels']),np.array(processed_data['test']['labels'])
train_X.shape,train_y.shape

((25000, 5000), (25000,))

In [17]:
# !mkdir data-sentiment/sagemakerready
# !mkdir data-sentiment/sagemakerready/data

In [11]:
# Convert data to make it kosher for sagemaker
X_train,X_val,y_train,y_val=train_test_split(train_X,train_y,test_size=0.33)
X_test=test_X
pd.DataFrame(np.concatenate([y_train.reshape(-1,1),X_train],axis=1)).to_csv('data-sentiment/sagemakerready/data/train.csv',header=False,index=False)
pd.DataFrame(np.concatenate([y_val.reshape(-1,1),X_val],axis=1)).to_csv('data-sentiment/sagemakerready/data/validation.csv',header=False,index=False)
pd.DataFrame(X_test).to_csv('data-sentiment/sagemakerready/data/test.csv',header=False,index=False)

In [3]:
import sagemaker

In [4]:
# !pip uninstall -f sagemaker
# !pip install sagemaker==1.72.0
sagemaker.__version__

'1.72.0'

In [5]:
session=sagemaker.Session()


In [6]:
key_prefix='sentimentanalysis'
session.upload_data('data-sentiment/sagemakerready/data',key_prefix=key_prefix)

's3://sagemaker-us-east-1-032934527328/sentimentanalysis'

In [7]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.predictor import csv_serializer
session=sagemaker.Session()
role=get_execution_role()

In [15]:
xgb=sagemaker.estimator.Estimator(image_name=get_image_uri(session.boto_region_name,'xgboost'),
                                 role=role,
                                  train_instance_count=1,
                                  train_instance_type='ml.c5.9xlarge',
                                  train_max_run=3600,
                                  train_max_wait=3600,
                                  train_use_spot_instances=True,
                                  output_path='s3://sagemaker-us-east-1-032934527328/sentimentanalysis/output',
                                  sagemaker_session=session
                                 )

'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.
There is a more up to date SageMaker XGBoost image. To use the newer image, please set 'repo_version'='1.0-1'. For example:
	get_image_uri(region, 'xgboost', '1.0-1').
Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.


In [16]:
xgb.set_hyperparameters(max_depth=5,
                       eta=0.2,
                       gamma=4,
                       min_child_weight=6,
                       subsample=0.8,
                       objective='binary:logistic',
                       early_stopping_rounds=10,
                       num_round=200)

In [17]:
xgb.fit(
    {
        'train':sagemaker.s3_input('s3://sagemaker-us-east-1-032934527328/sentimentanalysis/train.csv',content_type='csv'),
        'validation':sagemaker.s3_input('s3://sagemaker-us-east-1-032934527328/sentimentanalysis/validation.csv',content_type='csv')
    }

)

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


2021-04-11 01:03:14 Starting - Starting the training job...
2021-04-11 01:03:18 Starting - Launching requested ML instances......
2021-04-11 01:04:29 Starting - Preparing the instances for training......
2021-04-11 01:05:35 Downloading - Downloading input data
2021-04-11 01:05:35 Training - Downloading the training image..[34mArguments: train[0m
[34m[2021-04-11:01:05:50:INFO] Running standalone xgboost training.[0m
[34m[2021-04-11:01:05:50:INFO] File size need to be processed in the node: 238.47mb. Available memory size in the node: 62014.71mb[0m
[34m[2021-04-11:01:05:50:INFO] Determined delimiter of CSV input is ','[0m
[34m[01:05:50] S3DistributionType set as FullyReplicated[0m
[34m[01:05:52] 16750x5000 matrix with 83750000 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2021-04-11:01:05:52:INFO] Determined delimiter of CSV input is ','[0m
[34m[01:05:52] S3DistributionType set as FullyReplicated[0m
[34m[01:05:52] 8250x5000 ma


2021-04-11 01:05:50 Training - Training image download completed. Training in progress.[34m[01:06:00] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 26 extra nodes, 8 pruned nodes, max_depth=5[0m
[34m[49]#011train-error:0.142985#011validation-error:0.178182[0m
[34m[01:06:00] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 22 extra nodes, 10 pruned nodes, max_depth=5[0m
[34m[50]#011train-error:0.141851#011validation-error:0.178545[0m
[34m[01:06:00] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 22 extra nodes, 4 pruned nodes, max_depth=5[0m
[34m[51]#011train-error:0.14209#011validation-error:0.177576[0m
[34m[01:06:00] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 16 extra nodes, 4 pruned nodes, max_depth=5[0m
[34m[52]#011train-error:0.141612#011validation-error:0.17697[0m
[34m[01:06:00] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 34 extra nodes, 8 pruned nodes, max_depth=5[0m
[34m[53]#011train-error:0.14006#011vali

[34m[01:06:10] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 12 extra nodes, 6 pruned nodes, max_depth=5[0m
[34m[122]#011train-error:0.105313#011validation-error:0.154788[0m
[34m[01:06:10] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 12 extra nodes, 4 pruned nodes, max_depth=5[0m
[34m[123]#011train-error:0.105672#011validation-error:0.154667[0m
[34m[01:06:10] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 20 extra nodes, 14 pruned nodes, max_depth=5[0m
[34m[124]#011train-error:0.105313#011validation-error:0.153697[0m
[34m[01:06:10] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 8 pruned nodes, max_depth=5[0m
[34m[125]#011train-error:0.105194#011validation-error:0.154303[0m
[34m[01:06:10] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 8 pruned nodes, max_depth=5[0m
[34m[126]#011train-error:0.104358#011validation-error:0.154303[0m
[34m[01:06:10] src/tree/updater_prune.cc:74: tree pru

[34m[196]#011train-error:0.08609#011validation-error:0.144364[0m
[34m[01:06:20] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 18 extra nodes, 12 pruned nodes, max_depth=5[0m
[34m[197]#011train-error:0.08609#011validation-error:0.14497[0m
[34mStopping. Best iteration:[0m
[34m[187]#011train-error:0.088179#011validation-error:0.142667
[0m

2021-04-11 01:06:28 Uploading - Uploading generated training model
2021-04-11 01:06:28 Completed - Training job completed
Training seconds: 67
Billable seconds: 27
Managed Spot Training savings: 59.7%


In [18]:
xgb_transformer=xgb.transformer(instance_count=1,instance_type='ml.m5.large')

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


In [19]:
xgb_transformer.transform('s3://sagemaker-us-east-1-032934527328/sentimentanalysis/test.csv',
                          content_type='text/csv',
                          split_type='Line')

In [20]:
xgb_transformer.wait()

..........................[34mArguments: serve[0m
[34m[2021-04-11 01:11:58 +0000] [1] [INFO] Starting gunicorn 19.9.0[0m
[34m[2021-04-11 01:11:58 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)[0m
[34m[2021-04-11 01:11:58 +0000] [1] [INFO] Using worker: gevent[0m
[34m[2021-04-11 01:11:58 +0000] [20] [INFO] Booting worker with pid: 20[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2021-04-11:01:11:58:INFO] Model loaded successfully for worker : 20[0m
[34m[2021-04-11 01:11:58 +0000] [21] [INFO] Booting worker with pid: 21[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2021-04-11:01:11:58:INFO] Model loaded successfully for worker : 21[0m
[35mArguments: serve[0m
[35m[2021-04-11 01:11:58 +0000] [1] [INFO] Starting gunicorn 19.9.0[0m
[35m[2021-04-11 01:11:58 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)[0m
[35m[2021-04-11 01:11:58 +0000] [1] [INFO] Using worker: gevent[0m
[35m[2021-04-11 01:11:58 +0000] [20] [INFO] Booting worker with pid: 20[0m

[34m[2021-04-11:01:12:54:INFO] Sniff delimiter as ','[0m
[34m[2021-04-11:01:12:54:INFO] Determined delimiter of CSV input is ','[0m
[35m[2021-04-11:01:12:54:INFO] Sniff delimiter as ','[0m
[35m[2021-04-11:01:12:54:INFO] Determined delimiter of CSV input is ','[0m
[34m[2021-04-11:01:12:55:INFO] Sniff delimiter as ','[0m
[34m[2021-04-11:01:12:55:INFO] Determined delimiter of CSV input is ','[0m
[35m[2021-04-11:01:12:55:INFO] Sniff delimiter as ','[0m
[35m[2021-04-11:01:12:55:INFO] Determined delimiter of CSV input is ','[0m
[34m[2021-04-11:01:12:57:INFO] Sniff delimiter as ','[0m
[34m[2021-04-11:01:12:57:INFO] Determined delimiter of CSV input is ','[0m
[35m[2021-04-11:01:12:57:INFO] Sniff delimiter as ','[0m
[35m[2021-04-11:01:12:57:INFO] Determined delimiter of CSV input is ','[0m
[34m[2021-04-11:01:12:59:INFO] Sniff delimiter as ','[0m
[34m[2021-04-11:01:12:59:INFO] Determined delimiter of CSV input is ','[0m
[35m[2021-04-11:01:12:59:INFO] Sniff delimiter 

In [21]:
!aws s3 cp --recursive $xgb_transformer.output_path data-sentiment/

download: s3://sagemaker-us-east-1-032934527328/xgboost-2021-04-11-01-07-46-850/test.csv.out to data-sentiment/test.csv.out


In [22]:
y_test= processed_data['test']['labels']
y_pred=np.loadtxt('data-sentiment/test.csv.out',delimiter=',')
y_pred[y_pred>0.5]=1
y_pred[y_pred<=0.5]=0

NameError: name 'processed_data' is not defined

In [4]:
y_test= processed_data['test']['labels']
predictions = pd.read_csv(os.path.join('data-sentiment/test.csv.out'), header=None)
predictions = [round(num) for num in predictions.squeeze().values]

In [28]:
np.mean(y_pred==y_test)

0.604

In [6]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, predictions)

0.53508