In [None]:
import boto3
import sagemaker

# Resolve the AWS region from the default boto3 session configuration.
session = boto3.session.Session()
aws_region = session.region_name
# TODO: replace this placeholder with the name of your S3 bucket before running.
s3_bucket = "<your-s3-bucket-name>"

# Best-effort sanity check: report where the bucket lives so that a region
# mismatch is noticed before a training job is launched. Failures are reported
# rather than raised so the rest of the notebook can still be inspected.
try:
    s3_client = boto3.client("s3")
    response = s3_client.get_bucket_location(Bucket=s3_bucket)
    # S3 returns a null LocationConstraint for buckets located in us-east-1.
    bucket_region = response["LocationConstraint"] or "us-east-1"
    print(f"Bucket region: {bucket_region}")
except Exception as err:
    # Catch Exception (not a bare except) so KeyboardInterrupt/SystemExit still
    # propagate, and surface the underlying cause alongside the hint.
    print(
        f"Access Error: Check if '{s3_bucket}' S3 bucket is in '{aws_region}' region ({err})"
    )

In [None]:
# Key prefix inside the bucket that is combined with the bucket name below.
s3_prefix = "models/word2vec/dbpedia/v1"
# Full S3 URI built from the bucket and prefix; passed to the estimator as its output path.
s3_output_location = f"s3://{s3_bucket}/{s3_prefix}"
print(f"Model output location:{s3_output_location}")

In [None]:
# Look up the BlazingText image URI (version "1") for the session's region.
container = sagemaker.image_uris.retrieve(
    framework="blazingtext",
    region=aws_region,
    version="1",
)
print(f"Using SageMaker BlazingText container: {container} ({aws_region})")

In [None]:
# SageMaker session and the execution role the training job will assume.
sess = sagemaker.Session()
role = sagemaker.get_execution_role()
print(role)

# Generic estimator wrapping the BlazingText container: a single
# ml.c5.4xlarge instance with a 100 GB volume, File input mode, and
# artifacts written to the S3 output location defined earlier.
bt_model = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    sagemaker_session=sess,
    instance_type="ml.c5.4xlarge",
    instance_count=1,
    volume_size=100,
    max_run=360000,
    input_mode="File",
    output_path=s3_output_location,
)

In [None]:
# Word2Vec training configuration, collected in one mapping and then applied
# to the estimator in a single call.
hyperparameters = {
    "mode": "skipgram",
    "epochs": 20,
    "min_count": 5,
    "sampling_threshold": 0.0001,
    "learning_rate": 0.05,
    "window_size": 5,
    "vector_dim": 150,
    "negative_samples": 5,
    # Perform similarity evaluation on WS-353 dataset at the end of training
    "evaluation": True,
    "subwords": True,
}
bt_model.set_hyperparameters(**hyperparameters)

In [None]:
from sagemaker.inputs import TrainingInput
# S3 prefix that holds the training data.
s3_train = f"s3://{s3_bucket}/blazing-text/word2vec/dbpedia"

# Describe the training channel: every object under the prefix, fully
# replicated, delivered in File mode.
train_input = TrainingInput(
    s3_train,
    s3_data_type="S3Prefix",
    distribution="FullyReplicated",
    input_mode="File",
)

# Channel name -> input definition, as passed to the estimator's fit().
data_channels = dict(train=train_input)

In [None]:
# Start training on the configured channels; wait for completion and stream all logs.
bt_model.fit(
    inputs=data_channels,
    wait=True,
    logs="All",
)

In [None]:
# Host the trained model on a single ml.m5.xlarge instance.
bt_endpoint = bt_model.deploy(
    instance_type="ml.m5.xlarge",
    initial_instance_count=1,
)

In [None]:
import json

# Words to send to the endpoint.
words = ["extraordinary", "amazing"]
payload = {"instances": words}

# Serialize the request ourselves and declare JSON for both the request body
# and the expected response.
request_body = json.dumps(payload)
request_headers = {"ContentType": "application/json", "Accept": "application/json"}
response = bt_endpoint.predict(request_body, initial_args=request_headers)

# Decode the JSON response and show it.
vecs = json.loads(response)
print(vecs)

In [None]:
s3 = boto3.resource("s3")

# Derive the object key from the model artifact location: take everything
# after the first "/" found at or beyond index 5 (presumably the slash that
# follows the bucket name in an "s3://<bucket>/<key>" URI).
model_uri = bt_model.model_data
key_start = model_uri.find("/", 5) + 1
key = model_uri[key_start:]

# Fetch the artifact into the working directory.
model_bucket = s3.Bucket(s3_bucket)
model_bucket.download_file(key, "model.tar.gz")

In [None]:
# Extract the gzip-compressed model archive into the current directory, listing each file as it is unpacked.
!tar -xvzf model.tar.gz

In [None]:
# Print eval.json. NOTE(review): presumably the evaluation results produced because training ran with evaluation=True — confirm against the archive contents.
!cat eval.json

In [None]:
import numpy as np
from sklearn.preprocessing import normalize

# Read the 400 most frequent word vectors. The vectors in the file are in descending order of frequency.
num_points = 400

index_to_word = []
with open("vectors.txt", "r", encoding="utf-8") as f:
    # Header line: the second whitespace-separated field is the vector dimension.
    dim = int(f.readline().split()[1])
    word_vecs = np.zeros((num_points, dim), dtype=float)
    for line in f:
        if len(index_to_word) >= num_points:
            break
        fields = line.split()
        if not fields:
            # Tolerate blank lines instead of failing on fields[0].
            continue
        # First field is the word, the remaining fields are its vector components.
        values = [float(value) for value in fields[1:]]
        word_vecs[len(index_to_word), : len(values)] = values
        index_to_word.append(fields[0])

if not index_to_word:
    raise ValueError("vectors.txt does not contain any word vectors")

# Keep only the rows that were actually filled: when the file holds fewer than
# num_points words, the preallocated all-zero rows have no label and must not
# be passed on as if they were embeddings.
word_vecs = word_vecs[: len(index_to_word)]
word_vecs = normalize(word_vecs, copy=False, return_norm=False)

In [None]:
from sklearn.manifold import TSNE

# Project the word vectors to 2-D for plotting. random_state is fixed so that
# re-running the notebook yields the same layout.
# NOTE(review): recent scikit-learn releases rename `n_iter` to `max_iter` —
# confirm against the installed version before changing it.
tsne = TSNE(perplexity=40, n_components=2, init="pca", n_iter=10000, random_state=42)
two_d_embeddings = tsne.fit_transform(word_vecs[:num_points])
# Labels in the same order as the embedded rows.
labels = index_to_word[:num_points]

In [None]:
from matplotlib import pylab

%matplotlib inline


def plot(embeddings, labels):
    """Draw a labelled 2-D scatter plot on a 20x20 figure and show it.

    Args:
        embeddings: Array indexed as ``embeddings[i, :]``; each row must
            unpack into an ``(x, y)`` pair.
        labels: Iterable of annotation texts; the i-th label is placed at
            the i-th row of ``embeddings``.
    """
    pylab.figure(figsize=(20, 20))
    # Placement options shared by every annotation.
    annotation_style = dict(xytext=(5, 2), textcoords="offset points", ha="right", va="bottom")
    for row, text in enumerate(labels):
        x_pos, y_pos = embeddings[row, :]
        # One scatter call per point, followed by its label.
        pylab.scatter(x_pos, y_pos)
        pylab.annotate(text, xy=(x_pos, y_pos), **annotation_style)
    pylab.show()


# Render the 2-D embeddings with their word labels.
plot(embeddings=two_d_embeddings, labels=labels)

In [None]:
# Clean up the hosting resources. Delete the model first: the predictor finds
# the model name(s) through the endpoint configuration, so that lookup must
# happen before the endpoint and its configuration are removed.
bt_endpoint.delete_model()
bt_endpoint.delete_endpoint(delete_endpoint_config=True)