# Environment Setup (Python)


In [1]:
!pip install -q condacolab
import condacolab
condacolab.install()

⏬ Downloading https://github.com/jaimergp/miniforge/releases/latest/download/Mambaforge-colab-Linux-x86_64.sh...
📦 Installing...
📌 Adjusting configuration...
🩹 Patching environment...
⏲ Done in 0:00:33
🔁 Restarting kernel...


In [1]:
%%writefile environment.yml
channels:
  - default
  - conda-forge
  - pytorch
dependencies:
  - pip>20.1
  - python>=3.7,<3.9
  - pytorch=1.8.0
  - torchtext=0.9
  - pip:
    - torch==1.8
    - transformers==4.11.0
    - torchmetrics==0.5.1
    - spacy==3.1.3
    - pandas==1.3.3
    - pytorch_lightning==1.4.8
    - azureml-core>=1.31.0
    - azureml-mlflow>=1.31.0
    - tqdm>=4.59,<4.60
    - matplotlib==3.4.3

Writing environment.yml


In [2]:
!conda env update -n base -f environment.yml

Collecting package metadata (repodata.json): - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | 

In [3]:
import condacolab
condacolab.check()

✨🍰✨ Everything looks OK!


In [4]:
import pytorch_lightning
from azureml.core.authentication import ServicePrincipalAuthentication

In [18]:
from spacy.cli import download

download("en_core_web_sm")
#python -m spacy download en_core_web_sm

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


# Local Training

## Data Exploration

Open IMDB dataset and explore some example sentences

In [36]:
from torchtext.legacy.data import Field
from torchtext.legacy.datasets import IMDB
import torch

#dir(Field)
#Field.dtypes

import spacy
from torchtext.data.utils import get_tokenizer
tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [27]:
from torchtext.legacy.data import Field
from torchtext.legacy.datasets import IMDB
import torch

# Create a simple Field just to be able to read sentences and labels
text = Field(tokenize='spacy', tokenizer_language="en_core_web_sm")
label = Field(dtype=torch.float)
train, test = IMDB.splits(text, label)

Explore data, look at a few sentences in the dataset

In [28]:
print(f'Number of training examples: {len(train)}')
print(f'Number of testing examples: {len(test)}')
print(vars(train.examples[1]))


Number of training examples: 25000
Number of testing examples: 25000
{'text': ['I', 'was', 'so', 'glad', 'I', 'came', 'across', 'this', 'short', 'film', '.', 'I', "'m", 'always', 'so', 'disappointed', 'that', 'short', 'films', 'are', 'hard', 'to', 'come', 'across', ',', 'so', 'when', 'I', 'saw', 'this', 'and', 'saw', 'that', 'it', 'was', 'nominated', 'for', 'the', 'Live', 'Action', 'Short', 'Film', 'at', 'the', 'Academy', 'Awards', ',', 'I', 'was', 'so', 'pleased', 'that', 'I', 'actually', 'had', 'a', 'film', 'that', 'I', 'was', 'rooting', 'for.<br', '/><br', '/>The', 'plot', 'is', 'pretty', 'simple', ',', 'the', 'director', ',', 'writer', ',', 'and', 'star', 'Nacho', 'Vigalondo', 'tried', 'coming', 'up', 'with', 'a', 'reason', 'people', 'would', 'suddenly', 'break', 'out', 'into', 'a', 'song', 'and', 'dance', 'number', 'like', 'they', 'do', 'in', 'movie', 'musicals', '.', 'The', 'result', 'is', 'extremely', 'entertaining', 'and', 'the', 'song', 'is', 'actually', 'really', 'catchy.<br'

## Data Prepration

Write code to tokenize sentences, print a few sample sentences and their tokenization

In [41]:
# Write code
def generate_bigrams(x):
    n_grams = set(zip(*[x[i:] for i in range(2)]))
    for n_gram in n_grams:
        x.append(' '.join(n_gram))
    return x

def tokenizer(text):
    return [token.text for token in spacy_en.tokenizer(text)]

from torchtext.data.utils import get_tokenizer
tokenizer = get_tokenizer('spacy', language='en_core_web_sm')


## Environment Setup

Write code to tokenize and preprocess data in dataset, split data into train, val and test sets

In [None]:
# Write code
import random
from torchtext.legacy import data
from torchtext.legacy import datasets

text = Field(tokenize='spacy', tokenizer_language="en_core_web_sm", preprocessing=generate_bigrams)
#label = Field(dtype=torch.float)
label = data.LabelField(dtype = torch.float)
train, test = IMDB.splits(text, label)

train_data, valid_data = train.split(split_ratio = 0.8, random_state = random.seed(SEED))

MAX_VOCAB_SIZE = 25_000

text.build_vocab(train_data, 
                 max_size = MAX_VOCAB_SIZE, 
                 vectors = "glove.6B.100d", 
                 unk_init = torch.Tensor.normal_)

label.build_vocab(train_data)



In [43]:
print(f'Number of training examples: {len(train_data)}')
print(f'Number of testing examples: {len(valid_data)}')
train_data.shape

Number of training examples: 20000
Number of testing examples: 5000


<generator object Dataset.__getattr__ at 0x7f9693a7df50>

## Model Implementation

Write code to create a model for training

In [44]:
# Write code
import torch.nn as nn
import torch.nn.functional as F

class FastSentimentAnalyzer(nn.Module):
    def __init__(self, vocab_size, embedding_dim, output_dim, pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        
        self.fc = nn.Linear(embedding_dim, output_dim)
        
    def forward(self, text):
        
        #text = [sent len, batch size]
        
        embedded = self.embedding(text)
                
        #embedded = [sent len, batch size, emb dim]
        
        embedded = embedded.permute(1, 0, 2)
        
        #embedded = [batch size, sent len, emb dim]
        
        pooled = F.avg_pool2d(embedded, (embedded.shape[1], 1)).squeeze(1) 
        
        #pooled = [batch size, embedding_dim]
                
        return self.fc(pooled)


class RNNSentimentAnalyzer(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text):

        #text = [sent len, batch size]
        
        embedded = self.embedding(text)
        
        #embedded = [sent len, batch size, emb dim]
        
        output, hidden = self.rnn(embedded)
        
        #output = [sent len, batch size, hid dim]
        #hidden = [1, batch size, hid dim]
        
        assert torch.equal(output[-1,:,:], hidden.squeeze(0))
        
        return self.fc(hidden.squeeze(0))

## Training

Write code to train your model

In [49]:
# Write code
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
                
        predictions = model(batch.text).squeeze(1)
        
        loss = criterion(predictions, batch.label)
        
        acc = binary_accuracy(predictions, batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.text).squeeze(1)
            
            loss = criterion(predictions, batch.label)
            
            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)    


Run Training

In [51]:
from torchtext.legacy import data
import torch.optim as optim
import time

BATCH_SIZE = 64

model = FastSentimentAnalyzer(INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM, PAD_IDX)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test), 
    batch_size = BATCH_SIZE, 
    device = device)

INPUT_DIM = len(text.vocab)
EMBEDDING_DIM = 100
OUTPUT_DIM = 1
PAD_IDX = text.vocab.stoi[text.pad_token]

pretrained_embeddings = text.vocab.vectors

model.embedding.weight.data.copy_(pretrained_embeddings)

UNK_IDX = text.vocab.stoi[text.unk_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()

model = model.to(device)
criterion = criterion.to(device)

N_EPOCHS = 5

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut3-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

ValueError: ignored

# Azure ML Backed Hyper Experimention (OPTIONAL)

You do not need to complete the following sections, but if you are feeling brave go ahead.  You will need to setup your own Azure subscription, resource group, and AzureML Workspace (with compute clusters)

## Azure Workspace Setup

### Service Prinicple Auth

In [None]:
from azureml.core.authentication import ServicePrincipalAuthentication
from azureml.core import Workspace

sp = ServicePrincipalAuthentication(tenant_id="<tenant_id>", # tenantID
                                    service_principal_id="<client_id>", # clientId
                                    service_principal_password="<pass>")

subscription_id = "<sub_id>"
resource_group = "<resource_group>"
workspace_name = '<ws_name>'

ws = Workspace.get(subscription_id=subscription_id, resource_group=resource_group, name=workspace_name, auth=sp)

## Environment Auth

In [None]:
from azureml.core import Workspace

subscription_id = "<sub_id>"
resource_group = "<resource_group>"
workspace_name = '<ws_name>'

ws = Workspace(subscription_id, resource_group, workspace_name)

## Run on AzureML

Write code to generate a single run, optionally include hyperdrive run

[Here is a helpful tutorial](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/ml-frameworks/pytorch/train-hyperparameter-tune-deploy-with-pytorch/train-hyperparameter-tune-deploy-with-pytorch.ipynb)



In [None]:
# Write code