[![AWS Data Wrangler](_static/logo.png "AWS Data Wrangler")](https://github.com/awslabs/aws-data-wrangler)

# PyTorch

## Table of Contents
* [1.Defining Training Function](#1.-Defininf-Training-Function)
* [2.Traning From Amazon S3](#1.-Traning-From-Amazon-S3)
	* [2.1 Writing PyTorch Dataset to S3](#1.1-Writing-PyTorch-Dataset-to-S3)
	* [2.2 Training Network](#1.2-Training-Network)
* [3. Training From SQL Query](#2.-Training-From-SQL-Query)
	* [3.1 Writing Data to SQL Database](#2.1-Writing-Data-to-SQL-Database)
	* [3.3 Training Network From SQL](#2.2-Reading-single-JSON-file)
* [4. Creating Custom S3 Dataset](#1.-Creating-Custom-S3-Dataset)
	* [4.1 Creating Custom PyTorch Dataset](#1.1-Creating-Custom-PyTorch-Dataset)
	* [4.2 Writing Data to S3](#1.1-Writing-Data-to-S3)
	* [4.3 Training Network](#1.2-Training-Network)
* [5. Delete objects](#6.-Delete-objects)

In [1]:
import io
import boto3
import torch
import torchvision
import awswrangler as wr

from torch.optim import SGD
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader

In [2]:
import getpass
bucket = getpass.getpass()

# 1. Defining Training Function

In [4]:
def train(model, dataset, batch_size=64, epochs=2, device='cpu'):

    criterion = CrossEntropyLoss().to(device)
    opt = SGD(model.parameters(), 0.025)
    loader = DataLoader(dataset, batch_size=batch_size, num_workers=1)

    for epoch in range(epochs):

        correct = 0    
        model.train()
        for i, (inputs, labels) in enumerate(loader):

            # Forward Pass
            outputs = model(inputs)
            
            # Backward Pass
            loss = criterion(outputs, labels)
            loss.backward()
            opt.step()
            opt.zero_grad()
            
            # Accuracy
            _, predicted = torch.max(outputs.data, 1)
            correct += (predicted == labels).sum().item()
            accuracy = 100 * correct / ((i+1) * batch_size)

            print(f'batch: {i} loss: {loss.mean().item():.4f} acc: {accuracy:.2f}')   

# 2. Traning From Amazon S3

In [5]:
client_s3 = boto3.client("s3")
folder = "tutorial_torch_dataset"

wr.s3.delete_objects(f"s3://{bucket}/{folder}")
for i in range(3):
    batch = (
        torch.randn(100, 3, 32, 32),
        torch.randint(2, size=(100,)),
    )
    buff = io.BytesIO()
    torch.save(batch, buff)
    buff.seek(0)
    client_s3.put_object(
        Body=buff.read(),
        Bucket=bucket,
        Key=f"{folder}/file{i}.pt",
    )

## 2.2 Training Network

In [6]:
train(
    torchvision.models.resnet18(),
    wr.torch.S3IterableDataset(path=f"{bucket}/{folder}")
)

batch: 0 loss: 6.9552 acc: 0.00
batch: 1 loss: 2.9621 acc: 23.44
batch: 2 loss: 0.9873 acc: 31.77
batch: 3 loss: 1.9760 acc: 34.38
batch: 4 loss: 3.3523 acc: 33.44
batch: 0 loss: 1.2023 acc: 59.38
batch: 1 loss: 0.8057 acc: 60.16
batch: 2 loss: 0.6782 acc: 62.50
batch: 3 loss: 0.4291 acc: 67.58
batch: 4 loss: 0.2953 acc: 66.88


# 2. Training Directly From SQL Query

## 2.1 Writing Data to SQL Database

In [None]:
eng = wr.catalog.get_engine("aws-data-wrangler-redshift")
df = pd.DataFrame({
    "height": [2, 1.4, 1.7, 1.8, 1.9],
    "name": ["foo", "boo"],
    "target": [1, 0, 0, 1, 2, 3]
})

wr.db.to_sql(
    df,
    eng_redshift,
    schema="public",
    name="torch",
    if_exists="replace",
    index=False
)

## 2.2 Training Network From SQL

In [None]:
train(
    model = torch.nn.Sequential(
        torch.nn.Linear(, 20),
        torch.nn.ReLU(),
        torch.nn.Linear(20, 2),    
    ),
    wr.torch.SQLDataset(
        sql="SELECT * FROM public.torch"
        con=eng
        label_col="target",
        chunksize=100
    )
)

# 3. Delete Objects

In [None]:
wr.s3.delete_objects(f"s3://{bucket}/")