# AMPA2822B Final Project

In [None]:
# Code to set up the assignment.
# Switch the notebook's working directory so that the relative paths used by
# later cells (./python, ./apps, ./data/ptb) resolve against this directory.
# NOTE(review): this is a hard-coded absolute path from one machine; change it
# to the location of your own checkout before running.
%cd /home/harb/Project-AMPA_2822B

In [None]:
# Run `make` through the shell in the current working directory.
# NOTE(review): presumably this builds the native code that needle needs --
# confirm against the project's Makefile.
!make

In [None]:
%set_env PYTHONPATH ./python
%set_env NEEDLE_BACKEND nd

In [None]:
import sys

# PYTHONPATH is only consulted when an interpreter starts, so the already
# running kernel needs ./python added to sys.path explicitly.
# The membership check keeps the cell idempotent: re-running it does not pile
# up duplicate entries.
if './python' not in sys.path:
    sys.path.append('./python')

In [None]:
# Download the PTB dataset

import os
import urllib.request

# Create the target directory in pure Python (portable; no shell needed).
PTB_DIR = './data/ptb'
os.makedirs(PTB_DIR, exist_ok=True)

# Download Penn Treebank dataset
ptb_data = "https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb."
for f in ['train.txt', 'test.txt', 'valid.txt']:
    dest = os.path.join(PTB_DIR, f)
    if os.path.exists(dest):
        # Already downloaded by a previous run.
        continue
    # Fetch into a temporary ".part" file and rename only on success, so an
    # interrupted download never leaves a truncated file that the existence
    # check above would mistake for a complete one.
    tmp = dest + '.part'
    try:
        urllib.request.urlretrieve(ptb_data + f, tmp)
        os.replace(tmp, dest)
    finally:
        # Remove the leftover partial file if the download or rename failed.
        if os.path.exists(tmp):
            os.remove(tmp)

## Transformers

The paper "[Attention Is All You Need](https://arxiv.org/abs/1706.03762)" (Vaswani et al., 2017) introduced the Transformer architecture. Since then, Transformers have become the standard and most performant class of models for language tasks.

![model](https://miro.medium.com/v2/1*ZCFSvkKtppgew3cc7BIaug.png)

The diagram above shows the Transformer architecture from Vaswani et al. (2017). The Transformer in our implementation is nearly identical, except that layer normalization is applied at the start of each residual block (known as the [pre-norm variant](https://arxiv.org/abs/2002.04745) of the Transformer).

We can train a Transformer language model on the Penn Treebank dataset:

In [None]:
# Train a Transformer language model on the PTB corpus, then evaluate it.
import needle as ndl

# ./apps is presumably where `models` and `simple_ml` live; `sys` comes from an
# earlier cell of this notebook.
sys.path.append('./apps')
from models import LanguageModel
from simple_ml import train_ptb, evaluate_ptb

# Hyperparameters shared by several calls below.
SEQ_LEN = 20
BATCH_SIZE = 256

device = ndl.cpu()
corpus = ndl.data.Corpus("data/ptb")
train_data = ndl.data.batchify(
    corpus.train,
    batch_size=BATCH_SIZE,
    device=device,
    dtype="float32",
)

dictionary_size = len(corpus.dictionary)
model = LanguageModel(
    20,  # first positional argument -- see LanguageModel's signature for its meaning
    dictionary_size,
    hidden_size=32,
    num_layers=1,
    seq_model='transformer',
    seq_len=SEQ_LEN,
    device=device,
)

train_ptb(
    model,
    train_data,
    seq_len=SEQ_LEN,
    n_epochs=10,
    device=device,
    lr=0.003,
    optimizer=ndl.optim.Adam,
)

# NOTE(review): evaluation runs on the same data the model was trained on --
# confirm this is intended rather than a held-out split.
evaluate_ptb(model, train_data, seq_len=SEQ_LEN, device=device)