In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup

from sklearn.model_selection import train_test_split
from scipy.stats import pearsonr
torch.set_printoptions(4)
from torch import optim
import random
import time

In [2]:
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

#model = BertModel.from_pretrained('..\..\Dump Files\models\BERT_cased_multi')

In [3]:
import io
import os

def load_data(set_name):
    """
    set name: "train", "dev", "test"
    """

# Load train data into variables
    with open(os.path.join("..", "data", "{}.ende.src".format(set_name)), "r", encoding="utf8") as ende_src:
        en_set = ende_src.read().split('\n')
    with open(os.path.join("..", "data", "{}.ende.mt".format(set_name)), "r", encoding="utf8") as ende_mt:
        de_set = ende_mt.read().split('\n')
    

    del en_set[len(en_set)-1]
    del de_set[len(de_set)-1]

    
    return en_set, de_set

def load_scores(set_name):
    
    with open(os.path.join("..", "data", "{}.ende.scores".format(set_name)), "r", encoding="utf8") as ende_scores:
        scores = [float(x) for x in ende_scores.read().split('\n')[:-1]]
        
    #del scores[len(scores)-1]
    print(len(scores))
    
    return scores

In [4]:
en_train, de_train = load_data("train")
scores_train = load_scores("train")

en_val, de_val = load_data("dev")
scores_val = load_scores("dev")

en_test, de_test = load_data("test")

7000
1000


In [5]:
def preprocess(X, tokenizer):
    inputs = []
    max_len_inps = 0

    # Tokenize
    for i in range(len(X)-1):
        seq = X[i][:-1]
        # Add special tokens takes care of adding [CLS], [SEP], <s>... tokens in the right way for each model.
        input_ids = torch.tensor([tokenizer.encode(seq, add_special_tokens=True)])  
        inputs.append(input_ids)
        if input_ids.shape[-1] > max_len_inps:
            max_len_inps = input_ids.shape[-1]
            
    # Convert to tensor
    inp_tensor = torch.zeros((len(X), max_len_inps))
    
    for i in range(len(inputs)):
    # Add tokens
        tokens = inputs[i].squeeze()
        inp_tensor[i, : len(tokens)] = tokens
        
    print(inp_tensor.shape)
    return inp_tensor

In [6]:
en_inputs = preprocess(en_train, tokenizer)
de_inputs = preprocess(de_train, tokenizer)

torch.Size([7000, 65])
torch.Size([7000, 69])


In [7]:
en_inputs_new = torch.zeros(de_inputs.shape)
en_inputs_new[:,:65] = en_inputs
en_inputs_new.shape

torch.Size([7000, 69])

In [8]:
en_val_inputs = preprocess(en_val, tokenizer)
de_val_inputs = preprocess(de_val, tokenizer)

en_val_inputs_new = torch.zeros((1000,de_inputs.shape[1]))
en_val_inputs_new[:,:48] = en_val_inputs
print(en_val_inputs_new.shape)

de_val_inputs_new = torch.zeros((1000, de_inputs.shape[1]))
de_val_inputs_new[:,:54] = de_val_inputs
de_val_inputs_new.shape

torch.Size([1000, 48])
torch.Size([1000, 54])
torch.Size([1000, 69])


torch.Size([1000, 69])

In [9]:
def attention_mask(list_input_tensor):
    list_attention_masks = []
    for input_tensor in list_input_tensor:
        attention_mask = torch.zeros((input_tensor.shape))
        attention_mask[input_tensor != 0] = 1
        list_attention_masks.append(attention_mask)

    return list_attention_masks

def get_batches(inp_tensor, scores, BATCH_N):
    inp_tensor = inp_tensor
    scores = torch.tensor(scores).view(-1,1)

    # Split data into train, test and val batches
    inp_tensor_batches = torch.split(inp_tensor, BATCH_N)
    scores_batches = torch.split(scores, BATCH_N)
#     X_train_val, X_test, y_train_val, y_test = train_test_split(inp_tensor_batches, scores_batches, shuffle=False, test_size=0.1)
#     X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, shuffle = False, test_size=1/9)
    
    
    #print(len(inp_tensor_batches), inp_tensor_batches[0].shape)

    # Create attention masks
    X_mask = attention_mask(inp_tensor_batches)
    #print(len(X_mask), X_mask[0].shape)

    # Batch X, mask and y together
    batches = [(X, mask, y) for X,mask,y in zip(inp_tensor_batches, X_mask, scores_batches)]


    return batches


In [10]:
def get_sentence_embeddings(batches):
    with torch.no_grad():
        model = model = BertModel.from_pretrained('..\..\Dump Files\models\BERT_cased_multi').to(device='cuda')
        #inp_tensor = inp_tensor.to(device='cuda', dtype=torch.long)


        #batches = torch.split(inp_tensor, 250, dim=0)
        list_bert_embs = []
        for X in batches:
            with torch.no_grad():
                last_hidden_states = model(X[0].type(torch.LongTensor).to('cuda'), X[1].to('cuda'))[0]    # <-- take word embeddings ([1] gives sentence embeddings)

            list_bert_embs.append(last_hidden_states)

            #print(last_hidden_states.shape)

            torch.cuda.empty_cache()

        bert_embs = torch.cat(list_bert_embs, dim=0)

        # Now, slice the tensor to only keep the [CLS] embedding

        print(bert_embs.shape)
    
    return bert_embs

In [11]:
USE_GPU = True

if USE_GPU and torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

#bert_config = 'bert-base-multilingual-cased'
BATCH_N = 25
EPOCHS = 20
LR = 2e-6

# Set seeds
seed_val = 111
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Preprocessing


batches_en_train = get_batches(en_inputs_new, scores_train, BATCH_N)
batches_de_train = get_batches(de_inputs, scores_train, BATCH_N)

batches_en_val = get_batches(en_val_inputs_new, scores_val, BATCH_N)
batches_de_val = get_batches(de_val_inputs_new, scores_val, BATCH_N)

In [12]:
embs_en_train = get_sentence_embeddings(batches_en_train).to('cpu')

torch.Size([7000, 69, 768])


In [13]:
embs_de_train = get_sentence_embeddings(batches_de_train).to('cpu')

torch.Size([7000, 69, 768])


In [14]:
embs_en_val = get_sentence_embeddings(batches_en_val).to('cpu')
embs_de_val = get_sentence_embeddings(batches_de_val).to('cpu')

torch.Size([1000, 69, 768])
torch.Size([1000, 69, 768])


In [15]:
embs_en_train = embs_en_train.view(7000,1,69,768)
embs_de_train = embs_de_train.view(7000,1,69,768)
train_dataset = torch.cat((embs_en_train,embs_de_train), 1)
print(train_dataset.shape)

embs_en_val = embs_en_val.view(1000,1,69,768)
embs_de_val = embs_de_val.view(1000,1,69,768)
val_dataset = torch.cat((embs_en_val,embs_de_val), 1)
print(val_dataset.shape)

torch.Size([7000, 2, 69, 768])
torch.Size([1000, 2, 69, 768])


In [16]:
# convert scores to tensor
torch_scores_train = torch.tensor(scores_train).view(-1,1)
torch_scores_val = torch.tensor(scores_val).view(-1,1)

In [17]:
# Make final dataset
train = (train_dataset.type(torch.LongTensor), torch_scores_train.type(torch.LongTensor))
val = (val_dataset.type(torch.LongTensor), torch_scores_val.type(torch.LongTensor))

In [18]:
train_dataset.dtype

torch.float32

In [19]:
train_data_batches = torch.split(train_dataset, BATCH_N)
train_score_batches = torch.split(torch_scores_train, BATCH_N)
train_batches = [(X, y) for X,y in zip(train_data_batches, train_score_batches)]

In [20]:
val_data_batches = torch.split(val_dataset, BATCH_N)
val_score_batches = torch.split(torch_scores_val, BATCH_N)
val_batches = [(X, y) for X,y in zip(val_data_batches, val_score_batches)]

In [21]:
train_batches[0][1].shape

torch.Size([25, 1])

In [22]:
#del val_data_batches, val_score_batches, train_data_batches, train_score_batches, train, val, embs_en_train, embs_de_train, embs_en_val, embs_de_val

In [44]:
# define resnet building blocks

class ResidualBlock(nn.Module): 
    def __init__(self, inchannel, outchannel, stride=1): 
        
        super(ResidualBlock, self).__init__() 
        
        self.left = nn.Sequential(nn.Conv2d(inchannel, outchannel, kernel_size=3, 
                                         stride=stride, padding=1, bias=False), 
                                  nn.BatchNorm2d(outchannel), 
                                  nn.ReLU(inplace=True), 
                                  nn.Conv2d(outchannel, outchannel, kernel_size=3, 
                                         stride=1, padding=1, bias=False), 
                                  nn.BatchNorm2d(outchannel)) 
        
        self.shortcut = nn.Sequential() 
        
        if stride != 1 or inchannel != outchannel: 
            
            self.shortcut = nn.Sequential(nn.Conv2d(inchannel, outchannel, 
                                                 kernel_size=1, stride=stride, 
                                                 padding = 0, bias=False), 
                                          nn.BatchNorm2d(outchannel) ) 
            
    def forward(self, x): 
        
        out = self.left(x) 
        
        out += self.shortcut(x) 
        
        out = F.relu(out) 
        
        return out


    
    # define resnet

class ResNet(nn.Module):
    
    def __init__(self, ResidualBlock, num_classes = 10):
        
        super(ResNet, self).__init__()
        
        self.inchannel = 64
        #self.bn1 = self.BatchNorm2d(2)
        self.conv1 = nn.Sequential(nn.Conv2d(2, 64, kernel_size = (5,768), stride = 1, # changed 3 in channels to 2
                                            padding = 1, bias = False), 
                                  nn.BatchNorm2d(64), 
                                  nn.ReLU())
        
        self.layer1 = self.make_layer(ResidualBlock, 64, 2, stride = 1)
        self.layer2 = self.make_layer(ResidualBlock, 128, 2, stride = 2)
        self.layer3 = self.make_layer(ResidualBlock, 256, 2, stride = 2)
        self.layer4 = self.make_layer(ResidualBlock, 512, 2, stride = 2)
        self.maxpool = nn.MaxPool2d(4)
        self.fc = nn.Linear(4608, 1)
        
        
        
    
    def make_layer(self, block, channels, num_blocks, stride):
        
        strides = [stride] + [1] * (num_blocks - 1)
        
        layers = []
        
        for stride in strides:
            
            layers.append(block(self.inchannel, channels, stride))
            
            self.inchannel = channels
            
        return nn.Sequential(*layers)
    
    
    def forward(self, x):
        #print(x.shape)
        x = self.conv1(x)
        #print(x.shape)
        x = self.layer1(x)
        #print(x.shape)
        x = self.layer2(x)
        #print(x.shape)
        x = self.layer3(x)
        #print(x.shape)
        x = self.layer4(x)
        #print(x.shape)
        #x = self.maxpool(x)
        #print(x.shape)
        x = x.view(x.size(0), -1)
        
        #print(x.shape)
        
        x = self.fc(x)
        
        return x
    
    def loss(self, scores, pred_scores):
        rmse = (((pred_scores - scores)**2).mean())**0.5
        return rmse

    def check_r(self, y_pred, y):
        return pearsonr(y_pred.cpu().squeeze(), y.cpu().squeeze())[0]
    
    
def ResNet18():
    return ResNet(ResidualBlock)

### Train ResNet

In [45]:
batch_size = 64

# Manual seeds for reproducibility
torch.manual_seed(0)
np.random.seed(0)

# Define two train sets: one with normal images and the other with
# images horizontally flipped


# Train dataloader
# Concatenate two datasets two increase the size of training set artificially by 2
# This data augmentation is permitted as per Piazza @47
# loader_train = torch.utils.data.DataLoader(
#     train, 
#     batch_size=batch_size, shuffle=True, num_workers=10
#                                            )
# # Test dataloader
# loader_val = torch.utils.data.DataLoader(val, batch_size=batch_size,
#                                          shuffle=False, num_workers=10)



In [55]:
def validation(model, val_batches):

    print("")
    print("Running Validation...")

    t0 = time.time()
    eval_loss = 0
    correlations = []

    for step, batch in enumerate(val_batches):        

        # Untie batch and put on GPU
#         X = batch[0].to(device=device, dtype=torch.long)
#         mask = batch[1].to(device)
#         y = batch[2].to(device)
        X = batch[0].to('cuda')
        y = batch[1].to('cuda')

        with torch.no_grad():        
            y_pred = model(X)                      
                
        # Compute and record batch accuracy
        corr = model.check_r(y_pred, y)
        correlations.append(corr)

    # Report the final accuracy for this validation run.
    print(f"|  Correlation: {np.mean(correlations):.2f}     |")
    print(f"|  Validation took: {time.time() - t0:0f} s |")

    torch.cuda.empty_cache()


def train(model, train_batches, val_batches, epochs):

    losses = []
    
    for epoch_i in range(epochs):
        
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')

        t0 = time.time()
        total_loss = 0

        for step, batch in enumerate(train_batches):

#             # Untie batch and put on GPU
#             X = X.to(device=device)
#             #mask = batch[1].to(device)
#             y = y.to(device)
            
            X = batch[0].to('cuda')
            y = batch[1].to('cuda')
    
            model.zero_grad()                   # Reset grads
            y_pred = model(X)             # Forward pass
            loss = model.loss(y, y_pred)        # Compute loss
            total_loss += loss.item()           # Accumulate loss
            loss.backward()                     # Backward pass

            optimizer.step()                    # Update params

            if step % 140 == 0:
                # Progress update every 40 batches                
                print('  Batch {:>5,}  of  {:>5,}    |    Elapsed: {:.0f}s.'.format(step, len(train_batches), time.time() - t0))
                print(f'  Loss = {loss.item():.2f}')

            torch.cuda.empty_cache()            # Clear GPU cache to avoid memory issues

        # Compute and store avg loss
        avg_train_loss = total_loss / len(X)
        losses.append(avg_train_loss)

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epoch took: {:.0f}s".format(time.time() - t0))

        validation(model, val_batches) 

    print("")
    print("Training complete!")


def writeScores(method_name, scores):
    fn = "predictions.txt"
    print("")
    with open(fn, 'w') as output_file:
        for idx,x in enumerate(scores):
            out =  metrics[idx]+":"+str("{0:.2f}".format(x))+"\n"
            print(out)
            # output_file.write(f"{x}\n")


def get_test_preds(model, en_test, de_test):
    torch.cuda.empty_cache()
    # Preprocessing
    preprocessor = Preprocessor(device, bert_config)
    input_tensor = preprocessor.get_input_tensor(en_test, de_test)
    batches = preprocessor.get_test_batches(input_tensor, BATCH_N=32)
    # mask = torch.cat(preprocessor.attention_mask(input_tensor), dim=0).view(input_tensor.shape).to(device).long()
    # print(mask.shape)
    # print(input_tensor.shape)
    y_preds = []
    for batch in batches:
        # Untie batch and put on GPU
        X = batch[0].to(device=device, dtype=torch.long)
        mask = batch[1].to(device)

        model.zero_grad()                   # Reset grads
        y_pred = model(X, mask)             # Forward pass

        y_preds.append(y_pred)

    print(y_preds)
    # writeScores(y_pred)
    # torch.cuda.empty_cache()

In [56]:
resnet=ResNet18().to('cuda')
optimizer = optim.Adam(resnet.parameters())

In [None]:
train(resnet, train_batches, val_batches, 60)


Training...
  Batch     0  of    280    |    Elapsed: 0s.
  Loss = 1.42
  Batch   140  of    280    |    Elapsed: 5s.
  Loss = 0.96

  Average training loss: 11.25
  Training epoch took: 10s

Running Validation...
|  Correlation: -0.06     |
|  Validation took: 0.327296 s |

Training...
  Batch     0  of    280    |    Elapsed: 0s.
  Loss = 1.56
  Batch   140  of    280    |    Elapsed: 5s.
  Loss = 0.65

  Average training loss: 9.95
  Training epoch took: 10s

Running Validation...
|  Correlation: 0.07     |
|  Validation took: 0.325295 s |

Training...
  Batch     0  of    280    |    Elapsed: 0s.
  Loss = 1.51
  Batch   140  of    280    |    Elapsed: 5s.
  Loss = 0.88

  Average training loss: 9.49
  Training epoch took: 10s

Running Validation...
|  Correlation: -0.01     |
|  Validation took: 0.326296 s |

Training...
  Batch     0  of    280    |    Elapsed: 0s.
  Loss = 1.69
  Batch   140  of    280    |    Elapsed: 5s.
  Loss = 0.69

  Average training loss: 9.63
  Training 

In [37]:
#next(iter(loader_train))
torch.cuda.empty_cache()

In [68]:
torch.cuda.empty_cache() 

In [69]:
del model

NameError: name 'model' is not defined