<a href="https://colab.research.google.com/github/akashe/Python-Code-Generation/blob/main/Python_Embeddings_on_CoNaLa_mined_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

import torchtext
from torchtext.legacy import data
from torchtext.legacy.data import Field, BucketIterator

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

import spacy
import numpy as np

import random
import math
import time
import pickle
import json

In [2]:
# Fetch the project repository; its directory is added to sys.path in a later cell.
! git clone https://github.com/akashe/Python-Code-Generation

fatal: destination path 'Python-Code-Generation' already exists and is not an empty directory.


In [3]:
import sys
# Make the cloned repository importable (hard-coded to the /content working directory).
sys.path.append("/content/Python-Code-Generation/")

In [4]:
from data_processing import getTokenizer

In [5]:
# Inspecting the CoNaLa data

In [6]:
# Download the CoNaLa corpus archive; -c continues a partial download instead of starting over.
!wget -c "http://www.phontron.com/download/conala-corpus-v1.1.zip"

--2021-03-12 11:25:40--  http://www.phontron.com/download/conala-corpus-v1.1.zip
Resolving www.phontron.com (www.phontron.com)... 208.113.196.149
Connecting to www.phontron.com (www.phontron.com)|208.113.196.149|:80... connected.
HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable

    The file is already fully retrieved; nothing to do.



In [7]:
# Extract the archive; -n never overwrites files that already exist.
!unzip -n conala-corpus-v1.1.zip

Archive:  conala-corpus-v1.1.zip


In [8]:
# Show the first two records of the mined corpus (one JSON object per line).
!head -2 conala-corpus/conala-mined.jsonl

{"parent_answer_post_id": 34705233, "prob": 0.8690001442846342, "snippet": "sorted(l, key=lambda x: (-int(x[1]), x[0]))", "intent": "Sort a nested list by two elements", "id": "34705205_34705233_0", "question_id": 34705205}
{"parent_answer_post_id": 13905946, "prob": 0.8526701436370034, "snippet": "[int(x) for x in str(num)]", "intent": "converting integer to list in python", "id": "13905936_13905946_0", "question_id": 13905936}


In [9]:
# Show the first 20 lines of the curated training file for comparison with the mined data.
!head -20 conala-corpus/conala-train.json

[
  {
    "intent": "How to convert a list of multiple integers into a single integer?",
    "rewritten_intent": "Concatenate elements of a list 'x' of multiple integers to a single integer",
    "snippet": "sum(d * 10 ** i for i, d in enumerate(x[::-1]))",
    "question_id": 41067960
  },
  {
    "intent": "How to convert a list of multiple integers into a single integer?",
    "rewritten_intent": "convert a list of integers into a single integer",
    "snippet": "r = int(''.join(map(str, x)))",
    "question_id": 41067960
  },
  {
    "intent": "how to convert a datetime string back to datetime object?",
    "rewritten_intent": "convert a DateTime string back to a DateTime object of format '%Y-%m-%d %H:%M:%S.%f'",
    "snippet": "datetime.strptime('2010-11-13 10:33:54.227806', '%Y-%m-%d %H:%M:%S.%f')",
    "question_id": 4170655
  },
  {


In [10]:
# Two separate accumulators: intents go into `questions`, snippets into `answers`.
questions = []
answers = []

In [11]:
# Collect (intent, snippet) pairs from conala-mined whose 'prob' score exceeds `prob`.
# With prob = 0.0 every record with a positive score is kept; raise the threshold
# (e.g. to 0.8) to keep only high-confidence pairs.
prob = 0.0
# Start from empty lists so that re-running this cell does not append duplicates.
questions, answers = [], []
# The context manager closes the file handle once the corpus has been read.
with open("/content/conala-corpus/conala-mined.jsonl") as f:
  for line in f:
    dict_ = json.loads(line)  # each line is one JSON record
    if dict_['prob'] > prob:
      questions.append(dict_['intent'])
      answers.append(dict_['snippet'])


In [12]:
# Report how many mined pairs passed the probability threshold.
print(f'Total examples from mined examples with probability grater than {prob*100}% is {len(questions)}')

Total examples from mined examples with probability grater than 0.0% is 593891


In [13]:
# Sanity check: the two lists are filled in lock-step, so their lengths must match.
assert len(questions)==len(answers)

In [14]:
# Maximum number of tokens a snippet may have; it is compared against the
# tokenized length of each snippet when pruning below.
max_word_len = 301

In [15]:
# Tokenize each snippet and keep only the token lists with at most max_word_len tokens
pruned_answers = [
  tokens
  for tokens in map(getTokenizer, answers)
  if len(tokens) <= max_word_len
]

print(len(pruned_answers))
# From here on `answers` holds token lists rather than raw snippet strings
answers = pruned_answers

590763


In [16]:
# Field describing how a snippet becomes a tensor: <sos>/<eos> markers are added,
# case is preserved (lower = False) and batches are laid out batch-first.
# No tokenizer is supplied because the examples are already token lists.
# NOTE(review): presumably Field leaves pre-tokenized lists untouched — confirm in the torchtext docs.
SRC = Field(tokenize = None, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = False, 
            batch_first = True)

In [17]:
# Single-column schema: every example exposes its token list as `.src`.
fields = [('src', SRC)]

# Wrap each pruned token list in a torchtext Example.
Examples = [data.Example.fromlist([i], fields) for i in answers]

In [18]:
# Spot-check: number of examples and the contents of the first one.
print(len(Examples))
vars(Examples[0])

590763


{'src': ['sorted',
  '(',
  'l',
  ',',
  'key',
  '=',
  'lambda',
  'x',
  ':',
  '(',
  '-',
  'int',
  '(',
  'x',
  '[',
  '1',
  ']',
  ')',
  ',',
  'x',
  '[',
  '0',
  ']',
  ')',
  ')',
  '']}

In [19]:
# Bundle the examples with their field schema.
# NOTE: this rebinds the name `Dataset` to an instance, not a class.
Dataset = data.Dataset(Examples, fields)

In [20]:
# Seed the shuffle so the 90/10 split is the same on every run. The vocabulary is
# built from train_data further down, so an unseeded split would also change the
# token-to-index mapping that gets pickled and that a saved checkpoint relies on.
SPLIT_SEED = 1234
random.seed(SPLIT_SEED)
train_data,valid_data = Dataset.split(split_ratio=[0.90,0.10], random_state=random.getstate())

In [21]:
# Build the vocabulary from the training split only, with a frequency cut-off of 15.
SRC.build_vocab(train_data, min_freq = 15)

In [22]:
# Vocabulary size; it is reused below as the model's output dimension.
len(SRC.vocab)

15018

In [23]:
# Persist both vocabulary mappings (token -> index and index -> token) with pickle.
# NOTE(review): "SRC_stio" looks like a typo for "SRC_stoi"; the path is left as-is
# because other code may load the file under this name — confirm before renaming.
with open("/content/SRC_stio","wb") as f:
  pickle.dump(SRC.vocab.stoi,f)
with open("/content/SRC_itos","wb") as f:
  pickle.dump(SRC.vocab.itos,f)

In [24]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [25]:
BATCH_SIZE = 64

# One iterator per split, with batches created on `device`.
# sort_key is the token count of an example.
# NOTE(review): presumably BucketIterator uses it to batch similarly sized examples — confirm in the torchtext docs.
train_iterator, valid_iterator = BucketIterator.splits(
    (train_data, valid_data), 
     batch_size = BATCH_SIZE,
     sort_key = lambda x: len(x.src),
     device = device)

In [26]:
class PositionalEncodingComponent(nn.Module):
  '''
  Adds fixed sinusoidal positional encodings to token embeddings and applies dropout.

  Args:
    hid_dim: embedding size; must be even because sine and cosine values are interleaved.
    device: stored as `self.device` for backward compatibility. The encoding table is a
      buffer, so it follows the module when the module is moved to another device.
    dropout: dropout probability applied after the encodings are added.
    max_len: number of positions for which encodings are precomputed.
  '''
  def __init__(self,hid_dim,device,dropout=0.2,max_len=5000):
    super().__init__()

    assert hid_dim%2==0 # If not, the assignment to positional_encodings[:,1::2] below fails

    self.dropout = nn.Dropout(dropout)

    positional_encodings = torch.zeros(max_len,hid_dim)

    pos = torch.arange(0,max_len).unsqueeze(1) # pos : [max_len,1]
    # 1/(10000^(2i/hid_dim)), computed in log space and then exponentiated
    div_term  = torch.exp(-torch.arange(0,hid_dim,2)*math.log(10000.0)/hid_dim)
    # div_term: [hid_dim//2]

    positional_encodings[:,0::2] = torch.sin(pos*div_term) # pos*div_term [max_len,hid_dim//2]
    positional_encodings[:,1::2] = torch.cos(pos*div_term)

    # Leading dim of 1 broadcasts over the batch. Registered as a non-persistent buffer:
    # it moves with the module but stays out of state_dict, so checkpoint keys are unchanged.
    self.register_buffer('positional_encodings', positional_encodings.unsqueeze(0), persistent=False)

    self.device = device

  def forward(self,x):
    # x : [batch_size, seq_len, hid_dim]
    # .to(x.device) is a no-op when the module already lives on the input's device
    x = x + self.positional_encodings[:,:x.size(1)].to(x.device)
    return self.dropout(x)

In [27]:
class FeedForwardComponent(nn.Module):
  '''
  Position-wise feed forward block: hid_dim -> pf_dim -> hid_dim,
  with ReLU and dropout applied between the two linear maps.
  '''
  def __init__(self,hid_dim,pf_dim,dropout):
    super().__init__()

    self.dropout = nn.Dropout(p=dropout)

    # expansion followed by projection back to the model width
    self.fc1 = nn.Linear(in_features=hid_dim,out_features=pf_dim)
    self.fc2 = nn.Linear(in_features=pf_dim,out_features=hid_dim)

  def forward(self,x):
    # x : [batch_size,seq_len,hid_dim]
    hidden = self.fc1(x)
    hidden = torch.relu(hidden)
    hidden = self.dropout(hidden)
    # hidden : [batch_size,seq_len,pf_dim]

    # result : [batch_size,seq_len,hid_dim]
    return self.fc2(hidden)

In [28]:
class MultiHeadedAttentionComponent(nn.Module):
  '''
  Multi-headed scaled dot-product attention with optional masking.
  The mask lets a decoder stop the attention mechanism from reading
  information from future (or padded) positions.

  Args:
    hid_dim: model width; must be divisible by n_heads.
    n_heads: number of attention heads.
    dropout: dropout probability applied to the attention weights.
    device: accepted for backward compatibility; not needed any more because
      the scaling factor is a plain Python float.

  forward returns (x, alpha):
    x     : [batch_size, query_len, hid_dim] attended representation
    alpha : [batch_size, n_heads, query_len, key_len] attention weights
  '''
  def __init__(self,hid_dim, n_heads, dropout, device):
    super().__init__()

    assert hid_dim % n_heads == 0 # Since we split hid_dim into n_heads

    self.hid_dim = hid_dim
    self.n_heads = n_heads # no of heads in 'multiheaded' attention
    self.head_dim = hid_dim//n_heads # dims of each head

    # Transformation from source vector to query vector
    self.fc_q = nn.Linear(hid_dim,hid_dim)

    # Transformation from source vector to key vector
    self.fc_k = nn.Linear(hid_dim,hid_dim)

    # Transformation from source vector to value vector
    self.fc_v = nn.Linear(hid_dim,hid_dim)

    # Output projection applied to the concatenated heads
    self.fc_o = nn.Linear(hid_dim,hid_dim)

    self.dropout = nn.Dropout(dropout)

    # Scores are divided by sqrt(head_dim). A float works on any device,
    # unlike a tensor pinned to the device given at construction time.
    self.scale = math.sqrt(self.head_dim)

  def forward(self,query,key,value,mask=None):

    #query : [batch_size, query_len, hid_dim]
    #key : [batch_size, key_len, hid_dim]
    #value : [batch_size, value_len, hid_dim]

    batch_size = query.shape[0]

    # Transforming query, key, values
    Q = self.fc_q(query)
    K = self.fc_k(key)
    V = self.fc_v(value)

    #Q : [batch_size, query_len, hid_dim]
    #K : [batch_size, key_len, hid_dim]
    #V : [batch_size, value_len,hid_dim]

    # Changing shapes to accommodate n_heads information
    Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
    K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
    V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)

    #Q : [batch_size, n_heads, query_len, head_dim]
    #K : [batch_size, n_heads, key_len, head_dim]
    #V : [batch_size, n_heads, value_len, head_dim]

    # Scaled dot-product scores
    score = torch.matmul(Q,K.permute(0,1,3,2))/self.scale
    # score : [batch_size, n_heads, query_len, key_len]

    if mask is not None:
      # Positions where mask == 0 get a very negative score, i.e. ~0 weight after softmax
      score = score.masked_fill(mask==0,-1e10)

    alpha = torch.softmax(score,dim=-1)
    # alpha : [batch_size, n_heads, query_len, key_len]

    # Weighted sum of values; dropout is applied to the weights, not to the returned alpha
    x = torch.matmul(self.dropout(alpha),V)
    # x : [batch_size, n_heads, query_len, head_dim]

    # Move heads next to head_dim so they can be concatenated
    x = x.permute(0,2,1,3).contiguous()
    # x : [batch_size, query_len, n_heads, head_dim]

    x = x.view(batch_size,-1,self.hid_dim)
    # x: [batch_size, query_len, hid_dim]

    # Transforming concatenated outputs
    x = self.fc_o(x)
    #x : [batch_size, query_len, hid_dim]

    return x, alpha

In [29]:
class DecoderLayer(nn.Module):
  '''
  A single layer of the decoder-only stack. The forward pass runs:
  1) masked self attention over the target sequence,
  2) dropout, residual connection and LayerNorm,
  3) the position-wise feed forward block,
  4) dropout, residual connection and LayerNorm.
  No encoder attention is performed; `enc_attn_layer_norm` is created in
  __init__ but is not used by forward.
  '''
  def __init__(self,hid_dim,n_heads,pf_dim,dropout,device):
    super().__init__()

    # normalisation layers (one per sub-block, plus the unused encoder-attention one)
    self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
    self.enc_attn_layer_norm = nn.LayerNorm(hid_dim)
    self.ff_layer_norm = nn.LayerNorm(hid_dim)

    # masked self attention over the target tokens
    self.self_attention = MultiHeadedAttentionComponent(hid_dim=hid_dim,
                                                        n_heads=n_heads,
                                                        dropout=dropout,
                                                        device=device)

    # position-wise feed forward block
    self.feed_forward = FeedForwardComponent(hid_dim=hid_dim,pf_dim=pf_dim,dropout=dropout)

    self.dropout = nn.Dropout(dropout)

  def forward(self,trg,trg_mask):
    # trg      : [batch_size, trg_len, hid_dim]
    # trg_mask : [batch_size, 1, trg_len, trg_len]

    # Self attention. The mask is applied before the softmax, so each position
    # only receives information from itself and earlier positions.
    attended, _ = self.self_attention(query=trg,key=trg,value=trg,mask=trg_mask)

    # residual + dropout, then LayerNorm -> [batch_size, trg_len, hid_dim]
    trg = self.self_attn_layer_norm(trg + self.dropout(attended))

    # feed forward sub-block with the same residual / dropout / LayerNorm pattern
    transformed = self.feed_forward(trg)
    return self.ff_layer_norm(trg + self.dropout(transformed))

In [30]:
class Decoder(nn.Module):
  '''
  Decoder-only stack: embeds tokens, adds positional encodings, runs the result
  through n_layers DecoderLayer blocks and projects to vocabulary logits.

  Args:
    output_dim: vocabulary size (used for both the embedding table and the output layer).
    hid_dim: model width.
    n_layers: number of DecoderLayer blocks.
    n_heads: attention heads per layer.
    pf_dim: inner width of the feed forward block.
    dropout: dropout probability passed to every sub-component.
    device: forwarded to the sub-components and stored as `self.device`.
    max_length: number of positions for which positional encodings are precomputed.
  '''
  def __init__(self,output_dim,hid_dim,n_layers,n_heads,pf_dim,dropout,device,max_length= 350):
    super().__init__()

    self.device = device

    self.tok_embedding = nn.Embedding(output_dim,hid_dim)
    self.pos_embedding = PositionalEncodingComponent(hid_dim,device,dropout,max_length)

    # decoder layers
    self.layers = nn.ModuleList([DecoderLayer(hid_dim,n_heads,pf_dim,dropout,device) for _ in range(n_layers)])

    # convert decoder outputs to vocabulary logits
    self.fc_out = nn.Linear(hid_dim,output_dim)

    # Not used by forward (the positional component applies its own dropout); kept as an attribute
    self.dropout = nn.Dropout(dropout)

    # Embeddings are multiplied by sqrt(hid_dim). A float works on any device,
    # unlike a tensor pinned to the device given at construction time.
    self.scale = math.sqrt(hid_dim)

  def forward(self, trg,trg_mask):

    #trg : [batch_size, trg_len]
    #trg_mask : [batch_size, 1, trg_len, trg_len]

    tok_embeddings = self.tok_embedding(trg)*self.scale

    # token plus pos embeddings
    trg = self.pos_embedding(tok_embeddings)
    # trg : [batch_size, trg_len, hid_dim]

    # Pass trg through the decoder layers
    for layer in self.layers:
      trg= layer(trg,trg_mask)

    # trg : [batch_size,trg_len,hid_dim]

    # Convert to outputs
    output = self.fc_out(trg)
    # output : [batch_size, trg_len, output_dim]

    return output

In [31]:
class TrainEmbeddings(nn.Module):
  '''
  Wraps a decoder for language-model style training: builds the combined
  padding + causal mask for a batch of token ids and runs the decoder with it.

  Args:
    decoder: module called as decoder(trg, trg_mask).
    trg_pad_idx: token id that marks padding.
    device: stored as `self.device` for backward compatibility; masks are
      created on the device of the input batch.
  '''
  def __init__(self, decoder, trg_pad_idx, device):
    super().__init__()
    self.decoder = decoder
    self.trg_pad_idx = trg_pad_idx
    self.device = device

  def make_trg_mask(self,trg):
    '''Return a boolean mask [batch_size, 1, trg_len, trg_len]; True marks positions that may be attended to.'''
    # trg : [batch_size, trg_len]

    # Masking pad values
    trg_pad_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(2)
    # trg_pad_mask : [batch_size,1,1, trg_len]

    # Masking future values. The mask is created on trg's own device so it can
    # always be combined with trg_pad_mask, whatever device was given at construction.
    trg_len = trg.shape[1]
    trg_sub_mask = torch.tril(torch.ones((trg_len,trg_len),device= trg.device)).bool()
    # trg_sub_mask : [trg_len, trg_len]

    # combine both masks
    trg_mask = trg_pad_mask & trg_sub_mask
    # trg_mask = [batch_size,1,trg_len,trg_len]

    return trg_mask

  def forward(self,trg):
    '''Run the decoder on trg with the mask from make_trg_mask and return its output.'''

    # trg : [batch_size, trg_len]

    trg_mask = self.make_trg_mask(trg)

    # trg_mask : [batch_size, 1, trg_len, trg_len]

    output = self.decoder(trg,trg_mask)
    # output : [batch_size, trg_len, output_dim]

    return output

In [32]:
# Model hyper-parameters. Input and output share one vocabulary because the
# model is trained to predict the next token of the same sequence.
INPUT_DIM = len(SRC.vocab)  # defined for reference; not passed to Decoder below
OUTPUT_DIM = len(SRC.vocab)
HID_DIM = 256
DEC_LAYERS = 2
DEC_HEADS = 8
DEC_PF_DIM = 256
DEC_DROPOUT = 0.1

dec = Decoder(OUTPUT_DIM, 
              HID_DIM, 
              DEC_LAYERS, 
              DEC_HEADS, 
              DEC_PF_DIM, 
              DEC_DROPOUT, 
              device)

# Index of the padding token; used for the attention mask and ignored by the loss.
TRG_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]

model = TrainEmbeddings( dec, TRG_PAD_IDX, device).to(device)

In [33]:
def initialize_weights(m):
    '''
    Xavier-uniform initialise the `weight` of module `m` in place when it has
    more than one dimension (e.g. Linear and Embedding weights).

    Modules without a `weight` attribute, with `weight` set to None (e.g.
    normalisation layers created without affine parameters) or with a
    1-D weight are left untouched. Intended for use with `model.apply`.
    '''
    weight = getattr(m, 'weight', None)
    if weight is not None and weight.dim() > 1:
        nn.init.xavier_uniform_(weight.data)

# Apply the initialiser to every submodule; the trailing ';' suppresses the cell output.
model.apply(initialize_weights);

In [34]:
# # Loading trained model
# trained_model = 'python_embedding.pt'
# model.load_state_dict(torch.load(trained_model))

In [35]:
def count_parameters(model):
    '''Return the total number of elements across the parameters of `model` that require gradients.'''
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the trainable parameter count with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 8,496,810 trainable parameters


In [36]:
LEARNING_RATE = 0.0005

# Adam over all model parameters.
optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)

# Cross-entropy over the vocabulary; targets equal to the padding index do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [37]:
def train(model, iterator, optimizer, criterion, clip):
    '''
    Run one training epoch of next-token prediction.

    Each batch's `src` tensor is both the input and, shifted left by one
    position, the target. Gradients are clipped to a maximum norm of `clip`
    before every optimizer step. Returns the mean of the per-batch losses.
    '''
    model.train()

    running_loss = 0

    for batch in iterator:
        tokens = batch.src

        optimizer.zero_grad()

        logits = model(tokens)
        # logits : [batch size, trg len, output dim]
        # tokens : [batch size, trg len]

        vocab_size = logits.shape[-1]

        # The prediction at position t is scored against the token at position t+1,
        # so drop the last prediction and the first token before flattening.
        predictions = logits[:, :-1, :].contiguous().view(-1, vocab_size)
        targets = tokens[:, 1:].contiguous().view(-1)
        # predictions : [batch size * (trg len - 1), output dim]
        # targets     : [batch size * (trg len - 1)]

        batch_loss = criterion(predictions, targets)
        batch_loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += batch_loss.item()

    return running_loss / len(iterator)

In [38]:
def evaluate(model, iterator, criterion):
    '''
    Compute the mean per-batch next-token loss over `iterator` without updating the model.

    The model is put in eval mode and gradient tracking is disabled. Each batch's
    `src` tensor is both the input and, shifted left by one position, the target.
    '''
    model.eval()

    running_loss = 0

    with torch.no_grad():
        for batch in iterator:
            tokens = batch.src

            logits = model(tokens)
            # logits : [batch size, trg len, output dim]
            # tokens : [batch size, trg len]

            vocab_size = logits.shape[-1]

            # Align prediction t with token t+1, then flatten for the loss
            predictions = logits[:, :-1, :].contiguous().view(-1, vocab_size)
            targets = tokens[:, 1:].contiguous().view(-1)
            # predictions : [batch size * (trg len - 1), output dim]
            # targets     : [batch size * (trg len - 1)]

            running_loss += criterion(predictions, targets).item()

    return running_loss / len(iterator)

In [39]:
def epoch_time(start_time, end_time):
    '''Split the span between two timestamps (in seconds) into whole minutes and leftover whole seconds.'''
    elapsed = end_time - start_time
    minutes = int(elapsed / 60)
    seconds = int(elapsed - (minutes * 60))
    return (minutes, seconds)

In [None]:
# Train for N_EPOCHS, validating after every epoch and keeping the weights of
# the epoch with the lowest validation loss.
N_EPOCHS = 10
CLIP = 1  # maximum gradient norm handed to train()

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Checkpoint only when the validation loss improves
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'python_embedding.pt')
    
    # PPL (perplexity) is exp(loss)
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 28m 58s
	Train Loss: 2.216 | Train PPL:   9.173
	 Val. Loss: 1.825 |  Val. PPL:   6.203
Epoch: 02 | Time: 28m 59s
	Train Loss: 1.776 | Train PPL:   5.907
	 Val. Loss: 1.722 |  Val. PPL:   5.595
Epoch: 03 | Time: 29m 0s
	Train Loss: 1.668 | Train PPL:   5.299
	 Val. Loss: 1.676 |  Val. PPL:   5.342
Epoch: 04 | Time: 29m 2s
	Train Loss: 1.611 | Train PPL:   5.008
	 Val. Loss: 1.646 |  Val. PPL:   5.186
