In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# reference link https://github.com/nidhikowtal/BE_Project/blob/3454ce196b7350166cca9722768100992e7031b9/ml_models/code2pseudocode.ipynb#L4
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pickle

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
# (Prints nothing when '/kaggle/input' does not exist.)
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# pip install datasets

In [3]:
from datasets import load_dataset

# Load the "AhmedSSoliman/DJANGO" dataset by its repository id.
# NOTE(review): on a fresh kernel this presumably needs network access or a warm local cache.
dataset = load_dataset("AhmedSSoliman/DJANGO")

Repo card metadata block was not found. Setting CardData to empty.


In [4]:
# Load the pickled question-side source object.
# NOTE(review): pickle.load can execute arbitrary code; only open files from a trusted source.
with open('../../data/q_src_df.pkl', 'rb') as q_file:
    q_src_df = pickle.load(q_file)

# Load the pickled answer-side source object.
with open('../../data/a_src_df.pkl', 'rb') as a_file:
    a_src_df = pickle.load(a_file)


In [5]:
# pip install transformers

In [6]:
# pip install torch

In [7]:
import transformers
import torch

In [8]:
# Drop the 'Unnamed: 0' column and expose the 'nl' column under the name 'pseudocode'.
# NOTE(review): not idempotent - re-running this cell after it has succeeded will presumably
# fail because the original column names no longer exist.
dataset = dataset.remove_columns('Unnamed: 0').rename_column('nl', 'pseudocode')

In [9]:
# Inspect the splits and the remaining columns after the cleanup above.
dataset

DatasetDict({
    train: Dataset({
        features: ['pseudocode', 'code'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['pseudocode', 'code'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['pseudocode', 'code'],
        num_rows: 1805
    })
})

In [10]:
from transformers import RobertaTokenizer
# Tokenizer for the 'Salesforce/codet5-base' checkpoint; preprocess_data below reads it as a global.
tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-base')

def preprocess_data(examples, max_length=64):
    """Tokenize a batch of (code, pseudocode) pairs for code -> pseudocode training.

    Args:
        examples: batch mapping with a "code" and a "pseudocode" entry, each a
            list of strings (the shape `Dataset.map(..., batched=True)` passes in).
        max_length: length that both inputs and labels are padded / truncated to.
            Defaults to 64, the value previously hard-coded in this function.

    Returns:
        The tokenizer output for the prefixed code, extended with a "labels"
        entry holding the pseudocode token ids, where padding positions are
        replaced by -100.
    """
    prefix = "Generate Pseudocode: "
    inputs = [prefix + snippet for snippet in examples["code"]]
    model_inputs = tokenizer(inputs, max_length=max_length, padding="max_length", truncation=True)
    label_ids = tokenizer(
        examples["pseudocode"], max_length=max_length, padding="max_length", truncation=True
    ).input_ids

    # Replace padding token ids with -100 so that they are not taken into account by the
    # loss function. The pad id is read from the tokenizer instead of assuming it is 0
    # (the value this function used to hard-code), so a different tokenizer cannot
    # silently mask the wrong token.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token if token != pad_id else -100 for token in sequence]
        for sequence in label_ids
    ]

    return model_inputs
    

# Tokenize every split; batched=True hands preprocess_data a batch of rows per call.
tokenized_dataset = dataset.map(preprocess_data, batched=True)

In [11]:
# Switch the dataset's output format to 'torch'; the DataLoader peek further down
# shows batches arriving as torch.Tensor values.
tokenized_dataset.set_format('torch')

In [12]:
# Inspect the columns added by tokenization (input_ids, attention_mask, labels).
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['pseudocode', 'code', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['pseudocode', 'code', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['pseudocode', 'code', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1805
    })
})

In [13]:
# Drop the raw text columns so only input_ids / attention_mask / labels remain.
tokenized_dataset = tokenized_dataset.remove_columns(["pseudocode", "code"])
# Disabled subsampling (2000 train / 200 validation rows):
# small_train_dataset = tokenized_dataset['train'].shuffle(seed=42).select(range(2000))
# small_validation_dataset = tokenized_dataset['validation'].shuffle(seed=42).select(range(200))
# NOTE: despite the "small_" prefix, these names now hold the full train / validation splits.
small_train_dataset = tokenized_dataset['train']
small_validation_dataset = tokenized_dataset['validation']

In [14]:
# Confirm the training split's columns and row count.
small_train_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 16000
})

In [15]:
# Create the torch dataloaders.
from torch.utils.data import DataLoader

# Training batches are reshuffled on every pass over the data.
train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=32)
# Evaluation keeps a fixed order. eval_loss averages per-batch losses and the last
# batch is smaller than the rest, so shuffling would change which rows land in it
# and make the reported loss vary from call to call on the same weights.
eval_dataloader = DataLoader(small_validation_dataset, shuffle=False, batch_size=32)

In [16]:
from transformers import T5ForConditionalGeneration
# Pretrained weights from the same 'Salesforce/codet5-base' checkpoint the tokenizer was loaded from.
model = T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-base')

In [17]:
import math
from torch.optim import AdamW
from transformers import get_scheduler

# Use PyTorch's AdamW: the AdamW class exported by transformers is deprecated in
# favour of torch.optim.AdamW. lr, weight_decay and eps are all passed explicitly,
# so the differing library defaults do not matter here.
optimizer = AdamW(
    model.parameters(),
    lr=1e-4,
    weight_decay=1e-4,
    eps=1e-8,
)

num_epochs = 5
# One scheduler step per optimizer step, i.e. per training batch.
num_training_steps = num_epochs * len(train_dataloader)
# Fraction of all steps spent ramping the learning rate up before the linear decay.
warmup_ratio = 0.2
num_warmup_steps = math.ceil(num_training_steps * warmup_ratio)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)



In [18]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Move the model to that device; as the cell's last expression this also echoes the module repr.
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32100, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32100, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [19]:
# Sanity check: print the Python type of the first field of the first training batch.
for first_batch in train_dataloader:
    for _field, value in first_batch.items():
        print(type(value))
        break
    break

<class 'torch.Tensor'>


In [20]:
def eval_loss(model):
    """Return the mean per-batch loss of `model` over `eval_dataloader`.

    Puts the model in eval mode (and leaves it there), moves every batch to the
    global `device`, and runs the forward passes without gradient tracking.
    The result is the sum of the batch losses divided by the number of batches.
    """
    model.eval()
    running_total = 0
    with torch.no_grad():
        for raw_batch in eval_dataloader:
            on_device = {name: tensor.to(device) for name, tensor in raw_batch.items()}
            running_total += model(**on_device).loss
    return running_total / len(eval_dataloader)
    

In [21]:
def checkpoint(model, filename):
    """Serialize the model's state_dict to `filename` with torch.save."""
    weights = model.state_dict()
    torch.save(weights, filename)
    
def resume(model, filename, map_location="cpu"):
    """Load the state_dict stored at `filename` into `model` in place.

    Args:
        model: module whose parameters are overwritten.
        filename: path of a file written by `checkpoint` / `torch.save(state_dict)`.
        map_location: forwarded to `torch.load`. The default "cpu" lets a
            checkpoint saved on a GPU machine be restored on a CPU-only host;
            `load_state_dict` then copies the values into the model's existing
            parameters, so the model stays on whatever device it is already on.
    """
    state = torch.load(filename, map_location=map_location)
    model.load_state_dict(state)

In [22]:
from tqdm.auto import tqdm

# One tick per optimizer step across all epochs.
# NOTE(review): the training loops below are commented out, so this bar is created
# but never advanced.
progress_bar = tqdm(range(num_training_steps))

train_loss = []

# Early-stopping bookkeeping used by the (currently disabled) training loop below.
early_stop_threshold = 4
# Start from +inf rather than an arbitrary 100 so the first evaluated epoch always
# counts as the best so far, however large its loss is.
min_loss = float("inf")
best_epoch = 0

# for epoch in range(1, num_epochs+1):
#     model.train()
#     print(f"EPOCH {epoch}")
#     for batch in train_dataloader:
#         batch = {k: v.to(device) for k, v in batch.items()}
#         outputs = model(**batch)
#         loss = outputs.loss
#         print(f"training loss: {loss}")
#         train_loss.append(loss)
#         loss.backward()

#         optimizer.step()
#         lr_scheduler.step()
#         optimizer.zero_grad()
#         progress_bar.update(1)
#     current_eval_loss = eval_loss(model)
#     print(type(current_eval_loss))
#     print(type(min_loss))
#     print("Eval loss: ", current_eval_loss)
#     if(current_eval_loss < min_loss):
#         min_loss = current_eval_loss
#         best_epoch = epoch
#         checkpoint(model, 'best_model.pth')
#     elif epoch - best_epoch > early_stop_threshold:
#         print("Early stopped training at epoch %d" % epoch)
#         break 
        
# resume(model, "best_model.pth")    
# model.save_pretrained('runs/saved_model/')
# tokenizer.save_pretrained('runs/saved_model/')

# for epoch in range(1, num_epochs+1):
#     model.train()
#     train_loss = 0
#     print(f"EPOCH {epoch}")
#     for batch in train_dataloader:
#         batch = {k: v.to(device) for k, v in batch.items()}
#         outputs = model(**batch)
#         loss = outputs.loss
#         loss.backward()
#         train_loss += loss
#         optimizer.step()
#         lr_scheduler.step()
#         optimizer.zero_grad()
#         progress_bar.update(1)
#     e_loss = eval_loss(model)
#     print("Train Loss: ", train_loss / len(train_dataloader))
#     print("Eval loss: ", e_loss)
# #     train_losses.append(train_loss / len(train_dataloader))
# #     eval_losses.append(e_loss)
#     if(epoch % 5 == 0):
#         checkpoint(model, f"ckpt_ep-{epoch}.pt")    
# checkpoint(model, "saved_model.pt")

  0%|          | 0/2500 [00:00<?, ?it/s]

In [23]:
# torch.save(model.state_dict(), "code2pcfull.pt")
# map_location="cpu" lets a checkpoint written on a GPU machine load on a CPU-only host;
# load_state_dict copies the values into the model's existing parameters, so the model
# stays on the device it is already on.
model.load_state_dict(torch.load("saved_model.pt", map_location="cpu"))

<All keys matched successfully>

In [24]:
# 400-row sample of the raw (untokenized) test split, shuffled with a fixed seed of 42.
small_test_dataset = dataset['test'].shuffle(seed=42).select(range(400))
# small_test_dataset = dataset['test']
# Batches of 8 raw rows; shuffle=True means the batch printed in the next cell can differ between runs.
test_dataloader = DataLoader(small_test_dataset, shuffle=True, batch_size=8)

In [25]:
# Print only the first batch to see what the test loader yields.
for sample_batch in test_dataloader:
    print(sample_batch)
    break

{'pseudocode': ["TRANSLATOR_COMMENT_MARK is a string 'Translators'.", 'derive the class NowNode from Node base class.', 'call the method self.appendlist with 2 arguments: return value of the function force_text with 3 arguments:', 'save_as_new set to boolean False, prefix set to None, queryset set to None and unpacked dictionary kwargs. if instance is None,', 'break from the loop execution,', 'set cache to the value of the dictionary caches under the key settings.CACHE_MIDDLEWARE_ALIAS.', "get the value under the 'indent' key of the options dictionary, substitute the result for indent.", 'call the function resolver.resolve_error_handler with status_code as an argument, assign the result to the callback and param_dict, respectively.'], 'code': ["TRANSLATOR_COMMENT_MARK = 'Translators'", 'class NowNode ( Node ) :', "self . appendlist ( force_text ( key , encoding , errors = 'replace' ) , force_text ( value , encoding , errors = 'replace' ) )", 'if instance is None :', 'break', 'cache = c

In [21]:
# pip install evaluate

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [26]:
import evaluate

# The unused `from datasets import load_metric` import was dropped: nothing in this
# cell calls it, and newer `datasets` releases no longer provide it, so the import
# alone could abort the cell.
metric = evaluate.load("bleu")
model.eval()
# Generate on whatever device the model currently lives on, so the cell works
# whether or not the model was moved to a GPU.
model_device = model.device
pred = []
ref = []
for example in small_test_dataset:
    # Same task prefix that preprocess_data put in front of the training inputs.
    prompt = 'Generate Pseudocode: ' + example['code']
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model_device)
    generated_ids = model.generate(input_ids, max_length=64)
    pred.append(tokenizer.decode(generated_ids[0], skip_special_tokens=True))
    # Each prediction is scored against a list of references; here there is exactly one.
    ref.append([example['pseudocode']])
results = metric.compute(predictions=pred, references=ref)

In [27]:
# Show the BLEU summary returned by metric.compute (overall score, n-gram precisions, length statistics).
print(results)

{'bleu': 0.7433723002343658, 'precisions': [0.904037927088442, 0.818086408955746, 0.7566296783900696, 0.7019289340101523], 'brevity_penalty': 0.9390004229441141, 'length_ratio': 0.9407874500153799, 'translation_length': 6117, 'reference_length': 6502}


In [28]:
# Sample input: a small Fibonacci script, stored one source line per list element.
fibonacci_code = [
    'nterms = int(input("How many terms? "))',
    'n1, n2 = 0, 1',
    'count = 0',
    'if nterms <= 0:',
    'print("Please enter a positive integer")',
    'elif nterms == 1:',
    'print("Fibonacci sequence upto",nterms,":")',
    'print(n1)',
    'else:',
    'print("Fibonacci sequence:")',
    'while count < nterms:',
    'print(n1)',
    'nth = n1 + n2',
    '# update values',
    'n1 = n2',
    'n2 = nth',
    'count += 1',
]

In [29]:
# Preview the question-side frame (columns shown below: q_id, q_prep_text).
q_src_df

Unnamed: 0,q_id,q_prep_text
1,77593717,"[import hashlib, , user_hash_dict = {}, , with..."
2,77591118,"[London:Alpha, London, London:Beta, London:Del..."
2,77591118,"[London_sub:Alpha, London_sub, London_sub:Beta..."
2,77591118,"[names_df[0] = names_df[0] \, .str.split(':') ..."
2,77591118,"[L:o:n:d:o:n:_:s:u:b, ]"
...,...,...
2579,77567490,"[class GunicornApplication(BaseApplication):, ..."
2579,77567490,"[if __name__ == '__main__':, options = {'bind'..."
2579,77567490,[&gt;&gt;&gt; [ERROR] Worker (pid:10517) was s...
2579,77567490,[&gt;&gt;&gt; requests.exceptions.ConnectionEr...


In [39]:
def code2pseudo(src):
    """Translate each code line in `src` to pseudocode and return the pieces concatenated.

    Uses the global `tokenizer` and `model`. Every element of `src` is prefixed
    with the task prompt, run through `model.generate` on its own, decoded, and
    printed as it is produced.

    Args:
        src: iterable of source-code strings; one generation call is made per element.

    Returns:
        str: the decoded generations joined in input order with no separator
        (an empty string when `src` is empty).
    """
    prefix = "Generate Pseudocode: "
    # Run on whatever device the model currently lives on, so this works whether
    # or not the model was moved to a GPU.
    model_device = model.device
    pieces = []
    for line in src:
        input_ids = tokenizer(prefix + line, return_tensors="pt").input_ids.to(model_device)
        # top_p / top_k were removed: they only take effect when sampling is enabled
        # (do_sample=True), which this call never requested, so they did not change
        # the decoding.
        generated_ids = model.generate(input_ids, max_length=128)
        transform_pseudo = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
        print(transform_pseudo)
        pieces.append(transform_pseudo)
    # NOTE(review): pieces are joined with '' exactly as before, so consecutive lines
    # run together; a separator such as '\n' may be what was intended - confirm.
    return ''.join(pieces)

In [40]:
# torch.save(model.state_dict(), 'code2pc_0.673bleu.pt')

In [41]:
# Runs code2pseudo once per row, i.e. one model.generate call for every code line in every row.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt; consider trying a small slice first.
q_src_df['code2pseudo'] = q_src_df['q_prep_text'].apply(code2pseudo)

KeyboardInterrupt: 

In [49]:
# Take an independent copy so that adding columns later does not write into a slice
# of q_src_df (which can trigger pandas' SettingWithCopyWarning).
# NOTE: despite the "_100" suffix this keeps only the first row.
q_src_df_100 = q_src_df.head(1).copy()

In [50]:
# Confirm the one-row slice before running generation on it.
q_src_df_100

Unnamed: 0,q_id,q_prep_text
1,77593717,"[import hashlib, , user_hash_dict = {}, , with..."


In [51]:
# Build the new column with assign() so the result is a fresh frame: writing a column
# straight into the head() slice can trigger pandas' SettingWithCopyWarning.
q_src_df_100 = q_src_df_100.assign(
    code2pseudo=q_src_df_100['q_prep_text'].apply(code2pseudo)
)



KeyboardInterrupt: 