In [None]:
!pip install loralib==0.1.1 peft==0.3.0 --quiet
!pip install -q datasets evaluate transformers rouge-score nltk evaluate accelerate
!pip install -q wandb --upgrade
!pip install --upgrade --no-cache-dir -q gdown

In [None]:
import gdown

# Status notes left by the author: fusion done, fine-tuning done.
# (What "fusion" refers to is not visible in this notebook.)

# Base of the Google Drive direct-download URL; the file id is appended to it.
url = "https://drive.google.com/uc?export=download&id="
# Other file ids that were tried (disabled); only one id is active at a time.
# file_id = "1EW88u3Td6FNi9I07rBwg4sdAa8iFz9hq"
file_id = "1uIVXvimSqdyzFD0BglzAdQMajyc6FnBX"
# file_id = "1fmk9VzEcTN5L2Bmp8G_xj4otdtZRUN0U"
# Browser view link for the active file id:
# https://drive.google.com/file/d/1uIVXvimSqdyzFD0BglzAdQMajyc6FnBX/view?usp=drive_link
# Disabled gdown-based download of the same file:
# gdown.download(f"{url}/{file_id}&confirm=t", "model.zip", use_cookies= True)
# This cell only builds and prints the link; nothing is downloaded here.
link = f"{url}{file_id}&confirm=t"
print(link)

In [None]:
!wget --no-check-certificate "https://drive.google.com/uc?export=download&id=1uIVXvimSqdyzFD0BglzAdQMajyc6FnBX&confirm=t" -O "model.zip"

In [None]:
!unzip -q model.zip 

In [None]:
from datasets import load_dataset, Dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np
import re
from tqdm import tqdm

In [None]:
# Name handed to from_pretrained for the model weights.
# NOTE(review): presumably the directory extracted from model.zip — confirm.
model_name = "fine-tuned-FLAN-T5"
# The tokenizer is loaded from the "google/flan-t5-base" checkpoint, not from model_name.
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [None]:
# from datasets import load_dataset
# import pandas as pd
# from datasets import Dataset

# df = pd.read_csv("/kaggle/input/code2doc-eval/code2doc.csv", skip_blank_lines= True, na_values= ["None", "N/A", "NA", "\n"])
# df.head()
!wget https://zenodo.org/record/7857872/files/python.zip
!unzip -q python.zip
!rm -rf python.zip
!gzip -d -q "/kaggle/working/python/final/jsonl/test/python_test_0.jsonl.gz"
!mv python/final/jsonl/test/ .

In [None]:
import pandas as pd

# Read the JSON-lines test shard and discard the metadata columns listed below.
df = (
    pd.read_json("test/python_test_0.jsonl", lines=True)
    .drop(
        columns=[
            'repo',
            'path',
            'language',
            'code_tokens',
            'docstring_tokens',
            'sha',
            'url',
            'partition',
            'original_string',
        ]
    )
)

In [None]:
def remove_documentation(function_str):
    """Strip documentation from a function's source and flatten it to one line.

    Three substitutions are applied in order:

    1. A ``def ...(...):`` header followed by a line that starts with a
       ``\"\"\"...\"\"\"`` block is reduced to the header alone.
    2. Everything from a ``#`` to the end of its line is removed.
    3. Every newline is replaced by a single space.

    Limitations of the patterns as written: only double-quoted triple-quote
    blocks are matched (``'''`` docstrings are kept), and ``#`` is stripped
    even when it occurs inside a string literal.

    Args:
        function_str: Source text of a function.

    Returns:
        The processed source as a single-line string.
    """
    docstring_pattern = r'(def\s+\w+\s*\(.*\):)(?:.*?)^(\s*)\"\"\".*?\"\"\"'
    stripped = re.sub(
        docstring_pattern,
        r'\1',
        function_str,
        flags=re.DOTALL | re.MULTILINE,
    )

    # '.' does not cross newlines here, so each comment is cut up to its line end.
    stripped = re.sub(r'#.*', '', stripped)

    return stripped.replace('\n', ' ')

In [None]:
def _build_model_input(row):
    """Format one row's function name and documentation-stripped code as model input text."""
    return f"This is the function name:\n {row['func_name']}\n Code:\n {remove_documentation(row['code'])}"


# 'Code' is the model input; 'Comment' is the reference text (the original docstring).
df['Code'] = df.apply(_build_model_input, axis=1)
df['Comment'] = df['docstring']

In [None]:
# Preview the first rows of the frame, including the Code and Comment columns.
df.head()

In [None]:
# Drop rows in which every column is missing.
df = df.dropna(how='all')

# Only the Code and Comment columns are needed from here on.
# preserve_index=False stops the pandas index from being stored as an extra
# dataset column when dropna has removed rows.
dataset = Dataset.from_pandas(
    df.drop(['code', 'func_name', 'docstring'], axis= 1),
    preserve_index=False,
)
# A fixed seed makes the 10% held-out subset, and therefore the reported
# metrics, identical across runs.
dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = dataset['train']
test_dataset = dataset['test']

In [None]:
import torch

# Use the GPU when torch reports one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

In [None]:
def print_number_of_trainable_model_parameters(model):
    """Return a one-line summary of the number of trainable parameters in ``model``.

    Despite its name, the function does not print anything; it returns the
    text and leaves printing to the caller.

    Args:
        model: Object exposing ``named_parameters()``, which yields
            ``(name, parameter)`` pairs. Each parameter must provide
            ``numel()`` and ``requires_grad``.

    Returns:
        The string ``"model parameters: <count>\\n"``, where ``<count>`` is the
        total element count of all parameters with ``requires_grad`` set,
        formatted with thousands separators.
    """
    # Only trainable parameters are reported; the previous, never-used tally of
    # all parameters has been removed.
    trainable_model_params = sum(
        param.numel()
        for _, param in model.named_parameters()
        if param.requires_grad
    )
    return f'''model parameters: {trainable_model_params:,}\n'''

# Build the parameter-count summary for the loaded model and print it.
full_model = print_number_of_trainable_model_parameters(model)
print(full_model)

In [None]:
import re

def remove_extra_spaces(input_string):
    """Normalise whitespace and strip URLs from ``input_string``.

    Steps, in order:

    1. A whitespace run at the start or end of a line is replaced by one space.
    2. URLs starting with ``http://``, ``https://``, ``ftp://`` or ``www.`` are
       removed up to the next whitespace character.
    3. Two newlines separated only by whitespace are collapsed to ``"\\n\\n"``.

    Args:
        input_string: Text to clean.

    Returns:
        The cleaned text.
    """
    input_string = re.sub(r'^\s+|\s+$', ' ', input_string, flags=re.MULTILINE)
    # The previous pattern wrapped the alternatives in [...], turning them into a
    # character class that contained \S and therefore deleted every
    # non-whitespace character. A non-capturing group matches real URL prefixes.
    input_string = re.sub(r'(?:https?://|ftp://|www\.)\S+', '', input_string)
    input_string = re.sub(r'\n\s*\n', '\n\n', input_string)
    return input_string

def remove_non_alphanumeric(input_string):
    """Return ``input_string`` unchanged.

    This is currently a pass-through hook. An earlier, now disabled, version
    substituted a placeholder for ``None``, ran ``remove_extra_spaces`` and
    replaced characters outside an allow-list with spaces. None of that is
    applied any more, so callers receive exactly what they pass in.

    Args:
        input_string: Any value; it is not inspected.

    Returns:
        The same object that was passed in.
    """
    return input_string

def tokenize_function(data):
    """Tokenize a batch of examples with the module-level ``tokenizer``.

    Args:
        data: Batch mapping with a ``'Code'`` entry (model inputs) and a
            ``'Comment'`` entry (target texts), each an iterable of strings.

    Returns:
        The tokenizer's output for the batch: inputs built from ``'Code'`` and
        targets built from ``'Comment'`` (passed as ``text_target``), padded
        with ``padding='max_length'`` and truncated.
    """
    # Both columns pass through the same cleaning hook before tokenization.
    target_texts = [remove_non_alphanumeric(comment) for comment in data['Comment']]
    source_texts = [remove_non_alphanumeric(code) for code in data['Code']]
    return tokenizer(
        source_texts,
        text_target=target_texts,
        padding='max_length',
        truncation=True,
    )

In [None]:
# Only the test split is tokenized; tokenizing the train split is disabled.
# tokenized_train_datasets = train_dataset.map(tokenize_function, batched= True)
tokenized_test_datasets = test_dataset.map(tokenize_function, batched= True)

In [None]:
# Keep the number of test examples in `n` and report the tokenized split's shape.
n = len(tokenized_test_datasets)
print(
    "Shapes of the test dataset:",
    f"Test: {tokenized_test_datasets.shape}",
    sep="\n",
)

In [None]:
import numpy as np
import evaluate

def evaluation_metric(metric_name= 'rouge'):
    """Load a metric through ``evaluate.load``.

    Args:
        metric_name: Identifier passed to ``evaluate.load``; defaults to ``'rouge'``.

    Returns:
        The object returned by ``evaluate.load(metric_name)``.
    """
    return evaluate.load(metric_name)

In [None]:
# Move the model to the selected device; predict() sends its inputs to the same device.
model = model.to(device)

In [None]:
def predict(input_text, max_length= 1024, decode= True):
    """Generate model output for one text or a batch of texts.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        input_text: A string or a list of strings to feed to the model.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        decode: When True, the generated ids are decoded to text with special
            tokens skipped; when False, the raw output of ``model.generate``
            is returned.

    Returns:
        A list of decoded strings when ``decode`` is True, otherwise the
        value returned by ``model.generate``.
    """
#     input_text= "## Provide the documentation of the following code: ## \n\n" + input_text
    encoded = tokenizer(input_text, return_tensors="pt", padding= "max_length", truncation= True)
    input_ids = encoded['input_ids'].to(device)
    # Inputs are padded to the maximum length, so the attention mask is passed
    # explicitly instead of relying on generate() to infer which positions are padding.
    attention_mask = encoded['attention_mask'].to(device)

    with torch.no_grad():
        output = model.generate(input_ids, attention_mask= attention_mask, max_length= max_length)
    if decode:
        generated_text = tokenizer.batch_decode(output, skip_special_tokens=True)
        return generated_text
    return output

In [None]:
def print_scores(metric_name, scores):
    """Print a header line followed by one ``key: value`` line per score.

    Args:
        metric_name: Name inserted into the header line.
        scores: Mapping from score name to value; entries are printed in
            iteration order.
    """
    lines = [f"Average {metric_name} Scores:"]
    lines.extend(f"{key}: {scores[key]}" for key in scores)
    print("\n".join(lines))

In [None]:
def calculate_scores(metric, batch_size = 50):
    """Score the model's generated text against the reference comments.

    The module-level ``tokenized_test_datasets`` is processed in batches. For
    each batch the ``'Code'`` texts are passed to ``predict`` once, and the
    result is compared with the ``'Comment'`` texts via ``metric.compute``.
    Per-batch scalar scores are averaged over all examples, weighted by batch
    size so a short final batch does not count as much as a full one.

    Args:
        metric: Object with a ``compute(references=..., predictions=...)``
            method that returns a dict of scores.
        batch_size: Number of examples generated and scored per step.

    Returns:
        Dict with the keys the metric reports. Scalar entries hold the
        example-weighted average of the per-batch values; list-valued entries
        are not aggregated and stay at 0. For metrics computed over a whole
        corpus, the average of per-batch values is an approximation of the
        value over the full set.
    """
    # A throwaway computation, used only to discover which keys the metric reports.
    ex = metric.compute(references= ["x"], predictions= ["x"])
    scores = {m: 0 for m in ex.keys()}

    # Measured locally so the function does not depend on a global set in another cell.
    total = len(tokenized_test_datasets)
    if total == 0:
        return scores

    for i in tqdm(range(0, total, batch_size)):
        batch = tokenized_test_datasets[i:i+batch_size]
        code = batch['Code']
        doc = batch['Comment']
        # Generate exactly once, from the source code. Previously the generated
        # text was fed back into predict() and that second output was scored.
        pred = predict(code)
        score = metric.compute(references= doc, predictions= pred)

        batch_count = len(code)
        for m, value in score.items():
            if isinstance(value, list):
                continue
            scores[m] += value * batch_count

    # Turn the weighted sums into averages; the caller labels them "Average".
    for m in scores:
        scores[m] = scores[m] / total

    return scores

In [None]:
# ROUGE evaluation: load the metric, run calculate_scores over the tokenized
# test split, and print every entry of the returned dict.
metric_name = 'rouge'
metric1 = evaluation_metric(metric_name)
scores1 = calculate_scores(metric1)
print_scores(metric_name, scores1)

In [None]:
# BLEU evaluation: same steps as the ROUGE cell, with the 'bleu' metric.
# Note that calculate_scores runs generation over the test split again here.
metric_name = 'bleu'
metric2 = evaluation_metric(metric_name)
scores2 = calculate_scores(metric2)
print_scores(metric_name, scores2)