In [1]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
# ! pip install datasets
! pip install torch



In [2]:
! pip install fasttext



Loading the model

In [3]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

# special tokens for prompting
system_token = "<SYSTEM_TASK:>"
user_token = "<USER_TASK:>"
assistant_token = "<ASSISTANT_TASK:>"
end_token = "<END_TASK>"


model_checkpoint = 'Salesforce/codegen-350M-mono'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint,
                                          additional_special_tokens = ["<SYSTEM_TASK:>", "<USER_TASK:>", "<ASSISTANT_TASK:>", "<END_TASK>"],
                                          pad_token = "<PAD>",
                                          )

Downloading (…)okenizer_config.json:   0%|          | 0.00/240 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
tokenizer

CodeGenTokenizerFast(name_or_path='Salesforce/codegen-350M-mono', vocab_size=50257, model_max_length=2048, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<PAD>', 'additional_special_tokens': ['<SYSTEM_TASK:>', '<USER_TASK:>', '<ASSISTANT_TASK:>', '<END_TASK>']}, clean_up_tokenization_spaces=True)

In [5]:
special_token_dict = tokenizer.special_tokens_map
print(special_token_dict)

{'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<PAD>', 'additional_special_tokens': ['<SYSTEM_TASK:>', '<USER_TASK:>', '<ASSISTANT_TASK:>', '<END_TASK>']}


In [6]:
tokenizer.add_special_tokens(special_token_dict)

0

In [7]:
context_length = 3000

In [8]:
import numpy as np
from transformers import BitsAndBytesConfig, AutoConfig, AutoModelForCausalLM, AutoModelWithLMHead
import torch
from accelerate import init_empty_weights, infer_auto_device_map

In [9]:
model_checkpoint = 'Salesforce/codegen-350M-mono'

config = AutoConfig.from_pretrained(model_checkpoint,
                                    vocab_size=tokenizer.vocab_size,
                                    bos_token_id=tokenizer.bos_token_id,
                                    eos_token_id=tokenizer.eos_token_id,
)

with init_empty_weights():
    model = AutoModelForCausalLM.from_config(config)

print(model)

Downloading (…)lve/main/config.json:   0%|          | 0.00/999 [00:00<?, ?B/s]

CodeGenForCausalLM(
  (transformer): CodeGenModel(
    (wte): Embedding(50257, 1024)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-19): 20 x CodeGenBlock(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): CodeGenAttention(
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
          (qkv_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (out_proj): Linear(in_features=1024, out_features=1024, bias=False)
        )
        (mlp): CodeGenMLP(
          (fc_in): Linear(in_features=1024, out_features=4096, bias=True)
          (fc_out): Linear(in_features=4096, out_features=1024, bias=True)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=50257, bias=True)
)


In [10]:
device_map = infer_auto_device_map(model, no_split_module_classes = ['CodeGenBlock'])
print(device_map)

{'': 0}


In [11]:
import torch

nf4_config = BitsAndBytesConfig(
   load_in_2bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_use_double_quant=True,
   bnb_4bit_compute_dtype=torch.bfloat16
)


model_id = 'Salesforce/codegen-350M-mono'
model =AutoModelForCausalLM.from_pretrained(model_id,
                                            config = config,
                                            device_map="auto",
                                            quantization_config=nf4_config,
                                            torch_dtype=torch.bfloat16)

Downloading pytorch_model.bin:   0%|          | 0.00/797M [00:00<?, ?B/s]



In [12]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

print(print_number_of_trainable_model_parameters(model))

trainable model parameters: 356712448
all model parameters: 356712448
percentage of trainable model parameters: 100.00%


In [13]:
model.resize_token_embeddings(len(tokenizer))

You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 50300. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


Embedding(50300, 1024)

In [14]:
import numpy as np
from transformers import BitsAndBytesConfig, AutoConfig, AutoModelForCausalLM, AutoModelWithLMHead
from transformers import GenerationConfig
import torch
from accelerate import init_empty_weights, infer_auto_device_map
import re
from functools import reduce

**TRYING DIFFERENT RPOMPTS**

In [26]:
add_prompt = '<SYSTEM_TASK:>\nGiven the following code description, write Python code to implement the functionality described below\n<END_TASK>\n<USER_TASK:>\nDescription:\n'

text_prompt = 'def python_task1():"""Initialize VGG16 model a deep learning model trained on imagenet for performing Image Classification in the VGG16_model_architecture function"""def python_task2():"""Split the data X, Y in to train and test data using sklearn"""def python_task3():"""Make prediction using the deep learning model defined above in VGG16_model_architecture function"""'

final_prompt = add_prompt + text_prompt
print(final_prompt)

<SYSTEM_TASK:>
Given the following code description, write Python code to implement the functionality described below
<END_TASK>
<USER_TASK:>
Description:
def python_task1():"""Initialize VGG16 model a deep learning model trained on imagenet for performing Image Classification in the VGG16_model_architecture function"""def python_task2():"""Split the data X, Y in to train and test data using sklearn"""def python_task3():"""Make prediction using the deep learning model defined above in VGG16_model_architecture function"""


This prompt is working for now

In [47]:
add_prompt = '<SYSTEM_TASK:>\nGiven the following code description, write Python code to implement the functionality described below\n<END_TASK>\n<USER_TASK:>\nDescription:\n'
text_prompt = 'def VGG16_model_architecture():"""Initialize VGG16 model a deep learning model trained on imagenet for performing Image Classification in the VGG16_model_architecture function"""def train_test_split():"""Split the data X, Y in to train and test data using sklearn"""def model_predict():"""Make prediction using the deep learning model defined above in VGG16_model_architecture function"""'
final_prompt = add_prompt + text_prompt


generation_config = GenerationConfig(max_new_tokens=500, temperature=0.6, do_sample = True, top_p = 3)
inputs = tokenizer(final_prompt, return_tensors = 'pt')

output = tokenizer.decode(
    model.generate(
        inputs["input_ids"],
        generation_config = generation_config,
        max_new_tokens = 500,
        pad_token_id = tokenizer.pad_token_id,
    )[0],
    skip_special_tokens=True
)

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{final_prompt}')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')

---------------------------------------------------------------------------------------------------
INPUT PROMPT:
<SYSTEM_TASK:>
Given the following code description, write Python code to implement the functionality described below
<END_TASK>
<USER_TASK:>
Description:
def VGG16_model_architecture():"""Initialize VGG16 model a deep learning model trained on imagenet for performing Image Classification in the VGG16_model_architecture function"""def train_test_split():"""Split the data X, Y in to train and test data using sklearn"""def model_predict():"""Make prediction using the deep learning model defined above in VGG16_model_architecture function"""
---------------------------------------------------------------------------------------------------
MODEL GENERATION - ZERO SHOT:

Given the following code description, write Python code to implement the functionality described below


Description:
def VGG16_model_architecture():"""Initialize VGG16 model a deep learning model trained on ima

This is also working (try in on orignal dataset)

In [28]:
add_prompt = '<SYSTEM_TASK:>\nGiven the following code description, write Python code to implement the functionality described below\n<END_TASK>\n<USER_TASK:>\nDescription:\n'
text_prompt = 'def python_task_1():\n"""\nInitialize VGG16 model a deep learning model trained on imagenet for performing Image Classification in the VGG16_model_architecture function\n"""\ndef python_task_2():\n"""\nSplit the data X, Y in to train and test data using sklearn\n"""\ndef python_task_3():\n"""\nMake prediction using the deep learning model defined above in VGG16_model_architecture function\n"""\n'
final_prompt = add_prompt + text_prompt
print(final_prompt)

<SYSTEM_TASK:>
Given the following code description, write Python code to implement the functionality described below
<END_TASK>
<USER_TASK:>
Description:
def python_task_1():
"""
Initialize VGG16 model a deep learning model trained on imagenet for performing Image Classification in the VGG16_model_architecture function
"""
def python_task_2():
"""
Split the data X, Y in to train and test data using sklearn
"""
def python_task_3():
"""
Make prediction using the deep learning model defined above in VGG16_model_architecture function
"""



In [34]:
add_prompt = '<SYSTEM_TASK:>\nGiven the following code description, write Python code to implement the functionality described below\n<END_TASK>\n<USER_TASK:>\nDescription:\n'
text_prompt = 'def python_task_1():\n"""\nInitialize VGG16 model a deep learning model trained on imagenet for performing Image Classification in the VGG16_model_architecture function\n"""\ndef python_task_2():\n"""\nSplit the data X, Y in to train and test data using sklearn\n"""\ndef python_task_3():\n"""\nMake prediction using the deep learning model defined above in VGG16_model_architecture function\n"""\n'
final_prompt = add_prompt + text_prompt

generation_config = GenerationConfig(max_new_tokens=500, temperature=0.9, do_sample = True, top_p = 3)
inputs = tokenizer(final_prompt, return_tensors = 'pt')

output = tokenizer.decode(
    model.generate(
        inputs["input_ids"],
        generation_config = generation_config,
        max_new_tokens = 500,
        pad_token_id = tokenizer.pad_token_id,
    )[0],
    skip_special_tokens=True
)

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{final_prompt}')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')

---------------------------------------------------------------------------------------------------
INPUT PROMPT:
<SYSTEM_TASK:>
Given the following code description, write Python code to implement the functionality described below
<END_TASK>
<USER_TASK:>
Description:
def python_task_1():
"""
Initialize VGG16 model a deep learning model trained on imagenet for performing Image Classification in the VGG16_model_architecture function
"""
def python_task_2():
"""
Split the data X, Y in to train and test data using sklearn
"""
def python_task_3():
"""
Make prediction using the deep learning model defined above in VGG16_model_architecture function
"""

---------------------------------------------------------------------------------------------------
MODEL GENERATION - ZERO SHOT:

Given the following code description, write Python code to implement the functionality described below


Description:
def python_task_1():
"""
Initialize VGG16 model a deep learning model trained on imagenet for p

In [46]:
add_prompt = '<SYSTEM_TASK:>\nGiven the following code description, write Python code to implement the functions described below line by line\n<END_TASK>\n<USER_TASK:>\nDescription:\n'
text_prompt = 'def Initialize_VGG16_model_a_deep_learning_model_trained_on_imagenet_for_performing_Image_Classification_in_the_VGG16_model_architecture_function():\ndef Split_the_data_X,_Y_in_to_train_and_test_data_using_sklearn():\ndef Make_prediction_using_the_deep_learning_model_defined_above_in_VGG16_model_architecture_function():'
final_prompt = add_prompt + text_prompt

generation_config = GenerationConfig(max_new_tokens=500, temperature=1.2, do_sample = True, top_p = 3)
inputs = tokenizer(final_prompt, return_tensors = 'pt')

output = tokenizer.decode(
    model.generate(
        inputs["input_ids"],
        generation_config = generation_config,
        max_new_tokens = 500,
        pad_token_id = tokenizer.pad_token_id,
    )[0],
    skip_special_tokens=True
)

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{final_prompt}')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')

---------------------------------------------------------------------------------------------------
INPUT PROMPT:
<SYSTEM_TASK:>
Given the following code description, write Python code to implement the functions described below line by line
<END_TASK>
<USER_TASK:>
Description:
def Initialize_VGG16_model_a_deep_learning_model_trained_on_imagenet_for_performing_Image_Classification_in_the_VGG16_model_architecture_function():
def Split_the_data_X,_Y_in_to_train_and_test_data_using_sklearn():
def Make_prediction_using_the_deep_learning_model_defined_above_in_VGG16_model_architecture_function():
---------------------------------------------------------------------------------------------------
MODEL GENERATION - ZERO SHOT:

Given the following code description, write Python code to implement the functions described below line by line


Description:
def Initialize_VGG16_model_a_deep_learning_model_trained_on_imagenet_for_performing_Image_Classification_in_the_VGG16_model_architecture_functio

In [48]:
text_prompt = 'def VGG16_model_architecture():"""Initialize VGG16 model a deep learning model trained on imagenet for performing Image Classification in the VGG16_model_architecture function"""def train_test_split():"""Split the data X, Y in to train and test data using sklearn"""def model_predict():"""Make prediction using the deep learning model defined above in VGG16_model_architecture function"""'
print(text_prompt)

def VGG16_model_architecture():"""Initialize VGG16 model a deep learning model trained on imagenet for performing Image Classification in the VGG16_model_architecture function"""def train_test_split():"""Split the data X, Y in to train and test data using sklearn"""def model_predict():"""Make prediction using the deep learning model defined above in VGG16_model_architecture function"""


In [23]:
add_prompt = '<SYSTEM_TASK:>\nSolve the following problem using Python, implementing the functions described below, one line at a time\n<END_TASK>\n<USER_TASK:>\nDescription:\n'
text_prompt = 'def python_task_1():\n""" This function initialize VGG16 model a deep learning model trained on imagenet for performing Image Classification in the VGG16_model_architecture function """\ndef python_task_2():\n""" This function Split the data X, Y in to train and test data using sklearn """\ndef python_task_3():\n""" This function Make prediction using the deep learning model defined above in VGG16_model_architecture function """\n'
final_prompt = add_prompt + text_prompt
print(final_prompt)

<SYSTEM_TASK:>
Solve the following problem using Python, implementing the functions described below, one line at a time
<END_TASK>
<USER_TASK:>
Description:
def python_task_1():
""" This function initialize VGG16 model a deep learning model trained on imagenet for performing Image Classification in the VGG16_model_architecture function """
def python_task_2():
""" This function Split the data X, Y in to train and test data using sklearn """
def python_task_3():
""" This function Make prediction using the deep learning model defined above in VGG16_model_architecture function """



In [24]:
generation_config = GenerationConfig(max_new_tokens=500, temperature= 0.9, do_sample = True, top_p = 3)
inputs = tokenizer(final_prompt, return_tensors = 'pt')

output = tokenizer.decode(
    model.generate(
        inputs["input_ids"],
        generation_config = generation_config,
        max_new_tokens = 500,
        pad_token_id = tokenizer.pad_token_id,
    )[0],
    skip_special_tokens=True
)

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{final_prompt}')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')



---------------------------------------------------------------------------------------------------
INPUT PROMPT:
<SYSTEM_TASK:>
Solve the following problem using Python, implementing the functions described below, one line at a time
<END_TASK>
<USER_TASK:>
Description:
def python_task_1():
""" This function initialize VGG16 model a deep learning model trained on imagenet for performing Image Classification in the VGG16_model_architecture function """
def python_task_2():
""" This function Split the data X, Y in to train and test data using sklearn """
def python_task_3():
""" This function Make prediction using the deep learning model defined above in VGG16_model_architecture function """

---------------------------------------------------------------------------------------------------
MODEL GENERATION - ZERO SHOT:

Solve the following problem using Python, implementing the functions described below, one line at a time


Description:
def python_task_1():
""" This function initialize

PROMPTING THE FULL DATASET IN THE SAME FORMAT AS THE ABOVE PROMPT

In [15]:
from datasets import load_dataset
dataset_1 = load_dataset("codeparrot/xlcost-text-to-code", "Python-program-level")

Downloading builder script:   0%|          | 0.00/7.61k [00:00<?, ?B/s]

Downloading and preparing dataset xlcost/Python-program-level to /root/.cache/huggingface/datasets/codeparrot___xlcost/Python-program-level/2.1.0/ffae7d034dfaa9e215012bcf52b8690f3ae22d9c52f45fe2ffd3dcf4093d9f2c...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/11.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.02M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/570k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset xlcost downloaded and prepared to /root/.cache/huggingface/datasets/codeparrot___xlcost/Python-program-level/2.1.0/ffae7d034dfaa9e215012bcf52b8690f3ae22d9c52f45fe2ffd3dcf4093d9f2c. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [16]:
dataset_1

DatasetDict({
    train: Dataset({
        features: ['text', 'code'],
        num_rows: 9263
    })
    test: Dataset({
        features: ['text', 'code'],
        num_rows: 887
    })
    validation: Dataset({
        features: ['text', 'code'],
        num_rows: 472
    })
})

In [17]:
# special tokens for prompting
system_token = "<SYSTEM_TASK:>"
user_token = "<USER_TASK:>"

def process_1(data):
    text_prompts = list(map(lambda x: x.replace('|', ','), data['text']))
    text_prompts = list(map(lambda x: x.replace('\n', ','), data['text']))
    prompt = ''
    final_code = ''
    final_text_prompt = []
    final_code_prompt = []
    code = data['code']
    add_prompt = '<SYSTEM_TASK:>\nSolve the following problem using Python, implementing the functions described below, one line at a time\n<END_TASK>\n<USER_TASK:>\nDescription:\n'
    for i in range(len(text_prompts)):
        prompt =  text_prompts[i]
        new_prompt = add_prompt
        arr = prompt.split(';')
        for i in range(len(arr)):
            arr[i] = arr[i].replace('|', '')
            arr[i] = arr[i].lower()
            new_prompt = new_prompt + f'def python_task_{i}():\n' + f'""" This function {arr[i]}"""\n'
        code[i] = code[i].replace('NEW_LINE INDENT' , '\n\t')
        code[i] = code[i].replace('NEW_LINE' , '\n')
        code[i] = code[i].replace('DEDENT' , '\b')
        code[i] = code[i].replace(' ( ' , '(')
        code[i] = code[i].replace(' [ ', '[')
        code[i] = code[i].replace('_ ', '_')
        code[i] = code[i].replace('" ', '"')
        final_code = code[i]
        final_text_prompt.append(new_prompt)
        final_code_prompt.append(final_code)
    return {
      'text_prompt' : final_text_prompt,
      'code_prompt' : final_code_prompt
    }

In [18]:
final_text_prompt = []
final_code_prompt = []
prompted_data_1 = dataset_1.map(process_1, batched = True, remove_columns = ['text', 'code'])
prompted_data_1

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['text_prompt', 'code_prompt'],
        num_rows: 9263
    })
    test: Dataset({
        features: ['text_prompt', 'code_prompt'],
        num_rows: 887
    })
    validation: Dataset({
        features: ['text_prompt', 'code_prompt'],
        num_rows: 472
    })
})

In [19]:
print(prompted_data_1['train'][24]['text_prompt'])

<SYSTEM_TASK:>
Solve the following problem using Python, implementing the functions described below, one line at a time
<END_TASK>
<USER_TASK:>
Description:
def python_task_0():
""" This function count of repeating digits in a given number  function that returns the count of repeating digits of the given number """
def python_task_1():
""" This function  initialize a variable to store count of repeating digits """
def python_task_2():
""" This function  initialize cnt array to store digit count """
def python_task_3():
""" This function  iterate through the digits of n """
def python_task_4():
""" This function  retrieve the last digit of n """
def python_task_5():
""" This function  increase the count of digit """
def python_task_6():
""" This function  remove the last digit of n """
def python_task_7():
""" This function  iterate through the cnt array """
def python_task_8():
""" This function  if frequency of digit is greater than 1 """
def python_task_9():
""" This function  increm

In [20]:
print(prompted_data_1['train'][24]['code_prompt'])

mod = 1000000007 
 def ValOfTheExpression(n ) : 
	 global mod 
 factorial =[0 for i in range(n + 1 ) ] 
 factorial[0 ] = 1 
 factorial[1 ] = 1 
 for i in range(2 , n + 1 , 1 ) : 
	 factorial[i ] =(( factorial[i - 1 ] % mod ) *(i % mod ) ) % mod 
 dp =[0 for i in range(n + 1 ) ] 
 dp[1 ] = 1 
 for i in range(2 , n + 1 , 1 ) : 
	 dp[i ] =(( dp[i - 1 ] % mod ) *(factorial[i ] % mod ) ) % mod 
 return dp[n ] 
 if __name__== ' __main __' : 
	 n = 4 
 print(ValOfTheExpression(n ) ) 



In [22]:
final_prompt = prompted_data_1['train'][111]['text_prompt']
code = prompted_data_1['train'][111]['code_prompt']

generation_config = GenerationConfig(max_new_tokens=100, temperature= 0.8, do_sample = True, top_p = 3)
inputs = tokenizer(final_prompt, return_tensors = 'pt')

output = tokenizer.decode(
    model.generate(
        inputs["input_ids"],
        generation_config = generation_config,
        max_new_tokens = 100,
        pad_token_id = tokenizer.pad_token_id,
    )[0],
    skip_special_tokens=True
)

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{final_prompt}')
print(dash_line)
print(f'HUMAN BASELINE CODE:\n{code}')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')



---------------------------------------------------------------------------------------------------
INPUT PROMPT:
<SYSTEM_TASK:>
Solve the following problem using Python, implementing the functions described below, one line at a time
<END_TASK>
<USER_TASK:>
Description:
def python_task_0():
""" This function program to determine the quadrant of a complex number  function to determine the quadrant of a complex number """
def python_task_1():
""" This function  storing the index of '+ """
def python_task_2():
""" This function  storing the index of '- """
def python_task_3():
""" This function  finding the real part of the complex number """
def python_task_4():
""" This function  finding the imaginary part of the complex number """
def python_task_5():
""" This function  driver code"""

---------------------------------------------------------------------------------------------------
HUMAN BASELINE CODE:
def sameProductQuadruples(nums , N ) : 
	 umap = { } ; 
 res = 0 ; 
 for i in rang

In [15]:
! pip install git+https://github.com/huggingface/datasets#egg=datasets

Collecting datasets
  Cloning https://github.com/huggingface/datasets to /tmp/pip-install-k6bkleq3/datasets_9f2cc6ba46184cdc8916ac1967f2d857
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/datasets /tmp/pip-install-k6bkleq3/datasets_9f2cc6ba46184cdc8916ac1967f2d857
  Resolved https://github.com/huggingface/datasets to commit a6fb8b9a833afb25311da395c6e0d9bf770ca2c7
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting fsspec[http]<2023.9.0,>=2023.1.0 (from datasets)
  Downloading fsspec-2023.6.0-py3-none-any.whl (163 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.8/163.8 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Building wheels for collected packages: datasets
  Building wheel for datasets (pyproject.toml) ... [?25ldone
[?25h  Created wheel for datasets: filename=datasets-2.14.6.de

In [16]:
from datasets import load_dataset
dataset_2 = load_dataset("code_x_glue_ct_code_to_text", "python", download_mode="force_redownload")

Downloading builder script:   0%|          | 0.00/5.92k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/17.9k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/25.7k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/2.35k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/941M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.4M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/941M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.4M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/941M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.4M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/251820 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13914 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/14918 [00:00<?, ? examples/s]

In [17]:
dataset_2

DatasetDict({
    train: Dataset({
        features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url'],
        num_rows: 251820
    })
    validation: Dataset({
        features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url'],
        num_rows: 13914
    })
    test: Dataset({
        features: ['id', 'repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url'],
        num_rows: 14918
    })
})

In [18]:
print(dataset_2['train'][91]['code'])

def tag(self, tag):
        """Get a release by tag
        """
        url = '%s/tags/%s' % (self, tag)
        response = self.http.get(url, auth=self.auth)
        response.raise_for_status()
        return response.json()


In [19]:
print(dataset_2['train'][91]['docstring'])

Get a release by tag


In [20]:
def dataset_formation(data):
    add_prompt = '<SYSTEM_TASK:>\nSolve the following problem using Python, implementing the functions described below, one line at a time\n<END_TASK>\n<USER_TASK:>\nDescription:\n'
    doc = data['docstring'].split('\n')
    cleaned_doc = []
    if len(doc) != 0:
        for i in range(len(doc)):
            if ':' in doc[i]:
                pass
            else:
                cleaned_doc.append(doc[i])
    final_text = ''
    for i in range(len(cleaned_doc)):
        final_text = final_text + ' ' + cleaned_doc[i]
    s = data['code']
    code = ''
    final_prompt = ''
    occurrences = re.finditer('"""', s)
    # using reduce() to get start indices of all occurrences
    res = reduce(lambda x, y: x + [y.start()], occurrences, [])
    extracted_prompt = ''
    if len(res) > 1:     
        pos = res[1]
        prompt = data['code'][:pos+3]
        lines = prompt.split('\n')
        cleaned_extracted_prompt = []
        for i in range(len(lines)):
            if '>>>' in lines[i] or '...' in lines[i] or '----------' in lines[i]:
                pass
            else:
                cleaned_extracted_prompt.append(lines[i])
        extracted_prompt = ''
        for j in range(len(cleaned_extracted_prompt)):
            extracted_prompt = extracted_prompt + ' ' + cleaned_extracted_prompt[j]
        extracted_prompt = re.sub(' +', ' ', extracted_prompt)
        extracted_prompt = extracted_prompt.replace('):' , '):\n')
        code = s.replace(prompt, '')
    else:
        code = data['code']
    
    final_prompt = add_prompt + extracted_prompt
    return{
        'text_prompt' : final_prompt,
        'code_prompt' : code
    }

In [21]:
prompted_dataset_2 = dataset_2.map(dataset_formation, batched = False, remove_columns = dataset_2['train'].column_names)

Map:   0%|          | 0/251820 [00:00<?, ? examples/s]

Map:   0%|          | 0/13914 [00:00<?, ? examples/s]

Map:   0%|          | 0/14918 [00:00<?, ? examples/s]

In [22]:
prompted_dataset_2

DatasetDict({
    train: Dataset({
        features: ['text_prompt', 'code_prompt'],
        num_rows: 251820
    })
    validation: Dataset({
        features: ['text_prompt', 'code_prompt'],
        num_rows: 13914
    })
    test: Dataset({
        features: ['text_prompt', 'code_prompt'],
        num_rows: 14918
    })
})

In [23]:
print(prompted_dataset_2['train'][91]['text_prompt'])

<SYSTEM_TASK:>
Solve the following problem using Python, implementing the functions described below, one line at a time
<END_TASK>
<USER_TASK:>
Description:
 def tag(self, tag):
 """Get a release by tag """


In [24]:
print(prompted_dataset_2['train'][91]['code_prompt'])


        url = '%s/tags/%s' % (self, tag)
        response = self.http.get(url, auth=self.auth)
        response.raise_for_status()
        return response.json()


In [25]:
final_prompt = prompted_dataset_2['train'][1435]['text_prompt']
code = prompted_dataset_2['train'][1435]['code_prompt']

generation_config = GenerationConfig(max_new_tokens=500, temperature= 1.2, do_sample = True, top_p = 3)
inputs = tokenizer(final_prompt, return_tensors = 'pt')

output = tokenizer.decode(
    model.generate(
        inputs["input_ids"],
        generation_config = generation_config,
        max_new_tokens = 500,
        pad_token_id = tokenizer.pad_token_id,
    )[0],
    skip_special_tokens=True
)

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{final_prompt}')
print(dash_line)
print(f'HUMAN BASELINE CODE:\n{code}')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')



---------------------------------------------------------------------------------------------------
INPUT PROMPT:
<SYSTEM_TASK:>
Solve the following problem using Python, implementing the functions described below, one line at a time
<END_TASK>
<USER_TASK:>
Description:
 def _get_data(self, time, site_id):
 r"""Download and parse upper air observations from an online archive. Parameters time : datetime The date and time of the desired observation. site_id : str The three letter ICAO identifier of the station for which data should be downloaded. Returns ------- :class:`pandas.DataFrame` containing the data """
---------------------------------------------------------------------------------------------------
HUMAN BASELINE CODE:

        raw_data = self._get_data_raw(time, site_id)
        soup = BeautifulSoup(raw_data, 'html.parser')
        tabular_data = StringIO(soup.find_all('pre')[0].contents[0])
        col_names = ['pressure', 'height', 'temperature', 'dewpoint', 'direction', 's

**The above dataset is working GOOD**

In [25]:
from datasets import load_dataset
dataset_3 = load_dataset("openai_humaneval")

Downloading builder script:   0%|          | 0.00/3.28k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.33k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.40k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/44.9k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/164 [00:00<?, ? examples/s]

In [26]:
dataset_3

DatasetDict({
    test: Dataset({
        features: ['task_id', 'prompt', 'canonical_solution', 'test', 'entry_point'],
        num_rows: 164
    })
})

In [27]:
print(dataset_3['test'][41]['prompt'])



def car_race_collision(n: int):
    """
    Imagine a road that's a perfectly straight infinitely long line.
    n cars are driving left to right;  simultaneously, a different set of n cars
    are driving right to left.   The two sets of cars start out being very far from
    each other.  All cars move in the same speed.  Two cars are said to collide
    when a car that's moving left to right hits a car that's moving right to left.
    However, the cars are infinitely sturdy and strong; as a result, they continue moving
    in their trajectory as if they did not collide.

    This function outputs the number of such collisions.
    """



In [28]:
print(dataset_3['test'][10]['canonical_solution'])

    if not string:
        return ''

    beginning_of_suffix = 0

    while not is_palindrome(string[beginning_of_suffix:]):
        beginning_of_suffix += 1

    return string + string[:beginning_of_suffix][::-1]



In [29]:
def dataset_formation(data):
    add_prompt = '<SYSTEM_TASK:>\nSolve the following problem using Python, implementing the functions described below, one line at a time\n<END_TASK>\n<USER_TASK:>\nDescription:\n'
    prompt = data['prompt']
    prompt = prompt.replace('"""\n', '"""')
    prompt = prompt.replace('\n"""', '"""')
    prompt_copy = prompt
    lines = prompt.split('\n')
    occurrences = re.finditer('"""', prompt)
    res = reduce(lambda x, y: x + [y.start()], occurrences, [])
    final_cleaned = ''
    if len(res) != 0:
        for i in range(int(len(res)/2)):
            cleaned_prompt = ''
            clnd = []
            line = prompt[res[2*i]+ 3 : res[2*i - 1]].split('\n')
            for j in range(len(line)):
                if ':' in line[j] or '*' in line[j] or '>>>' in line[j] or '=>' in line[j] or '->' in line[j]:
                    pass
                else:
                    clnd.append(line[j])
            for k in range(len(clnd)):
                cleaned_prompt = cleaned_prompt + clnd[k] + '\n'
            cleaned_prompt = cleaned_prompt.replace('\n', '')
            cleaned_prompt = re.sub(' +', ' ', cleaned_prompt)
            final_cleaned = prompt_copy.replace(prompt[res[i] + 3 : res[2*i - 1]], cleaned_prompt)
    if final_cleaned == '':
        final_prompt = None
    else:
        final_prompt = add_prompt + final_cleaned
    code = data['canonical_solution']
    return{
        'text_prompt' : final_prompt,
        'code_prompt' : code
    }

In [30]:
prompted_dataset_3 = dataset_3.map(dataset_formation, batched = False, remove_columns = dataset_3['test'].column_names)

Map:   0%|          | 0/164 [00:00<?, ? examples/s]

In [31]:
prompted_dataset_3

DatasetDict({
    test: Dataset({
        features: ['text_prompt', 'code_prompt'],
        num_rows: 164
    })
})

In [32]:
print(prompted_dataset_3['test'][114]['text_prompt'])

<SYSTEM_TASK:>
Solve the following problem using Python, implementing the functions described below, one line at a time
<END_TASK>
<USER_TASK:>
Description:

def minSubArraySum(nums):
    """ Given an array of integers nums, find the minimum sum of any non-empty sub-array of nums. Example minSubArraySum([2, 3, 4, 1, 2, 4]) == 1 minSubArraySum([-1, -2, -3]) == -6 """


In [33]:
print(prompted_dataset_3['test'][114]['code_prompt'])

    max_sum = 0
    s = 0
    for num in nums:
        s += -num
        if (s < 0):
            s = 0
        max_sum = max(s, max_sum)
    if max_sum == 0:
        max_sum = max(-i for i in nums)
    min_sum = -max_sum
    return min_sum



In [114]:
final_prompt = prompted_dataset_3['test'][114]['text_prompt']
code = prompted_dataset_3['test'][114]['code_prompt']
inputs = tokenizer(final_prompt, return_tensors = 'pt').to('cuda')
generation_config = GenerationConfig(max_new_tokens=250, temperature= 0.9, do_sample = True, top_p = 3)

output = tokenizer.decode(
    model.generate(
        inputs["input_ids"],
        max_new_tokens = 250,
        generation_config=generation_config,
        pad_token_id = tokenizer.pad_token_id,
    )[0],
    skip_special_tokens=True
)

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{final_prompt}')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')
print(dash_line)
print(f'BASELINE HUMAN WRITTEN CODE:\n{code}')

---------------------------------------------------------------------------------------------------
INPUT PROMPT:
<SYSTEM_TASK:>
Solve the following problem using Python, implementing the functions described below, one line at a time
<END_TASK>
<USER_TASK:>
Description:

def minSubArraySum(nums):
    """ Given an array of integers nums, find the minimum sum of any non-empty sub-array of nums. Example minSubArraySum([2, 3, 4, 1, 2, 4]) == 1 minSubArraySum([-1, -2, -3]) == -6 """
---------------------------------------------------------------------------------------------------
MODEL GENERATION - ZERO SHOT:

Solve the following problem using Python, implementing the functions described below, one line at a time


Description:

def minSubArraySum(nums):
    """ Given an array of integers nums, find the minimum sum of any non-empty sub-array of nums. Example minSubArraySum([2, 3, 4, 1, 2, 4]) == 1 minSubArraySum([-1, -2, -3]) == -6 """
    # Your code here
    if not nums:
        return 0

**Working Good**

In [34]:
from datasets import load_dataset
dataset_4 = load_dataset("codeparrot/github-jupyter-code-to-text")

Downloading readme:   0%|          | 0.00/857 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/227M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/56.9M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [35]:
dataset_4

DatasetDict({
    train: Dataset({
        features: ['repo_name', 'path', 'license', 'content'],
        num_rows: 47452
    })
    test: Dataset({
        features: ['repo_name', 'path', 'license', 'content'],
        num_rows: 11864
    })
})

In [36]:
print(dataset_4['train'][23]['path'])    # useful
print(dataset_4['train'][23]['repo_name'])
print(dataset_4['train'][23]['license'])

lucid_work/notebooks/feature_visualization.ipynb
davidparks21/qso_lya_detection_pipeline
mit


In [37]:
print(dataset_4['train'][263]['content'])

%run ../bst/bst.py
%load ../bst/bst.py

def height(node):
    # TODO: Implement me
    pass

"""
Explanation: <small><i>This notebook was prepared by Donne Martin. Source and license info is on GitHub.</i></small>
Challenge Notebook
Problem: Determine the height of a tree.

Constraints
Test Cases
Algorithm
Code
Unit Test
Solution Notebook

Constraints

Is this a binary tree?
Yes


Can we assume we already have a Node class with an insert method?
Yes



Test Cases

5 -> 1
5, 2, 8, 1, 3 -> 3

Algorithm
Refer to the Solution Notebook.  If you are stuck and need a hint, the solution notebook's algorithm discussion might be a good place to start.
Code
End of explanation
"""


# %load test_height.py
from nose.tools import assert_equal


class TestHeight(object):

    def test_height(self):
        root = Node(5)
        assert_equal(height(root), 1)
        insert(root, 2)
        insert(root, 8)
        insert(root, 1)
        insert(root, 3)
        assert_equal(height(root), 3)

        p

In [38]:
import fasttext
from huggingface_hub import hf_hub_download

model_path = hf_hub_download(repo_id="facebook/fasttext-language-identification", filename="model.bin")
detect = fasttext.load_model(model_path)

Downloading model.bin:   0%|          | 0.00/1.18G [00:00<?, ?B/s]



In [39]:
def detect_lang(data):
    lines = data.split('\n')
    text = ''
    for i in range(len(lines)):
        if i < 4:
            text = text + lines[i]
        else:
            break
    lang = detect.predict(text)[0][0]
    if lang == '__label__eng_Latn':
        return 'en'
    else:
        return ''

In [40]:
def dataset_formation(data):
    add_prompt = '<SYSTEM_TASK:>\nSolve the following problem using Python, implementing the functions described below, one line at a time\n<END_TASK>\n<USER_TASK:>\nDescription:\n'
    s = data['content']
    dummy = data['content']
    occurrences = re.finditer('"""', s)
    # using reduce() to get start indices of all occurrences
    res = reduce(lambda x, y: x + [y.start()], occurrences, [])
    extracted_prompt_arr = []
    extracted_code = ''
    final_prompt = ''
    code = ''
    if len(res)%2 == 0:
        for i in range(int(len(res)/2)):
            extracted_prompt_arr.append(s[res[2*i] + 3 : res[2*i + 1]] + '\n')
            dummy = dummy.replace(s[res[2*i] : res[2*i + 1] + 3], '')
        final = ''
        extracted_code = dummy
        # LANGUAGE DETECTION  
        lang = detect_lang(extracted_prompt_arr[0])
        if lang == 'en':
            final_1, final_2 = '', ''
            for i in range(len(extracted_prompt_arr)):
                    exp = extracted_prompt_arr[i]
                    occurrences_1 = re.finditer('Explanation:', exp)
                    # using reduce() to get start indices of all occurrences
                    start = reduce(lambda x, y: x + [y.start()], occurrences_1, [])
                    occurrences_2 = re.finditer('End of explanation', exp)
                    end = reduce(lambda x, y: x + [y.start()], occurrences_2, [])
                    if len(start) != 0 and len(end) != 0:
                        extracted_exp = exp[start[0] + 12 : end[0]]
                        final_1 = final_1 + extracted_exp + '\n'
            final_2 = final_1
            occurrences_3 = re.finditer('<', final_1)
            start = reduce(lambda x, y: x + [y.start()], occurrences_3, [])   
            occurrences_4 = re.finditer('>', final_1)
            end = reduce(lambda x, y: x + [y.start()], occurrences_4, [])
            if len(start) == len(end) and len(start) != 0:
                for i in range(len(start)):
                    final_2 = final_2.replace(final_1[start[i] : end[i]], '')
            else:
                final_2 = final_1
            final_3 = final_2
            occurrences_5 = re.finditer('<', final_2)
            start = reduce(lambda x, y: x + [y.start()], occurrences_5, [])   
            occurrences_6 = re.finditer('>', final_2)
            end = reduce(lambda x, y: x + [y.start()], occurrences_6, [])
            if len(start) == len(end) and len(start) != 0:
                for i in range(len(start)):
                    final_3 = final_3.replace(final_2[start[i] : end[i]], '')
            else:
                final_3 = final_2
            final = final_3 
            if final != '':
                final_line = ''
                arr = final.split('\n')
                for i in range(len(arr)):
                    if i < 9:
                         if ':' not in arr[i] or '*' not in arr[i] or '>>>' not in arr[i] or '=>' not in arr[i] or '->' not in arr[i] or '>' not in arr[i] or '**' not in arr[i]:
                            final_line = final_line + arr[i] + ' '
                # LANGUAGE DETECTION  2
                lang = detect_lang(final_line)
                if lang == 'en':
                    path = data['path'].split('/')
                    func = path[-1].split('.')
                    final_prompt = add_prompt + f'def {func[0]}():' + '\n\t' + f'"""{final_line}"""'   
                else:
                    final_prompt = None
                lines = extracted_code.split('\n')
                code = ''
                for i in range(len(lines)):
                    if len(lines[i]) != 0:
                        if lines[i][0] != '#' or '--' not in lines[i] or '!' not in lines[i] or '#' not in lines[i]:
                            code = code + lines[i] + '\n'
                        elif 'EOF' in lines[i]:
                            break
                    
            
    else:
        print('Error ^_^')
    
    return{
        'text_prompt' : final_prompt,
        'code_prompt' : code
    }   

In [41]:
prompted_dataset_4 = dataset_4.map(dataset_formation, batched = False, remove_columns = dataset_4['train'].column_names)

prompted_dataset_4



Map:   0%|          | 0/47452 [00:00<?, ? examples/s]

Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^


Map:   0%|          | 0/11864 [00:00<?, ? examples/s]

Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^
Error ^_^


DatasetDict({
    train: Dataset({
        features: ['text_prompt', 'code_prompt'],
        num_rows: 47452
    })
    test: Dataset({
        features: ['text_prompt', 'code_prompt'],
        num_rows: 11864
    })
})

In [42]:
print(prompted_dataset_4['train'][114]['text_prompt'])

<SYSTEM_TASK:>
Solve the following problem using Python, implementing the functions described below, one line at a time
<END_TASK>
<USER_TASK:>
Description:
def popsizes():
	""" Example: modeling changes in population size Simple example Let's look at an example:   A simple bottleneck In order to change population size, one simply has to change the values in the "nlist".   For example, here is a population bottleneck:   Please note the last command, which changes the concatenated array from an array of 64 bit signed integers to 32 bit unsigned integers. Exponential growth """


In [43]:
print(prompted_dataset_4['train'][114]['code_prompt'])

%matplotlib inline
%pylab inline
from __future__ import print_function
import numpy as np
import array
import matplotlib.pyplot as plt
#population size
N=1000
#nlist corresponds to a constant population size for 10N generations
#note the "dtype" argument.  Without it, we'd be defaulting to int64,
#which is a 64-bit signed integer.
nlist=np.array([N]*(10*N),dtype=np.uint32)
#This is a 'view' of the array starting from the beginning:
nlist[0:]
#Evolve for 10N generations,
#bottleneck to 0.25N for 100 generations,
#recover to N for 50 generations
nlist = np.concatenate(([N]*(10*N),[int(0.25*N)]*100,[N]*50)).astype(np.int32)
plt.plot(nlist[0:])
plt.ylim(0,1.5*N)
import math
N2=5*N
tgrowth=500
#G is the growth rate
G = math.exp( (math.log(N2)-math.log(N))/float(tgrowth) )
nlist = np.array([N]*(10*N+tgrowth),dtype=np.uint32)
#Now, modify the list according to expoential growth rate
for i in range(tgrowth):
     nlist[10*N+i] = round( N*math.pow(G,i+1) )
##Now, we see that the population does

In [44]:
final_prompt = prompted_dataset_4['train'][114]['text_prompt']
code = prompted_dataset_4['train'][114]['code_prompt']
inputs = tokenizer(final_prompt, return_tensors = 'pt').to('cuda')
generation_config = GenerationConfig(max_new_tokens=500, temperature= 1.14, do_sample = True, top_p = 3)

output = tokenizer.decode(
    model.generate(
        inputs["input_ids"],
        max_new_tokens = 500,
        generation_config=generation_config,
        pad_token_id = tokenizer.pad_token_id,
    )[0],
    skip_special_tokens=True
)

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{final_prompt}')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')
print(dash_line)
print(f'BASELINE HUMAN WRITTEN CODE:\n{code}')

---------------------------------------------------------------------------------------------------
INPUT PROMPT:
<SYSTEM_TASK:>
Solve the following problem using Python, implementing the functions described below, one line at a time
<END_TASK>
<USER_TASK:>
Description:
def popsizes():
	""" Example: modeling changes in population size Simple example Let's look at an example:   A simple bottleneck In order to change population size, one simply has to change the values in the "nlist".   For example, here is a population bottleneck:   Please note the last command, which changes the concatenated array from an array of 64 bit signed integers to 32 bit unsigned integers. Exponential growth """
---------------------------------------------------------------------------------------------------
MODEL GENERATION - ZERO SHOT:

Solve the following problem using Python, implementing the functions described below, one line at a time


Description:
def popsizes():
	""" Example: modeling changes in po

This dataset is OKAYISH so we will take 35% of this dataset and also based on the length of prompt

**Preparing final dataset**

In [45]:
# converting dataset_2 to pandas dataframe
import pandas as pd
df_1 = prompted_dataset_3['test'].to_pandas()

In [46]:
df_2 = prompted_dataset_2['train'].to_pandas()
df_3 = prompted_dataset_2['test'].to_pandas()
df_4 = prompted_dataset_2['validation'].to_pandas()

In [47]:
df_5 = prompted_dataset_4['train'].to_pandas()
df_6 = prompted_dataset_4['test'].to_pandas()

In [48]:
l = int(len(df_5)*0.25)
df_5 = df_5[:l]
df_6 = df_6[:l]

In [49]:
frames = [df_1, df_2,df_3, df_4, df_5, df_6]

df_final = pd.concat(frames, axis = 0, join = 'outer')
df_final.reset_index(inplace = True)
df_final.drop(['index'], axis = 1, inplace = True)
print(len(df_final))
df_final.head(5)

304542


Unnamed: 0,text_prompt,code_prompt
0,<SYSTEM_TASK:>\nSolve the following problem us...,"for idx, elem in enumerate(numbers):\n ..."
1,<SYSTEM_TASK:>\nSolve the following problem us...,result = []\n current_string = []\n ...
2,<SYSTEM_TASK:>\nSolve the following problem us...,return number % 1.0\n
3,<SYSTEM_TASK:>\nSolve the following problem us...,balance = 0\n\n for op in operations:\n...
4,<SYSTEM_TASK:>\nSolve the following problem us...,mean = sum(numbers) / len(numbers)\n re...


In [50]:
df_final.drop_duplicates(inplace = True)
df_final.dropna(inplace = True)
df_final.sample(frac = 1)
df_final.reset_index(inplace = True)
df_final.drop(['index'], axis = 1, inplace = True)

In [51]:
def detect_lang(data):
    lang = detect.predict(data)[0][0]
    if lang == '__label__eng_Latn':
        return 'en'
    else:
        return ''
    
    
for i in range(len(df_final)):
    prompt = df_final['text_prompt'][i]
    prompt = prompt.replace('\n', '')
    prompt = prompt[:2000]
    lang = detect_lang(prompt)
    if lang != 'en':
        df_final.drop([i], axis = 0, inplace = True)
    else:
        pass

In [52]:
print(len(df_final))

297479


In [53]:
split = 0.6
train_df = df_final[:int(len(df_final)*split)]
test_df = df_final[int(len(df_final)*0.6): int(len(df_final)*0.8)]
val_df = df_final[int(len(df_final)*0.8):]

print(len(val_df), len(df_final), len(test_df))

59496 297479 59496


In [54]:
train_df.reset_index(inplace = True)
test_df.reset_index(inplace = True)
val_df.reset_index(inplace = True)

In [55]:
train_df.drop(['index'], axis = 1, inplace = True)
test_df.drop(['index'], axis = 1, inplace = True)
val_df.drop(['index'], axis = 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.drop(['index'], axis = 1, inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df.drop(['index'], axis = 1, inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_df.drop(['index'], axis = 1, inplace = True)


In [56]:
print(None in train_df, None in test_df, None in val_df)

False False False


In [57]:
train_df.to_csv('train_data.csv', index = False, index_label=None)

In [58]:
test_df.to_csv('test_data.csv', index = False, index_label=None)

In [59]:
val_df.to_csv('validation_data.csv', index = False, index_label=None)