In [1]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
! pip install datasets
! pip install torch



In [2]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

# special tokens for prompting
system_token = "<SYSTEM_TASK:>"
user_token = "<USER_TASK:>"
assistant_token = "<ASSISTANT_TASK:>"
end_token = "<END_TASK>"


model_checkpoint = 'codeparrot/codeparrot-small'
# model_checkpoint = "codeparrot/codeparrot"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint,
                                          additional_special_tokens = ["<SYSTEM_TASK:>", "<USER_TASK:>", "<ASSISTANT_TASK:>", "<END_TASK>"],
                                          pad_token = "<PAD>",
                                          )

Downloading (…)okenizer_config.json:   0%|          | 0.00/259 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/497k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/277k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/840k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
tokenizer


GPT2TokenizerFast(name_or_path='codeparrot/codeparrot-small', vocab_size=32768, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<PAD>', 'additional_special_tokens': ['<SYSTEM_TASK:>', '<USER_TASK:>', '<ASSISTANT_TASK:>', '<END_TASK>']}, clean_up_tokenization_spaces=True)

In [4]:
special_token_dict = tokenizer.special_tokens_map
print(special_token_dict)

{'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<PAD>', 'additional_special_tokens': ['<SYSTEM_TASK:>', '<USER_TASK:>', '<ASSISTANT_TASK:>', '<END_TASK>']}


In [5]:
tokenizer.add_special_tokens(special_token_dict)


0

In [6]:
context_length = 10000

In [7]:
import numpy as np
from transformers import BitsAndBytesConfig, AutoConfig, AutoModelForCausalLM, AutoModelWithLMHead
import torch
from accelerate import init_empty_weights, infer_auto_device_map

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [8]:
config = AutoConfig.from_pretrained(model_checkpoint,
                                    vocab_size=tokenizer.vocab_size,
                                    bos_token_id=tokenizer.bos_token_id,
                                    eos_token_id=tokenizer.eos_token_id,
)

with init_empty_weights():
    model = AutoModelForCausalLM.from_config(config)
    
print(model)

Downloading (…)lve/main/config.json:   0%|          | 0.00/903 [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(32768, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=32768, bias=False)
)


In [9]:
device_map = infer_auto_device_map(model, no_split_module_classes = ['GPT2Block'])
print(device_map)

{'': 0}


In [10]:
import torch 

nf4_config = BitsAndBytesConfig(
   load_in_2bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_use_double_quant=True,
   bnb_4bit_compute_dtype=torch.bfloat16
)


# model_id = "codeparrot/codeparrot"
model_id = 'codeparrot/codeparrot-small'
model =AutoModelForCausalLM.from_pretrained(model_id,
                                            config = config,
                                            device_map="auto",
                                            quantization_config=nf4_config,
                                            torch_dtype=torch.bfloat16)

Downloading pytorch_model.bin:   0%|          | 0.00/457M [00:00<?, ?B/s]

In [11]:
model.resize_token_embeddings(len(tokenizer))

You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 32773. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


Embedding(32773, 768)

In [12]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

print(print_number_of_trainable_model_parameters(model))

trainable model parameters: 111012096
all model parameters: 111012096
percentage of trainable model parameters: 100.00%


In [13]:
print(model)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(32773, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=32773, bias=False)
)


In [14]:
text_prompt = 'def return_files_size(filename):'
code =     'return os.path.getsize(filepath)'

inputs = tokenizer(text_prompt, return_tensors = 'pt')

output = model.generate(**inputs,
                       max_new_tokens = 50)

output = tokenizer.batch_decode(output,
                          skip_special_tokens=True
)

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{text_prompt}')
print(dash_line)
print(f'BASELINE HUMAN PYTHON CODE:\n{code}\n')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output[0]}')

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


---------------------------------------------------------------------------------------------------
INPUT PROMPT:
def return_files_size(filename):
---------------------------------------------------------------------------------------------------
BASELINE HUMAN PYTHON CODE:
return os.path.getsize(filepath)

---------------------------------------------------------------------------------------------------
MODEL GENERATION - ZERO SHOT:
def return_files_size(filename):
    """
    Return the size of the file in bytes.
    """
    return os.path.getsize(filename)


def return_file_size(filename):
    """
    Return the size of the file in bytes.
    """
   


In [15]:
from transformers import GenerationConfig
# generation_config = GenerationConfig(max_new_tokens=100, temperature=0.1)

text_prompt = 'def return_files_size(filename):'
code =     'return os.path.getsize(filepath)'

inputs = tokenizer(text_prompt, return_tensors='pt')
output = tokenizer.decode(
    model.generate(
        inputs["input_ids"],
        max_new_tokens = 100,
#         generation_config=generation_config,
        pad_token_id = tokenizer.pad_token_id,
    )[0],
    skip_special_tokens=True
)

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{text_prompt}')
print(dash_line)
print(f'BASELINE HUMAN PYTHON CODE:\n{code}\n')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')

---------------------------------------------------------------------------------------------------
INPUT PROMPT:
def return_files_size(filename):
---------------------------------------------------------------------------------------------------
BASELINE HUMAN PYTHON CODE:
return os.path.getsize(filepath)

---------------------------------------------------------------------------------------------------
MODEL GENERATION - ZERO SHOT:
def return_files_size(filename):
    """
    Return the size of the file in bytes.
    """
    return os.path.getsize(filename)


def return_file_size(filename):
    """
    Return the size of the file in bytes.
    """
    return os.path.getsize(filename)


def return_file_size_in_bytes(filename):
    """
    Return the size of the file in bytes.
    """
    return os.path.getsize(filename)





In [16]:
# text_prompt = '#Image Classsification using VGG16 model\ndef VGG16_model_architecture(): #making prediction using the model\ndef model_predict():'

text_prompt = 'def VGG16_model_architecture()\n"""\nInitialize VGG16 model a deep learning model trained on imagenet for performing Image Classification in the VGG16_model_architecture function\n"""\n\bdef train_test_split():\n\t"""Split the data X, Y in to train and test data using sklearn"""\n\bdef model_predict():\n\t"""Make prediction using the deep learning model defined above in VGG16_model_architecture function"""\n'

generation_config = GenerationConfig(max_new_tokens=100, temperature=0.1)

inputs = tokenizer(text_prompt, return_tensors = 'pt')

output = model.generate(**inputs,
                       max_new_tokens = 150,
                       generation_config=generation_config,
                       pad_token_id = tokenizer.pad_token_id,)

output = tokenizer.batch_decode(output,
                          skip_special_tokens=True
)

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{text_prompt}')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output[0]}')

---------------------------------------------------------------------------------------------------
INPUT PROMPT:
def VGG16_model_architecture()
"""
Initialize VGG16 model a deep learning model trained on imagenet for performing Image Classification in the VGG16_model_architecture function
"""
def train_test_split():
	"""Split the data X, Y in to train and test data using sklearn"""
def model_predict():
	"""Make prediction using the deep learning model defined above in VGG16_model_architecture function"""

---------------------------------------------------------------------------------------------------
MODEL GENERATION - ZERO SHOT:
def VGG16_model_architecture()
"""
Initialize VGG16 model a deep learning model trained on imagenet for performing Image Classification in the VGG16_model_architecture function
"""
def train_test_split():
	"""Split the data X, Y in to train and test data using sklearn"""
def model_predict():
	"""Make prediction using the deep learning model defined abo

In [17]:
# text_prompt = '#Image Classsification using VGG16 model\ndef VGG16_model_architecture(): #making prediction using the model\ndef model_predict():'

text_prompt = 'def VGG16_model_architecture()\n"""\nInitialize VGG16 model a deep learning model trained on imagenet for performing Image Classification in the VGG16_model_architecture function\n"""\n\bdef train_test_split():\n\t"""Split the data X, Y in to train and test data using sklearn"""\n\bdef model_predict():\n\t"""Make prediction using the deep learning model defined above in VGG16_model_architecture function"""\n'

inputs = tokenizer(text_prompt, return_tensors = 'pt')

output = tokenizer.decode(
    model.generate(
        inputs["input_ids"],
        max_new_tokens = 200,
        pad_token_id = tokenizer.pad_token_id,
    )[0],
    skip_special_tokens=True
)

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{text_prompt}')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')

---------------------------------------------------------------------------------------------------
INPUT PROMPT:
def VGG16_model_architecture()
"""
Initialize VGG16 model a deep learning model trained on imagenet for performing Image Classification in the VGG16_model_architecture function
"""
def train_test_split():
	"""Split the data X, Y in to train and test data using sklearn"""
def model_predict():
	"""Make prediction using the deep learning model defined above in VGG16_model_architecture function"""

---------------------------------------------------------------------------------------------------
MODEL GENERATION - ZERO SHOT:
def VGG16_model_architecture()
"""
Initialize VGG16 model a deep learning model trained on imagenet for performing Image Classification in the VGG16_model_architecture function
"""
def train_test_split():
	"""Split the data X, Y in to train and test data using sklearn"""
def model_predict():
	"""Make prediction using the deep learning model defined abo

Best Prompt

In [18]:
add_prompt = '<SYSTEM_TASK:>\nGiven the following code description, write Python code to implement the functionality described below\n<END_TASK>\n<USER_TASK:>\nDescription:\n'

text_prompt = 'def VGG16_model_architecture()\n"""\nInitialize VGG16 model a deep learning model trained on imagenet for performing Image Classification in the VGG16_model_architecture function\n"""\n\bdef train_test_split():\n\t"""Split the data X, Y in to train and test data using sklearn"""\n\bdef model_predict():\n\t"""Make prediction using the deep learning model defined above in VGG16_model_architecture function"""\n'

final_prompt = add_prompt + text_prompt

inputs = tokenizer(final_prompt, return_tensors = 'pt')

output = tokenizer.decode(
    model.generate(
        inputs["input_ids"],
        max_new_tokens = 350,
        pad_token_id = tokenizer.pad_token_id,
    )[0],
    skip_special_tokens=True
)

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{text_prompt}')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')

---------------------------------------------------------------------------------------------------
INPUT PROMPT:
def VGG16_model_architecture()
"""
Initialize VGG16 model a deep learning model trained on imagenet for performing Image Classification in the VGG16_model_architecture function
"""
def train_test_split():
	"""Split the data X, Y in to train and test data using sklearn"""
def model_predict():
	"""Make prediction using the deep learning model defined above in VGG16_model_architecture function"""

---------------------------------------------------------------------------------------------------
MODEL GENERATION - ZERO SHOT:

Given the following code description, write Python code to implement the functionality described below


Description:
def VGG16_model_architecture()
"""
Initialize VGG16 model a deep learning model trained on imagenet for performing Image Classification in the VGG16_model_architecture function
"""
def train_test_split():
	"""Split the data X, Y in to t

In [19]:
add_prompt = "<SYSTEM_TASK:>\nCodeParrot, I'd like you to write a Python code snippet to implement the functionality described below\n<END_TASK>\n<USER_TASK:>\nDescription:\n"


text_prompt = 'def VGG16_model_architecture()\n"""\nInitialize VGG16 model a deep learning model trained on imagenet for performing Image Classification in the VGG16_model_architecture function\n"""\n\bdef train_test_split():\n\t"""Split the data X, Y in to train and test data using sklearn"""\n\bdef model_predict():\n\t"""Make prediction using the deep learning model defined above in VGG16_model_architecture function"""\n'

final_prompt = add_prompt + text_prompt

inputs = tokenizer(final_prompt, return_tensors = 'pt')

output = tokenizer.decode(
    model.generate(
        inputs["input_ids"],
        max_new_tokens = 200,
        pad_token_id = tokenizer.pad_token_id,
    )[0],
    skip_special_tokens=True
)

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{text_prompt}')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')

---------------------------------------------------------------------------------------------------
INPUT PROMPT:
def VGG16_model_architecture()
"""
Initialize VGG16 model a deep learning model trained on imagenet for performing Image Classification in the VGG16_model_architecture function
"""
def train_test_split():
	"""Split the data X, Y in to train and test data using sklearn"""
def model_predict():
	"""Make prediction using the deep learning model defined above in VGG16_model_architecture function"""

---------------------------------------------------------------------------------------------------
MODEL GENERATION - ZERO SHOT:

CodeParrot, I'd like you to write a Python code snippet to implement the functionality described below


Description:
def VGG16_model_architecture()
"""
Initialize VGG16 model a deep learning model trained on imagenet for performing Image Classification in the VGG16_model_architecture function
"""
def train_test_split():
	"""Split the data X, Y in to t

In [20]:
add_prompt = "<SYSTEM_TASK:>\nCodeParrot, I'd like you to write a Python code snippet to implement the functionality described below\n<END_TASK>\n<USER_TASK:>\nDescription:\n"


text_prompt = 'def Initialize VGG16 model a deep learning model trained on imagenet for performing Image Classification in the VGG16_model_architecture function():\n\t\n\bdef Split the data X, Y in to train and test data using sklearn():\n\t\n\bdef Make prediction using the deep learning model defined above in VGG16_model_architecture function():\n'

final_prompt = text_prompt

inputs = tokenizer(final_prompt, return_tensors = 'pt')

output = tokenizer.decode(
    model.generate(
        inputs["input_ids"],
        max_new_tokens = 200,
        pad_token_id = tokenizer.pad_token_id,
    )[0],
    skip_special_tokens=True
)

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{text_prompt}')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')

---------------------------------------------------------------------------------------------------
INPUT PROMPT:
def Initialize VGG16 model a deep learning model trained on imagenet for performing Image Classification in the VGG16_model_architecture function():
	
def Split the data X, Y in to train and test data using sklearn():
	
def Make prediction using the deep learning model defined above in VGG16_model_architecture function():

---------------------------------------------------------------------------------------------------
MODEL GENERATION - ZERO SHOT:
def Initialize VGG16 model a deep learning model trained on imagenet for performing Image Classification in the VGG16_model_architecture function():
	
def Split the data X, Y in to train and test data using sklearn():
	
def Make prediction using the deep learning model defined above in VGG16_model_architecture function():

def Make prediction using the deep learning model defined above in VGG16_model_architecture function(