In [3]:
%pip install scikit-learn
%pip install transformers
%pip install sentencepiece
%pip install torch torchvision torchaudio
%pip install accelerate -U
%pip install datasets
%pip install torch
%pip install transformers
%pip install tiktoken

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Collecting accelerate
  Downloading accelerate-0.25.0-py3-none-any.whl.metadata (18 kB)
Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hInstalling collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.24.1
    Uninstalling accelerate-0.24.1:
      Successfully uninstalled accelerate-0.24.1
Successfully installed accelerate-0.25.0
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated package

In [9]:
import torch
from transformers import CodeGenTokenizer, CodeGenForCausalLM
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW
from sklearn.model_selection import train_test_split
import pandas as pd

In [3]:
class CodeDataset(Dataset):
    def __init__(self, descriptions, codes, tokenizer, max_length=512):
        self.descriptions = descriptions
        self.codes = codes
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.descriptions)

    def __getitem__(self, idx):
        description = self.descriptions[idx]
        code = self.codes[idx]
        tokenized_input = self.tokenizer(description, code, max_length=self.max_length, truncation=True, padding="max_length", return_tensors="pt")
        input_ids = tokenized_input.input_ids.squeeze()
        attention_mask = tokenized_input.attention_mask.squeeze()
        return {"input_ids": input_ids, "attention_mask": attention_mask}


In [8]:

# Load your dataset
df = pd.read_csv('dataset.csv')
descriptions = df['description'].tolist()
codes = df['code'].tolist()


In [9]:

# Split the dataset
desc_train, desc_val, code_train, code_val = train_test_split(descriptions, codes, test_size=0.1)


In [10]:
# Initialize tokenizer
tokenizer = CodeGenTokenizer.from_pretrained("Salesforce/codegen-350M-mono")
tokenizer.pad_token = tokenizer.eos_token


In [11]:
# Create datasets
train_dataset = CodeDataset(desc_train, code_train, tokenizer)
val_dataset = CodeDataset(desc_val, code_val, tokenizer)


In [12]:
model = CodeGenForCausalLM.from_pretrained("Salesforce/codegen-350M-mono")


In [13]:
model.train()


CodeGenForCausalLM(
  (transformer): CodeGenModel(
    (wte): Embedding(51200, 1024)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-19): 20 x CodeGenBlock(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): CodeGenAttention(
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
          (qkv_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (out_proj): Linear(in_features=1024, out_features=1024, bias=False)
        )
        (mlp): CodeGenMLP(
          (fc_in): Linear(in_features=1024, out_features=4096, bias=True)
          (fc_out): Linear(in_features=4096, out_features=1024, bias=True)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=51200, bias=True)
)

In [14]:
optimizer = AdamW(model.parameters(), lr=5e-5)




In [15]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

CodeGenForCausalLM(
  (transformer): CodeGenModel(
    (wte): Embedding(51200, 1024)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-19): 20 x CodeGenBlock(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): CodeGenAttention(
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
          (qkv_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (out_proj): Linear(in_features=1024, out_features=1024, bias=False)
        )
        (mlp): CodeGenMLP(
          (fc_in): Linear(in_features=1024, out_features=4096, bias=True)
          (fc_out): Linear(in_features=4096, out_features=1024, bias=True)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=51200, bias=True)
)

In [16]:
epochs = 3
batch_size = 8
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [17]:
for epoch in range(epochs):
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        print(f"Epoch: {epoch}, Loss: {loss.item()}")

Epoch: 0, Loss: 9.040763854980469
Epoch: 1, Loss: 0.6241036057472229
Epoch: 2, Loss: 0.4438675343990326


In [18]:
model.save_pretrained("./fine_tuned_codegen")
tokenizer.save_pretrained("./fine_tuned_codegen")


('./fine_tuned_codegen/tokenizer_config.json',
 './fine_tuned_codegen/special_tokens_map.json',
 './fine_tuned_codegen/vocab.json',
 './fine_tuned_codegen/merges.txt',
 './fine_tuned_codegen/added_tokens.json')

In [4]:
model_path = "./fine_tuned_codegen/"  # Path where you saved your fine-tuned model

tokenizer = CodeGenTokenizer.from_pretrained(model_path)
tokenizer.padding_side = "left"


In [10]:
from transformers import AutoModelForCausalLM, AutoTokenizer

In [11]:
checkpoint = "Salesforce/codegen-350M-mono"
model = AutoModelForCausalLM.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [12]:
text = "def two_sum():"
completion = model.generate(**tokenizer(text, return_tensors="pt"))
print(tokenizer.decode(completion[0]))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


def two_sum():
    """
    Given an array of integers, find the two numbers


In [7]:
def generate_code(query, max_length=2048):
    # Encode the input query
    inputs = tokenizer.encode_plus(query, return_tensors="pt", add_special_tokens=True, padding='max_length', max_length=max_length, truncation=True)

    # Generate code
    outputs = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        pad_token_id=tokenizer.eos_token_id,
        num_return_sequences=1
    )

    # Decode and return the generated code
    generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_code


In [8]:
# Example queries
queries = [
    "generate list of top ten most played tracks",
    "most saved tracks from 2019",
]

for query in queries:
    generated_code = generate_code(query)
    print(f"Query: {query}\nGenerated Code:\n{generated_code}\n")


Using pad_token, but it is not set yet.


ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("Salesforce/codegen-350M-mono")
model = AutoModelForCausalLM.from_pretrained("Salesforce/codegen-350M-mono")

text = "def sum_of_two_numbers():"
input_ids = tokenizer(text, return_tensors="pt").input_ids

generated_ids = model.generate(input_ids, max_length=128)
print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


def sum_of_two_numbers():
    print("Sum of two numbers:")
    print(sum(numbers))

def multiply_two_numbers():
    print("Multiply two numbers:")
    print(multiply(numbers))

def divide_two_numbers():
    print("Divide two numbers:")
    print(divide(numbers))

def print_sum_of_numbers():
    print("Sum of numbers:")
    print(sum(numbers))

def print_product_
