In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install transformers datasets torch tqdm

In [2]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AdamW
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# Check if MPS (Metal Performance Shaders) is available for Apple Silicon; if not, use CUDA or CPU
device = torch.device("mps" if torch.has_mps else "cuda" if torch.cuda.is_available() else "cpu")

print(f"Using device: {device}")

Using device: cuda


  device = torch.device("mps" if torch.has_mps else "cuda" if torch.cuda.is_available() else "cpu")


In [3]:
# Load dataset
dataset = load_dataset("iamtarun/python_code_instructions_18k_alpaca")['train']

# Sample data
print("Sample Data:")
print(dataset[0])


Downloading readme:   0%|          | 0.00/905 [00:00<?, ?B/s]

Downloading data: 100%|██████████| 11.4M/11.4M [00:00<00:00, 38.9MB/s]


Generating train split:   0%|          | 0/18612 [00:00<?, ? examples/s]

Sample Data:
{'instruction': 'Create a function to calculate the sum of a sequence of integers.', 'input': '[1, 2, 3, 4, 5]', 'output': '# Python code\ndef sum_sequence(sequence):\n  sum = 0\n  for num in sequence:\n    sum += num\n  return sum', 'prompt': 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nCreate a function to calculate the sum of a sequence of integers.\n\n### Input:\n[1, 2, 3, 4, 5]\n\n### Output:\n# Python code\ndef sum_sequence(sequence):\n  sum = 0\n  for num in sequence:\n    sum += num\n  return sum'}


In [4]:
# Function to format data for training
def format_data(instruction, input_text, output_text):
    user_prompt = f"Instruction: {instruction}\nInput: {input_text}"
    assistant_response = f"Output: {output_text}"
    return user_prompt, assistant_response

# Function to format prompts
def format_prompt(messages):
    return "\n".join([msg['content'] for msg in messages])

# Formatting dataset
formatted_data = [format_data(item['instruction'], item['input'], item['output']) for item in dataset]
print("Formatted Data Sample:")
print(formatted_data[0])


Formatted Data Sample:
('Instruction: Create a function to calculate the sum of a sequence of integers.\nInput: [1, 2, 3, 4, 5]', 'Output: # Python code\ndef sum_sequence(sequence):\n  sum = 0\n  for num in sequence:\n    sum += num\n  return sum')


In [5]:
# Load GPT-2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
model = GPT2LMHeadModel.from_pretrained("gpt2-medium").to(device)

# Adding special tokens
tokenizer.add_special_tokens({'pad_token': '[PAD]'})


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

1

In [6]:
# Create dataset class
class DatasetClass(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

# Encode data
train_encodings = tokenizer([f"{q} {tokenizer.eos_token} {a}" for q, a in formatted_data], truncation=True, padding=True)

# Create DataLoader
train_dataset = DatasetClass(train_encodings)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)


In [9]:

# Function to format data for training
def format_data(instruction, input_text, output_text):
    user_prompt = f"Instruction: {instruction}\nInput: {input_text}"
    assistant_response = f"Output: {output_text}"
    return user_prompt, assistant_response

# Function to format prompts
def format_prompt(messages):
    return "\n".join([msg['content'] for msg in messages])

# Formatting dataset
formatted_data = [format_data(item['instruction'], item['input'], item['output']) for item in dataset]
print("Formatted Data Sample:")
print(formatted_data[0])

# Load GPT-2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
model = GPT2LMHeadModel.from_pretrained("gpt2-medium").to(device)

# Adding special tokens
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))

# Create dataset class
class DatasetClass(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

# Encode data
train_encodings = tokenizer([f"{q} {tokenizer.eos_token} {a}" for q, a in formatted_data], truncation=True, padding=True, return_tensors='pt')

# Check the encoding output to ensure indices are in range
print("Sample Encoded Data:")
print(train_encodings.input_ids[0])

# Create DataLoader
train_dataset = DatasetClass(train_encodings)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)

# Fine-tuning parameters
optimizer = AdamW(model.parameters(), lr=1e-5)
epochs = 3
accumulation_steps = 8

model.train()
for epoch in range(epochs):
    epoch_loss = 0
    for i, batch in enumerate(tqdm(train_loader), start=1):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss
        loss = loss / accumulation_steps
        loss.backward()
        epoch_loss += loss.item()
        
        if (i % accumulation_steps) == 0:
            optimizer.step()
            optimizer.zero_grad()
        
        # Free up memory
        del input_ids, attention_mask, outputs, loss
        torch.cuda.empty_cache()
    
    print(f"Epoch {epoch + 1}/{epochs} Loss: {epoch_loss:.4f}")

# Put the model in evaluation mode
model.eval()

# Function to generate code based on a prompt
def generate_code(prompt, max_length=125):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(inputs['input_ids'], max_length=max_length, pad_token_id=tokenizer.eos_token_id)
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

# Testing the model
test_prompt = "Write me a python function that adds 2 numbers together."
generated_code = generate_code(test_prompt)
print(f"Generated Code:\n{generated_code}")

# Function for interactive code generation
def interactive_code_generation():
    while True:
        prompt = input("Enter instruction (or 'exit' to stop): ")
        if prompt.lower() == 'exit':
            break
        generated_code = generate_code(prompt)
        print(f"Generated Code:\n{generated_code}\n")

interactive_code_generation()

Formatted Data Sample:
('Instruction: Create a function to calculate the sum of a sequence of integers.\nInput: [1, 2, 3, 4, 5]', 'Output: # Python code\ndef sum_sequence(sequence):\n  sum = 0\n  for num in sequence:\n    sum += num\n  return sum')


OutOfMemoryError: CUDA out of memory. Tried to allocate 198.00 MiB. GPU 0 has a total capacty of 14.75 GiB of which 193.06 MiB is free. Process 3971 has 14.56 GiB memory in use. Of the allocated memory 14.39 GiB is allocated by PyTorch, and 41.44 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF