In [None]:
!pip install transformers --upgrade

In [None]:
# Mount Google Drive at /content/drive. The dataset JSON files and the model
# output directory used by later cells all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install accelerate
!pip install -i https://pypi.org/simple/ bitsandbytes
# !pip install accelerate bitsandbytes
# !pip install --upgrade accelerate
# !pip install -U bitsandbytes-cuda111
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Hub identifier of the base model and tokenizer.
checkpoint = "bigcode/starcoder2-3b"

# Load the weights in 8-bit through bitsandbytes.
# to use 4bit use `load_in_4bit=True` instead
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    quantization_config=quantization_config,
)

In [None]:
class CodingProblem:
    """A single example: a problem description paired with its solution text."""

    def __init__(self, problem_statement, solution):
        # Both values are stored exactly as given; nothing is validated or copied.
        self.problem_statement, self.solution = problem_statement, solution

In [None]:
import json

def load_format_1(file_path):
    """Load instruction-style records from a JSON file into CodingProblem objects.

    The file must contain a JSON list of objects. For each record the problem
    statement is ``instruction + " " + input`` and the solution is ``output``.
    A key that is missing or holds JSON ``null`` is treated as an empty string.

    Args:
        file_path: Path to the JSON file.

    Returns:
        list[CodingProblem]: One problem per record, in file order.
    """
    # Decode as UTF-8 explicitly so the result does not depend on the platform locale.
    with open(file_path, 'r', encoding='utf-8') as file:
        data_list = json.load(file)

    problems = []
    for data in data_list:
        # `or ""` also covers an explicit null, which dict.get's default does not
        # (it would otherwise raise TypeError in the concatenation below).
        instruction = data.get("instruction") or ""
        extra_input = data.get("input") or ""
        problem_statement = instruction + " " + extra_input
        solution = data.get("output") or ""
        problems.append(CodingProblem(problem_statement=problem_statement, solution=solution))
    return problems

In [None]:
# Source 1: instruction/input/output records, parsed by load_format_1.
# `problems` is the running list that the following cells extend in place.
file_path1 = '/content/drive/MyDrive/StarCoder/tigerbot-kaggle-leetcodesolutions-en-2k.json'
problems = load_format_1(file_path1)

In [None]:
def load_format_2(file_path, solution_keys=("solution_py", "solution_js", "solution_java", "solution_c")):
    """Load title/algo_input records from a JSON file into CodingProblem objects.

    The file must contain a JSON list of objects. For each record the problem
    statement is ``title + " " + algo_input`` and the solution is the values of
    ``solution_keys`` joined with single spaces, in the given order. A key that
    is missing or holds JSON ``null`` contributes an empty string.

    Args:
        file_path: Path to the JSON file.
        solution_keys: Record keys whose values make up the solution text. The
            default concatenates the Python, JavaScript, Java and C solutions;
            pass e.g. ``("solution_py",)`` to keep a single language.

    Returns:
        list[CodingProblem]: One problem per record, in file order.
    """
    # Decode as UTF-8 explicitly so the result does not depend on the platform locale.
    with open(file_path, 'r', encoding='utf-8') as file:
        data_list = json.load(file)

    new_problems = []
    for data in data_list:
        # `or ""` also covers an explicit null, which dict.get's default does not.
        problem_statement = (data.get("title") or "") + " " + (data.get("algo_input") or "")
        # All selected solutions are concatenated, not just one of them.
        solution = " ".join((data.get(key) or "") for key in solution_keys)
        new_problems.append(CodingProblem(problem_statement=problem_statement, solution=solution))
    return new_problems

In [None]:
# Source 2: title/algo_input records with per-language solutions.
file_path2 = '/content/drive/MyDrive/StarCoder/train.json'
new_problems = load_format_2(file_path2)

# Grow the existing `problems` list in place with the new entries.
# NOTE: this mutates `problems`, so re-running the cell appends the same items again.
problems += new_problems

In [None]:
# Source 3: same record layout as train.json, read from the evaluation file.
file_path3 = '/content/drive/MyDrive/StarCoder/evaluation.json'
new_problems1 = load_format_2(file_path3)

# Grow the existing `problems` list in place with the evaluation entries.
# NOTE: this mutates `problems`, so re-running the cell appends the same items again.
problems += new_problems1

In [None]:
from torch.utils.data import Dataset

class CodingProblemsDataset(Dataset):
    """Map-style dataset that tokenizes one problem/solution pair per item.

    Each item is the tokenizer output for the text
    ``"Problem: <statement> Solution: <solution>"``, truncated and padded to
    ``max_length``, with the leading batch dimension squeezed away.
    """

    def __init__(self, problems, tokenizer, max_length=512):
        self.problems, self.tokenizer, self.max_length = problems, tokenizer, max_length

    def __len__(self):
        return len(self.problems)

    def __getitem__(self, idx):
        item = self.problems[idx]
        text = f"Problem: {item.problem_statement} Solution: {item.solution}"
        tokenized = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )
        # Squeeze away dim 0 (the batch dimension) of every returned tensor.
        return {name: tensor.squeeze(0) for name, tensor in tokenized.items()}


In [None]:
from torch.utils.data import DataLoader

# Wrap the combined problem list so each item is tokenized on access.
dataset = CodingProblemsDataset(problems=problems, tokenizer=tokenizer, max_length=512)

# Batch and shuffle through a DataLoader.
data_loader = DataLoader(
    dataset,
    batch_size=4,  # Adjust batch_size based on your RAM
    shuffle=True,
)


In [None]:
# The dataset and the batch generator both ask the tokenizer to pad.
# If this tokenizer defines no pad token, reuse the end-of-sequence token for it.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Proceed with your data generation and training loop


In [None]:
def data_generator(problems, tokenizer, batch_size=4, max_length=512):
    """Yield tokenized batches of consecutive problems.

    Problems are taken in order, ``batch_size`` at a time (the last batch may be
    shorter). Each problem is rendered as
    ``"Problem: <statement> Solution: <solution>"`` and the whole batch is passed
    to ``tokenizer`` with truncation at ``max_length``, ``padding=True`` and
    ``return_tensors='pt'``.
    """
    for start in range(0, len(problems), batch_size):
        texts = [
            f"Problem: {item.problem_statement} Solution: {item.solution}"
            for item in problems[start:start + batch_size]
        ]
        yield tokenizer(
            texts,
            truncation=True,
            padding=True,
            max_length=max_length,
            return_tensors='pt',
        )

# Use the generator during training
import torch

# Fall back to CPU so this cell does not crash on a runtime without a GPU.
generator_device = 'cuda' if torch.cuda.is_available() else 'cpu'

for batch_encodings in data_generator(problems, tokenizer):
    # Move tensors to the selected device
    batch_encodings = {k: v.to(generator_device) for k, v in batch_encodings.items()}
    # Continue with your training step


In [None]:
# Show the GPU status of this runtime before loading the full-precision model.
!nvidia-smi

In [None]:
import gc
import json

import torch
# AdamW comes from torch: the copy in transformers is deprecated.
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM, get_linear_schedule_with_warmup

# Ensure GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyperparameters. `num_epochs` must exist before the scheduler is built because
# the total number of scheduler steps depends on it.
num_epochs = 3
learning_rate = 5e-5

# Directory that receives the fine-tuned model and its tokenizer.
output_dir = '/content/drive/MyDrive/StarCoder'

# CodingProblem, load_format_1, load_format_2 and CodingProblemsDataset are
# defined in earlier cells of this notebook.

# Load and combine data from multiple sources
file_path1 = '/content/drive/MyDrive/StarCoder/tigerbot-kaggle-leetcodesolutions-en-2k.json'
problems_format_1 = load_format_1(file_path1)

file_path2 = '/content/drive/MyDrive/StarCoder/train.json'
problems_format_2 = load_format_2(file_path2)

file_path3 = '/content/drive/MyDrive/StarCoder/evaluation.json'
problems_format_3 = load_format_2(file_path3)

all_problems = problems_format_1 + problems_format_2 + problems_format_3

# Load tokenizer and model (full precision this time, moved to `device`)
checkpoint = "bigcode/starcoder2-3b"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)

# If tokenizer does not have a pad token, set it to eos_token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Create the dataset and DataLoader using the combined list of problems
dataset = CodingProblemsDataset(all_problems, tokenizer, max_length=512)
data_loader = DataLoader(dataset, batch_size=2, shuffle=True, num_workers=4, pin_memory=True)

# Setup optimizer and scheduler.
# eps and weight_decay are passed explicitly to mirror the defaults of the
# transformers AdamW this replaces (torch's own default weight_decay is 0.01).
optimizer = AdamW(model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(data_loader) * num_epochs,
)

# Drop the extra names for the problem lists; `dataset` keeps its own reference
# to the combined list, so the examples themselves stay alive.
del problems_format_1, problems_format_2, problems_format_3, all_problems
gc.collect()  # Python's garbage collector

# PyTorch's cache clearing (helpful if moving tensors between CPU and GPU)
torch.cuda.empty_cache()


# Training loop
model.train()
for epoch in range(num_epochs):
    for batch in data_loader:
        batch = {k: v.to(device) for k, v in batch.items()}

        # Causal-LM labels are the input ids, but padded positions must not
        # contribute to the loss: -100 is the index the loss function ignores.
        # This matters here because every item is padded to max_length.
        labels = batch['input_ids'].clone()
        labels[batch['attention_mask'] == 0] = -100

        outputs = model(**batch, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        print(f"Epoch: {epoch}, Loss: {loss.item()}")

# Save the model together with its tokenizer, so the directory can be loaded
# later with both AutoModelForCausalLM and AutoTokenizer.
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)


In [None]:
# Directory the training cell saved into.
model_path = '/content/drive/MyDrive/StarCoder'

# Load the fine-tuned model and tokenizer from that directory.
# NOTE(review): this requires tokenizer files to exist in model_path next to the
# model weights; model.save_pretrained on its own does not write them.
tokenizer = AutoTokenizer.from_pretrained(model_path)
# No .to(device) is applied here, so the model stays wherever from_pretrained puts it.
model = AutoModelForCausalLM.from_pretrained(model_path)

# Switch to evaluation mode before generating.
model.eval()

# Function to generate text using the model
def generate_text(prompt, max_length=50):
    """Generate a continuation of ``prompt`` with the module-level model and tokenizer.

    Args:
        prompt: Text to continue.
        max_length: Passed to ``model.generate`` as ``max_length``; the prompt's
            own tokens count toward this limit.

    Returns:
        str: The first returned sequence decoded to text, special tokens removed.
    """
    # Calling the tokenizer (rather than .encode) also yields the attention mask.
    # Both tensors are placed on the model's device so generation works whether
    # the model lives on CPU or GPU.
    inputs = tokenizer(prompt, return_tensors='pt').to(model.device)

    # Give generate() an explicit pad id; fall back to EOS when none is set.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    # Generate text using the model
    with torch.no_grad():
        output_ids = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_length=max_length,
            num_beams=5,
            early_stopping=True,
            pad_token_id=pad_token_id,
        )[0]

    # Decode the generated text
    return tokenizer.decode(output_ids, skip_special_tokens=True)

# Example usage: ask the model for a small Python function.
prompt = "Define a function in Python that calculates the sum of two numbers:"

# Generate with the default length limit and print the result under a heading.
generated_solution = generate_text(prompt)
print("Generated Solution:", generated_solution, sep="\n")