In [None]:
import os
import pickle
import joblib 
import pandas as pd
from tqdm.auto import tqdm
from datasets import load_dataset

def calc_avg_line_length(text):
    """Return the mean number of characters per line in ``text``.

    Lines are obtained by splitting on the newline character only, so an
    empty string counts as a single line of length zero.
    """
    char_total = 0
    line_count = 0
    for line in text.split('\n'):
        char_total += len(line)
        line_count += 1
    return char_total / line_count

def calc_max_line_length(text):
    """Return the character count of the longest line in ``text``.

    Lines are obtained by splitting on the newline character only.
    """
    return max(map(len, text.split('\n')))

def calc_alphanum_fraction(text):
    """Return the share of characters in ``text`` that are alphanumeric.

    An empty ``text`` yields 0 rather than dividing by zero.
    """
    total_chars = len(text)
    if not total_chars:
        return 0
    alnum_chars = 0
    for char in text:
        if char.isalnum():
            alnum_chars += 1
    return alnum_chars / total_chars

def dataset_from_df(df):
    """Flatten a per-repository frame into one row per source file.

    Each row of ``df`` is read for the fields 'name', 'url', 'description',
    'stars', 'forks', 'last_updated', 'created', 'size', 'license' and
    'code'. The 'code' value is indexed first by programming language and
    then by file key to reach each file's text.

    Returns:
        A pandas DataFrame with the repository metadata repeated on every
        file row, plus 'language', 'text' and three per-file text statistics.
    """
    # Output column -> field of the repository row it is copied from.
    repo_field_for_column = {
        'repo_name': 'name',
        'repo_url': 'url',
        'repo_description': 'description',
        'repo_stars': 'stars',
        'repo_forks': 'forks',
        'repo_last_updated': 'last_updated',
        'repo_created_at': 'created',
        'repo_size': 'size',
        'repo_license': 'license',
    }
    # NOTE(review): 'alphnanum_fraction' looks like a typo of
    # 'alphanum_fraction'; the spelling is kept because it is part of the
    # returned schema.
    per_file_columns = (
        'language',
        'text',
        'avg_line_length',
        'max_line_length',
        'alphnanum_fraction',
    )
    columns = {
        column: [] for column in (*repo_field_for_column, *per_file_columns)
    }

    for row_idx in tqdm(range(len(df))):
        repo = df.iloc[row_idx]
        code_by_language = repo['code']
        for language in code_by_language:
            files = code_by_language[language]
            for file_key in files:
                source = files[file_key]
                # Repository metadata is duplicated on every file row.
                for column, field in repo_field_for_column.items():
                    columns[column].append(repo[field])
                columns['language'].append(language)
                columns['text'].append(source)
                columns['avg_line_length'].append(calc_avg_line_length(source))
                columns['max_line_length'].append(calc_max_line_length(source))
                columns['alphnanum_fraction'].append(calc_alphanum_fraction(source))

    return pd.DataFrame(columns)

def huggingface_dataset_from_df(df):
    """Convert the repository frame into a Hugging Face dataset.

    The frame is flattened with ``dataset_from_df``, pickled to a temporary
    file in the current working directory, and read back through
    ``load_dataset("pandas", ...)``.

    Args:
        df: Repository frame accepted by ``dataset_from_df``.

    Returns:
        Whatever ``load_dataset("pandas", data_files=...)`` returns for the
        pickled frame.
    """
    dataset = dataset_from_df(df)
    pickle_path = 'hf_ds.pkl'
    try:
        with open(pickle_path, 'wb') as f:
            pickle.dump(dataset, f)
        return load_dataset("pandas", data_files=pickle_path)
    finally:
        # Always remove the temporary pickle, even when dumping or loading
        # fails, so a failed run does not leave a stale file behind.
        if os.path.exists(pickle_path):
            os.remove(pickle_path)


In [None]:
# Directory that is scanned for saved search results.
joblibs_path = '.././saved_searches'
# os.listdir returns entries in arbitrary order, so sort the matching paths to
# make the choice of joblibs[0] reproducible across runs and machines.
joblibs = sorted(
    os.path.join(joblibs_path, f)
    for f in os.listdir(joblibs_path)
    if f.endswith('.joblib')
)
if not joblibs:
    raise FileNotFoundError(f"No .joblib files found in {joblibs_path!r}")
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load files from a trusted source.
df = joblib.load(joblibs[0])

In [None]:
# Flatten the loaded search results to one row per file and load them as a
# Hugging Face dataset.
raw_datasets = huggingface_dataset_from_df(df)

In [None]:
# Keep only the examples whose 'language' column equals 'Python'.
raw_datasets = raw_datasets.filter(lambda example: example['language'] == 'Python')

In [None]:
# Display the dataset summary as the cell output.
raw_datasets

In [None]:
from datasets import DatasetDict

# Fixed seed so the shuffled train/test/valid membership is identical on every
# run; without it each execution produces different splits.
SPLIT_SEED = 42

# NOTE: this cell rebinds raw_datasets, so running it a second time without
# re-running the cells above would re-split the already reduced 'train' split.

# 90% train, 10% test + validation
train_testvalid = raw_datasets['train'].train_test_split(test_size=0.1, seed=SPLIT_SEED)
# Split the 10% test + valid in half test, half valid
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=SPLIT_SEED)
# Gather the three splits into a single DatasetDict
raw_datasets = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})

In [None]:
from transformers import AutoTokenizer

# Maximum number of tokens per chunk; reused below by tokenize() and by the
# model config.
context_length = 128
tokenizer = AutoTokenizer.from_pretrained("huggingface-course/code-search-net-tokenizer")

# Smoke test: tokenize the first two training texts with truncation and
# overflow enabled, and inspect what comes back.
outputs = tokenizer(
    raw_datasets["train"][:2]["text"],
    truncation=True,
    max_length=context_length,
    return_overflowing_tokens=True,
    return_length=True,
)

# Number of returned chunks, the length of each chunk, and the mapping from
# each chunk back to the sample it came from.
print(f"Input IDs length: {len(outputs['input_ids'])}")
print(f"Input chunk lengths: {(outputs['length'])}")
print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")

In [None]:
def tokenize(element):
    """Tokenize a batch of texts and keep only the full-length chunks.

    ``element["text"]` is passed to the module-level ``tokenizer`` with
    truncation and overflow enabled. Any returned chunk whose reported length
    differs from ``context_length`` is discarded.

    Returns:
        A dict with a single key, "input_ids", holding the retained chunks.
    """
    encoded = tokenizer(
        element["text"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    full_chunks = [
        chunk_ids
        for chunk_len, chunk_ids in zip(encoded["length"], encoded["input_ids"])
        if chunk_len == context_length
    ]
    return {"input_ids": full_chunks}


# Apply tokenize() to every split in batches. All original columns are removed
# so that only the "input_ids" returned by tokenize() remain.
tokenized_datasets = raw_datasets.map(
    tokenize, batched=True, remove_columns=raw_datasets["train"].column_names
)
# Display the tokenized dataset summary as the cell output.
tokenized_datasets

In [None]:
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig

# Start from the "gpt2" configuration and override the vocabulary size and
# special-token ids to match the tokenizer loaded above.
# NOTE(review): newer transformers releases may ignore `n_ctx` for GPT-2 and
# size the position embeddings from `n_positions` instead — confirm against
# the installed version whether the context length is actually applied.
config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

In [None]:
# Build the model from the config alone (no from_pretrained call, so no
# pretrained weights are loaded here).
model = GPT2LMHeadModel(config)
# Total element count over all parameter tensors, reported in millions.
model_size = sum(t.numel() for t in model.parameters())
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")

In [None]:
from transformers import DataCollatorForLanguageModeling

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# mlm=False turns off masked-language-modeling in the collator.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [None]:
# Smoke test: collate the first five training examples and print the shape of
# every entry in the resulting batch.
out = data_collator([tokenized_datasets["train"][i] for i in range(5)])
for key in out:
    print(f"{key} shape: {out[key].shape}")

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from the notebook; presumably required
# because the training arguments below set push_to_hub=True.
notebook_login()

In [None]:
from transformers import Trainer, TrainingArguments

# Training configuration. With per_device_train_batch_size=8 and
# gradient_accumulation_steps=8, each optimizer step covers 64 sequences per
# device.
# NOTE(review): newer transformers releases may rename `evaluation_strategy`
# to `eval_strategy` — confirm against the installed version.
# NOTE(review): fp16=True presumably requires a GPU that supports it — confirm.
args = TrainingArguments(
    output_dir="gpt2-from-scratch-customizable-code-assistant",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="steps",
    eval_steps=5_000,
    logging_steps=5_000,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=5_000,
    fp16=True,
    push_to_hub=True,
)

# Train on the "train" split and evaluate on the "valid" split; the "test"
# split is not passed to the Trainer.
# NOTE(review): newer transformers releases may prefer `processing_class` over
# `tokenizer` here — confirm against the installed version.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
)

In [None]:
# Run training with the arguments configured above.
trainer.train()