In [None]:
import os
import pickle
import joblib 
import pandas as pd
from tqdm.auto import tqdm
from datasets import load_dataset

def calc_avg_line_length(text):
    """Return the mean number of characters per line of ``text``.

    Lines are separated on the newline character only, and the separator
    itself is not counted. An empty string is treated as a single empty
    line, so the result is 0.0 rather than an error.
    """
    total_chars = 0
    line_count = 0
    for line in text.split('\n'):
        total_chars += len(line)
        line_count += 1
    return total_chars / line_count

def calc_max_line_length(text):
    """Return the length, in characters, of the longest line of ``text``.

    Lines are separated on the newline character only; an empty string
    yields 0.
    """
    return max(map(len, text.split('\n')))

def calc_alphanum_fraction(text):
    """Return the fraction of characters in ``text`` that are alphanumeric.

    A character counts when ``str.isalnum`` is true for it. Empty input
    returns 0 instead of dividing by zero.
    """
    total = len(text)
    if not total:
        return 0
    alnum_count = len([ch for ch in text if ch.isalnum()])
    return alnum_count / total

def dataset_from_df(df):
    """Flatten a per-repository frame into one row per source file.

    Each row of ``df`` is read for the repository fields 'name', 'url',
    'description', 'stars', 'forks', 'last_updated', 'created', 'size',
    'license' and for 'code'. The 'code' value is iterated as
    language -> {file key -> file text}
    (assumed to be nested dicts -- TODO confirm against the producer).

    Returns a ``pandas.DataFrame`` in which the repository fields are repeated
    for every file, alongside the file's language, text and three text
    statistics.
    """
    # Output column -> key of the repository row it is copied from.
    repo_fields = {
        'repo_name': 'name',
        'repo_url': 'url',
        'repo_description': 'description',
        'repo_stars': 'stars',
        'repo_forks': 'forks',
        'repo_last_updated': 'last_updated',
        'repo_created_at': 'created',
        'repo_size': 'size',
        'repo_license': 'license',
    }
    # Per-file columns. The spelling 'alphnanum_fraction' is kept exactly as-is
    # because it is the column name emitted by this function.
    file_fields = (
        'language',
        'text',
        'avg_line_length',
        'max_line_length',
        'alphnanum_fraction',
    )
    columns = {column: [] for column in (*repo_fields, *file_fields)}

    for row_idx in tqdm(range(len(df))):
        repo = df.iloc[row_idx]
        for programming_language, code_files in repo['code'].items():
            for text in code_files.values():
                for column, source_key in repo_fields.items():
                    columns[column].append(repo[source_key])
                columns['language'].append(programming_language)
                columns['text'].append(text)
                columns['avg_line_length'].append(calc_avg_line_length(text))
                columns['max_line_length'].append(calc_max_line_length(text))
                columns['alphnanum_fraction'].append(calc_alphanum_fraction(text))

    return pd.DataFrame(columns)

def huggingface_dataset_from_df(df):
    """Convert a per-repository frame into a Hugging Face dataset.

    The frame is first flattened with ``dataset_from_df``, pickled to a
    temporary file ('hf_ds.pkl' in the current working directory), and then
    loaded through ``load_dataset("pandas", ...)``.

    Args:
        df: Frame accepted by ``dataset_from_df``.

    Returns:
        Whatever ``load_dataset`` returns for the pickled frame.

    The temporary file is removed even when pickling or loading fails, so a
    failed run does not leave a stale 'hf_ds.pkl' behind.
    """
    dataset = dataset_from_df(df)
    tmp_path = 'hf_ds.pkl'
    try:
        # The file must be closed before load_dataset reads it (and before it
        # can be deleted on Windows), hence the nested ``with``.
        with open(tmp_path, 'wb') as f:
            pickle.dump(dataset, f)
        return load_dataset("pandas", data_files=tmp_path)
    finally:
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


In [None]:
# Locate the saved search results (*.joblib files) and build the raw dataset
# from the first one.
joblibs_path = '.././saved_searches'
# os.listdir returns entries in arbitrary order; sort so that "the first file"
# is the same file on every run and every machine.
joblibs = sorted(
    os.path.join(joblibs_path, f)
    for f in os.listdir(joblibs_path)
    if f.endswith('.joblib')
)
if not joblibs:
    raise FileNotFoundError(f"No .joblib files found in {joblibs_path!r}")
# NOTE(review): joblib.load unpickles arbitrary objects -- only load files
# that were produced by a trusted run of this project.
df = joblib.load(joblibs[0])
raw_datasets = huggingface_dataset_from_df(df)


### ========================================================== ========================================================== ========================================================== ==========================================================

In [2]:
from  datasets  import  load_dataset, DatasetDict
from transformers import AutoTokenizer
from transformers import GPT2LMHeadModel, AutoConfig, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments





# Downloading and loading a dataset from the hub.
raw_datasets = load_dataset("ammarnasr/Customizable-Code-Assistant-Data")

# Filtering the dataset to only Python examples.
raw_datasets = raw_datasets.filter(lambda example: example['language'] == 'Python')

# Splitting the dataset into train (90%), test (5%), and validation (5%) sets.
# A fixed seed keeps the three splits identical across kernel restarts, so the
# reported metrics always refer to the same examples.
split_seed = 42
train_testvalid = raw_datasets['train'].train_test_split(test_size=0.1, seed=split_seed)
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=split_seed)
raw_datasets = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})


# Tokenizing the dataset
tokenizer = AutoTokenizer.from_pretrained("huggingface-course/code-search-net-tokenizer")
# Number of tokens per training chunk; also used as the model context size.
context_length = 128
def tokenize(element):
    """Tokenize a batch of examples into fixed-size chunks of token ids.

    ``element["text"]` is passed to the module-level ``tokenizer`` with
    truncation at ``context_length`` and overflowing tokens returned, so a
    long text produces several chunks. Only chunks whose reported length
    equals ``context_length`` are kept; shorter ones are dropped.

    Returns a dict with a single key, "input_ids", holding the kept chunks.
    """
    encoded = tokenizer(
        element["text"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    full_chunks = [
        ids
        for n_tokens, ids in zip(encoded["length"], encoded["input_ids"])
        if n_tokens == context_length
    ]
    return {"input_ids": full_chunks}
# Replace every split's original columns with fixed-size "input_ids" chunks.
tokenized_datasets = raw_datasets.map(tokenize, batched=True, remove_columns=raw_datasets["train"].column_names)
print(f"Tokenized dataset: {tokenized_datasets['train'].num_rows} training samples")

# Preparing the model
# Start from the GPT-2 configuration, resized to this tokenizer's vocabulary
# and to the chunk length used during tokenization.
config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)
# Built from the config only, i.e. randomly initialised (training from scratch).
model = GPT2LMHeadModel(config)
model_size = sum(t.numel() for t in model.parameters())
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")

# Preparing the data collator
# The collator needs a pad token; reuse the end-of-sequence token for it.
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Preparing the training arguments
# With a per-device batch size of 1 and a single epoch, a single device takes
# one optimizer step per training chunk. A fixed interval of 5,000 steps is
# never reached on a small dataset, so the run would finish without ever
# evaluating, logging, or saving a checkpoint. Cap the interval at roughly a
# tenth of the epoch; large datasets keep the 5,000-step interval.
report_every_steps = max(1, min(5_000, tokenized_datasets["train"].num_rows // 10))
args = TrainingArguments(
    output_dir="gpt2-from-scratch-customizable-code-assistant",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    evaluation_strategy="steps",
    eval_steps=report_every_steps,
    logging_steps=report_every_steps,
    num_train_epochs=1,
    weight_decay=0.1,
    # NOTE(review): 1,000 warmup steps is a large share of a short run --
    # confirm this is intended for small datasets.
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=report_every_steps,
)
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
)


Found cached dataset parquet (C:/Users/Edin/.cache/huggingface/datasets/ammarnasr___parquet/ammarnasr--Customizable-Code-Assistant-Data-3467676a1c1517c6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached processed dataset at C:\Users\Edin\.cache\huggingface\datasets\ammarnasr___parquet\ammarnasr--Customizable-Code-Assistant-Data-3467676a1c1517c6\0.0.0\2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec\cache-f6cd86854f64fec7.arrow


Map:   0%|          | 0/475 [00:00<?, ? examples/s]

Map:   0%|          | 0/27 [00:00<?, ? examples/s]

Map:   0%|          | 0/26 [00:00<?, ? examples/s]

Tokenized dataset: 2713 training samples
GPT-2 size: 124.2M parameters


In [27]:
# Transformer Math:
# Basic math related to computation and memory usage for transformers.

# 1. Information About Local Accelerator (GPU/CPU)
# This includes all the available devices and, for each one, its:
# Name, Compute Capability, Multiprocessors, CUDA Cores, Concurrent threads, GPU clock, Memory clock, Total Memory and Free Memory.
from cuda_utils import  main, gpu_flops
import json
cuda_info = main()
print(json.dumps(cuda_info,  indent=4))

# 2. Compute Requirements
# The basic equation giving the cost to train a transformer model is:
#                                    C = tao*T = 6*P*D
# where:
# C is the compute required to train the transformer model, in total floating point operations (FLOPs)
# C = C_forward + C_backward
# C_forward  =  2*P*D
# C_backward =  4*P*D
# tao is the aggregate throughput of your hardware setup (tao = (No. of GPUs) * (Actual FLOPs/GPU)), in FLOP/s
# T is the time spent training the model, in seconds
# P is the number of parameters in the transformer model
# D is the dataset size, in tokens

# 3. Estimating GPU Actual FLOPs (tao)
# Estimating GPU FLOPs accurately can be challenging due to the complexity of GPU architectures and optimizations. However, here is a simple formula to estimate a GPU's FLOP/s:
# Total GPU FLOP/s = GPU clock * cores * flops_per_clock_cycle * fp_precision
# tao is the aggregate over all detected devices, so the per-device values are
# summed rather than keeping only the last one.
tao = 0.0  # TFLOPS (gpu_flops is reported in TFLOPS below)
for gpu_no in cuda_info:
    gpu = cuda_info[gpu_no]
    gpu_tflops = gpu_flops(gpu)
    print(f"GPU: {gpu['Name']}")
    print(f"GPU FLOPS: {gpu_tflops:.2f} TFLOPS")
    tao += gpu_tflops
if tao <= 0:
    raise RuntimeError("No GPU throughput available; cannot estimate training time.")

# 4. Estimating Model Parameters (P)
P = model.num_parameters()
print(f"Model Parameters: {P/1000**2:.2f}M")

# 5. Estimating Dataset Size (D)
# Total number of tokens across all training chunks.
D = sum(len(input_ids) for input_ids in tokenized_datasets["train"]["input_ids"])
print(f"Dataset Size: {D} tokens")

# 6. Estimating Training Time (T)
# Calculate training time from tao*T = 6*P*D. Note that tao needs to be in FLOP/s, not TFLOP/s.
# 1 TFLOP/s = 10**12 FLOP/s, so multiply by 1000**4 (1000**3 would be GFLOP/s).
T = 6*P*D/(tao*1000**4)
print(f"Training Time: {T/3600:.2f} hours")


# 7. Parameter vs Dataset Tradeoffs
# Although strictly speaking you can train a transformer for as many tokens as you like, the number of tokens trained can strongly affect both the computing cost and the final model performance, so striking the right balance is important.
# A compute-optimal language model has a number of parameters and a dataset size that satisfy the approximation D = 20*P.
optimal_D = 20*P
print(f"Optimal Dataset Size: {optimal_D / 1000**2:.2f}M tokens ({optimal_D / D:.2f}x current dataset size)")





Found 1 device(s).
Device: 0
  Name: NVIDIA GeForce GTX 1650
  Compute Capability: 7.5
  Multiprocessors: 14
  CUDA Cores: 896
  CUDA Architecture: Turing
  FP64 ops per cycle: 2
  FP32 ops per cycle: 64
  FP16 ops per cycle: 128
  INT8 ops per cycle: 256
  Concurrent threads: 14336
  GPU clock: 1515 MHz
  Memory clock: 6001 MHz
  Total Memory: 4095 MiB
  Free Memory: 2733 MiB
{
    "device_0": {
        "Name": "NVIDIA GeForce GTX 1650",
        "Compute Capability": "7.5",
        "Multiprocessors": 14,
        "CUDA Cores": 896,
        "CUDA Architecture": "Turing",
        "Ops per cycle": {
            "FP64": 2,
            "FP32": 64,
            "FP16": 128,
            "INT8": 256
        },
        "Concurrent threads": 14336,
        "GPU clock": 1515.0,
        "Memory clock": 6001.0,
        "Total Memory (MiB)": 4095.6875,
        "Free Memory (MiB)": 2733.7000007629395
    }
}
GPU: NVIDIA GeForce GTX 1650
GPU FLOPS: 2.71 TFLOPS
Model Parameters: 124.24M
Dataset Size: 34

In [14]:
# Length of the third training chunk: select the row first, then its token ids.
len(tokenized_datasets['train'][2]['input_ids'])

128

In [15]:
# Total training tokens: number of chunks times 128 tokens per chunk.
tokenized_datasets['train'].num_rows * 128

347264

In [19]:
# Show the first training example, if the split has one.
first_example = next(iter(tokenized_datasets['train']), None)
if first_example is not None:
    print(first_example)

{'input_ids': [973, 6661, 14, 8528, 14, 1353, 978, 549, 63, 875, 63, 961, 173, 973, 6661, 14, 5588, 978, 3744, 173, 973, 6661, 14, 1824, 978, 7252, 13076, 173, 173, 973, 2956, 63, 15766, 978, 1519, 173, 973, 2956, 63, 15766, 14, 1824, 978, 2002, 3002, 4391, 173, 20245, 63, 17370, 63, 5362, 3686, 63, 3220, 233, 3744, 439, 3997, 26, 22450, 13, 375, 528, 173, 20245, 63, 17370, 63, 5362, 3686, 63, 3220, 233, 3744, 8, 232, 333, 3997, 26, 22450, 13, 5500, 485, 922, 10426, 63, 293, 7095, 4063, 63, 22450, 7568, 173, 9, 4391, 173, 692, 16544, 7755, 6252, 83, 5791, 22265, 8, 40674, 274, 312, 509, 562, 6134, 8, 248, 274, 222, 272, 14, 1318, 233, 2002, 3002, 323, 312, 509, 1737, 63, 4092, 63]}


In [20]:
from tqdm.auto import tqdm
# Sanity check: print the length of every training chunk that is not exactly
# 128 tokens long. No output means every chunk has the expected length.
train_split = tokenized_datasets['train']
for example in tqdm(train_split, total=len(train_split)):
    n_tokens = len(example['input_ids'])
    if n_tokens != 128:
        print(n_tokens)

  0%|          | 0/2713 [00:00<?, ?it/s]