In [None]:
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
device = "cpu"

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer_og = AutoTokenizer.from_pretrained("Salesforce/codegen2-1B")
model_og = AutoModelForCausalLM.from_pretrained("Salesforce/codegen2-1B", trust_remote_code=True, revision="main")
model_og = model_og.to(device)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer_ft = AutoTokenizer.from_pretrained("ammarnasr/CodeGen2_1B_merged")
model_ft = AutoModelForCausalLM.from_pretrained("ammarnasr/CodeGen2_1B_merged", trust_remote_code=True, revision="main")
model_ft = model_ft.to(device)

In [None]:
security_problems = [
    "def check_password_strength(password: str) -> bool:\n    '''Check the strength of a password based on certain criteria (e.g., length, complexity).'''\n",
    "def sanitize_input(input_str: str) -> str:\n    '''Implement a function to sanitize user input to prevent XSS (Cross-Site Scripting) attacks.'''\n",
    "def prevent_sql_injection(query: str) -> str:\n    '''Implement a function to sanitize SQL queries and prevent SQL injection attacks.'''\n"
]

for i in range(3):
  text = security_problems[i]
  input_ids_og = tokenizer_og(text, return_tensors="pt").input_ids.to(device)
  input_ids_ft = tokenizer_ft(text, return_tensors="pt").input_ids.to(device)

  generated_ids_og = model_og.generate(input_ids_og, max_length=128)
  generated_ids_ft = model_ft.generate(input_ids_ft, max_length=128)

  out_og = tokenizer_og.decode(generated_ids_og[0], skip_special_tokens=True)
  out_ft = tokenizer_ft.decode(generated_ids_ft[0], skip_special_tokens=True)


  print('-'*80)
  print('OG:')
  print(out_og)
  print()
  print('FT:')
  print(out_ft)
  print('-'*80)

In [None]:
config = {
    "max_new_tokens": 60,
}

In [None]:
from typing import Any, Dict, List
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer

dtype = torch.bfloat16 if torch.cuda.get_device_capability()[0] ==8 else torch.float16

class EndpointHandler:
    def __init__(self, path=""):
        self.tokenizer = AutoTokenizer.from_pretrained(path)
        self.model = AutoModelForCausalLM.from_pretrained(path, trust_remote_code=True, revision="main")
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = self.model.to(self.device)


    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        prompt = data["inputs"]
        if "config" in data:
          config = data.pop("config", None)
        else:
          config = {'max_new_tokens':100}
        input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids.to(self.device)
        generated_ids = self.model.generate(input_ids, **config)
        generated_text = self.tokenizer.decode(generated_ids[0], skip_special_tokens=True)
        return [{"generated_text": generated_text}]
        
            
        


In [4]:
import requests

API_URL = "https://api-inference.huggingface.co/models/ammarnasr/CodeGen2_1B_merged"
headers = {"Authorization": "Bearer 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"}

def query(payload):
	response = requests.post(API_URL, headers=headers, json=payload)
	return response.json()
	
prompt = """
def prevent_sql_injection(query: str) -> str:
    '''Implement a function to sanitize SQL queries and prevent SQL injection attacks.'''
"""

output = query({
	"inputs": prompt
})

print(output[0]['generated_text'])


def prevent_sql_injection(query: str) -> str:
    '''Implement a function to sanitize SQL queries and prevent SQL injection attacks.'''
(),
�/orequesthasK


In [13]:
import requests

API_URL = "https://q02qcgkwagbft8mt.us-east-1.aws.endpoints.huggingface.cloud"
headers = {
	"Authorization": "",
	"Content-Type": "application/json"
}

def query(payload):
	response = requests.post(API_URL, headers=headers, json=payload)
	return response.json()
	

prompt = """
def prevent_sql_injection(query: str) -> str:
    '''Prevent SQL injection attacks.'''
"""
output = query({
	"inputs":prompt,
})

In [14]:
print(output)


def prevent_sql_injection(query: str) -> str:
    '''Prevent SQL injection attacks.'''
    


In [6]:
print(output[0]['generated_text'])

TypeError: string indices must be integers

In [None]:
from  datasets  import  load_dataset, DatasetDict
from transformers import AutoTokenizer
from transformers import GPT2LMHeadModel, AutoConfig, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments





# Downloading and loading a dataset from the hub.
raw_datasets = load_dataset("ammarnasr/Customizable-Code-Assistant-Data")

# Filtering the dataset to only Python examples.
raw_datasets = raw_datasets.filter(lambda example: example['language'] == 'Python')

# Splitting the dataset into train, test, and validation sets.
train_testvalid = raw_datasets['train'].train_test_split(0.1)
test_valid = train_testvalid['test'].train_test_split(0.5)
raw_datasets = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})


# Tokenizing the dataset
tokenizer = AutoTokenizer.from_pretrained("huggingface-course/code-search-net-tokenizer")
context_length = 128
def tokenize(element):
    outputs = tokenizer(
        element["text"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    input_batch = []
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        if length == context_length:
            input_batch.append(input_ids)
    return {"input_ids": input_batch}
tokenized_datasets = raw_datasets.map(tokenize, batched=True, remove_columns=raw_datasets["train"].column_names)
print(f"Tokenized dataset: {tokenized_datasets['train'].num_rows} training samples")
      
# Preparing the model
config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)
model = GPT2LMHeadModel(config)
model_size = sum(t.numel() for t in model.parameters())
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")

# Preparing the data collator
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Preparing the training arguments
args = TrainingArguments(
    output_dir="gpt2-from-scratch-customizable-code-assistant",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    evaluation_strategy="steps",
    eval_steps=5_000,
    logging_steps=5_000,
    num_train_epochs=1,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=5_000,
)
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
)


In [None]:
# Transformer Math : 
# basic math related to computation and memory usage for transformers

# 1. Information About Local Accelerator (GPU/CPU)
# This includes all the available devices and each one's :
# Name, Compute Capability, Multiprocessors, CUDA Cores, Concurrent threads, GPU clock, Memory clock, Total Memory and Free Memory.
from cuda_utils import  main, gpu_flops
import json
cuda_info = main()
print(json.dumps(cuda_info,  indent=4))

# 2. Compute Requirements
# The basic equation giving the cost to train a transformer model is given by:
#                                    C = tao*T = 6*P*D
# where:
# C is the compute required to train the transformer model, in total floating point operations (FLOPs)
# C = C_forward + C_backward
# C_forward  =  2*P*D
# C_backward =  4*P*D
# tao is the aggregate throughput of your hardware setup (tao = (No. of GPUs) * (Actual FLOPs/GPU), in FLOPs
# T is the time spent training the model, in seconds
# P is the number of parameters in the transformer model
# D is the dataset size, in tokens

#3. Estimating GPU Actual FLOPs (tao)
# Estimating GPU FLOPs accurately can be challenging due to the complexity of GPU architectures and optimizations. However, here is a simple formula to estimate a GPU FLOP/s:
# Total GPU FLOPS/s = GPU clock * cores * flops_per_clock_cycle * fp_precision
for gpu_no in cuda_info:
    gpu = cuda_info[gpu_no]
    print(f"GPU: {gpu['Name']}")
    print(f"GPU FLOPS: {gpu_flops(gpu):.2f} TFLOPS")
    tao = gpu_flops(gpu) # TFLOPS

#4. Estimating Model Parameters (P)
P = model.num_parameters()
print(f"Model Parameters: {P/1000**2:.2f}M")

#5. Estimating Dataset Size (D)
D = 0
for input_ids in tokenized_datasets["train"]["input_ids"]:
    D += len(input_ids)
print(f"Dataset Size: {D} tokens")

#6. Estimating Training Time (T)
# Calculate training time based on the formula tao*T = 6*P*D. Note that ao needs to be in FLOPS/s not TFLOPS/s, to convert TFLOPS/s to FLOPS/s multiply by 1000**3.
T = 6*P*D/(tao*1000**3)
print(f"Training Time: {T/3600:.2f} hours")


#7. Parameter vs Dataset Tradeoffs
# Although strictly speaking you can train a transformer for as many tokens as you like, the number of tokens trained can highly impact both the computing costs and the final model performance making striking the right balance important.
# compute optimal language model has a number of parameters and a dataset size that satisfies the approximation D = 20*P.
optimal_D = 20*P
print(f"Optimal Dataset Size: {optimal_D / 1000**2:.2f}M tokens ({optimal_D / D:.2f}x current dataset size)")





In [None]:
from cuda_utils import  main, list_common_gpus, custome_gpu_info
import json

available_gpus = list_common_gpus()
print(f"Available GPUs: {available_gpus}")


t4_cuda_info = custome_gpu_info('NVIDIA T4')
cuda_info = main(verbose=False)
print(json.dumps(t4_cuda_info,  indent=4))
print('------------------')
print(json.dumps(cuda_info,  indent=4))



In [None]:
import pandas as pd
df = pd.read_csv('llms_info.csv')



In [None]:
df

In [None]:
import csv

datasets = [
    ["Dataset", "Language", "Size", "Description"],
    ["CodeXGLUE", "Python", "10k samples", "Benchmark dataset for code intelligence"],
    ["HumanEval", "Python", "77k samples", "Diverse code samples for training foundations models"], 
    ["CodeParrot", "Python", "35 million samples", "Large dataset of Python functions for few-shot learning"],
    ["CodeForce", "C++", "435k problems", "Competitive programming challenges"],
    ["GitHub", "Various", "Billions of lines", "Open source code from public GitHub repositories"],
]

with open('code_datasets.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(datasets)

In [None]:
range_start_X = 10**6
range_end_X = 10**10
range_step_X = 5*10**6

x_vals = list(range(range_start_X, range_end_X, range_step_X))


In [None]:
len(x_vals)