In [2]:
from typing import Any

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

In [6]:
def function1(arg):
    """Print the "Function 1" banner followed by ``arg``."""
    banner = "Function 1 called with:"
    print(banner, arg)

def function2(arg):
    """Print the "Function 2" banner followed by ``arg``."""
    banner = "Function 2 called with:"
    print(banner, arg)

def function3(arg):
    """Print the "Function 3" banner followed by ``arg``."""
    banner = "Function 3 called with:"
    print(banner, arg)

# Dispatch table: short string key -> the function it selects.
function_map = dict(
    zip(
        ("func1", "func2", "func3"),
        (function1, function2, function3),
    )
)

def call_function_by_reference(function_name, arg):
    """Look up ``function_name`` in ``function_map`` and call the match with ``arg``.

    When the lookup yields nothing (or a falsy value), a fixed error line is
    printed instead and no function is called. Always returns None.
    """
    handler = function_map.get(function_name)
    if not handler:
        print("Invalid function name.")
        return
    handler(arg)
# Example usage: two registered keys followed by one unknown key.
for _key, _value in (("func1", "hello"), ("func2", 123), ("invalid_func", "test")):
    call_function_by_reference(_key, _value)

Function 1 called with: hello
Function 2 called with: 123
Invalid function name.


In [None]:
def get_tokenizer(model_name: str, ACCESS_TOKEN: str) -> Any:
    ''' Load the tokenizer for `model_name`.

        `ACCESS_TOKEN` is forwarded as `token=` only for the two model names
        listed below; every other name is loaded without it. In both cases
        `trust_remote_code=True` is passed.

        If the loaded tokenizer has no pad token, its EOS token is assigned
        as the pad token before it is returned.
    '''
    names_loaded_with_token = ( 'google/codegemma-7b-it',
                                'meta-llama/Meta-Llama-3.1-8B-Instruct', )

    load_kwargs = { 'trust_remote_code': True }
    if model_name in names_loaded_with_token:
        load_kwargs['token'] = ACCESS_TOKEN

    tok = AutoTokenizer.from_pretrained( model_name, **load_kwargs )

    if tok.pad_token is None:
        tok.pad_token = tok.eos_token
    return tok


def get_model(model_name: str, ACCESS_TOKEN: str) -> Any:
    ''' Load the causal LM for `model_name` and return it in evaluation mode.

        Each supported name is loaded with its own `from_pretrained` arguments:
          * 'meta-llama/Meta-Llama-3.1-8B-Instruct'        - token only
          * 'google/codegemma-7b-it'                       - cuda, bfloat16, token
          * deepseek-coder-6.7b-instruct / OpenCodeInterpreter-DS-6.7B
                                                           - cuda, bfloat16, trust_remote_code
          * Artigenz-Coder / CodeQwen1.5 / Nxcode-CQ       - cuda, torch_dtype='auto'

        Args:
            model_name:   Hugging Face model id; must be one of the names above.
            ACCESS_TOKEN: passed as `token=` for the Llama and CodeGemma branches only.

        Returns:
            The loaded model, after `model.eval()` has been called on it.

        Raises:
            ValueError: if `model_name` is not one of the supported names.

        Note: * float16 - more precise but has a narrow range,
              * bfloat16 - less precise, but dynamic range closer to float32 -> useful for DL
              * float32 doesn't have a bfloat32 equivalent as it is already sufficient.
              * model.eval() sets model to evaluation mode, which disables dropout,
                sets batch normalization layers to use their running statistics,
                and generally ensures more deterministic behavior

              TODO: test speed vs. accuracy for bfloat16 vs. float32 (two separate runs)
    '''

    # Dispatch on the requested name (`model_name`); the local `model` is only
    # bound inside the branches, so it must not be read before one of them runs.
    if model_name in [ 'meta-llama/Meta-Llama-3.1-8B-Instruct', ]:
        model = AutoModelForCausalLM.from_pretrained( model_name,
                                                      token=ACCESS_TOKEN, )

    elif model_name in [ 'google/codegemma-7b-it', ]:
        model = AutoModelForCausalLM.from_pretrained( model_name,
                                                      device_map='cuda',
                                                      torch_dtype=torch.bfloat16,    # try w/out it?
                                                      token=ACCESS_TOKEN, )

    # originally, only OpenCodeInterpreter used model.eval()
    elif model_name in [ 'deepseek-ai/deepseek-coder-6.7b-instruct', 'm-a-p/OpenCodeInterpreter-DS-6.7B', ]:
        model = AutoModelForCausalLM.from_pretrained( model_name,
                                                      device_map='cuda',
                                                      torch_dtype=torch.bfloat16,     # try w/out it?
                                                      trust_remote_code=True, )
        # originally this was from_pretrained(...).cuda() - less efficient

    elif model_name in [ 'Artigenz/Artigenz-Coder-DS-6.7B',
                         'Qwen/CodeQwen1.5-7B-Chat',
                         'NTQAI/Nxcode-CQ-7B-orpo',
                       ]:
        model = AutoModelForCausalLM.from_pretrained( model_name,
                                                      device_map='cuda',
                                                      torch_dtype='auto', )    # pick one consistent type

    else:
        # Fail with a clear message instead of reaching model.eval() with nothing loaded.
        raise ValueError(f'Unsupported model name: {model_name!r}')

    model.eval()    # using model for inference only (see docstring above)
    return model
        
# NOTE(review): no cell visible above binds a module-level `model` (get_model only
# assigns a local of that name), so on a fresh kernel this line raises NameError.
# Presumably `model = get_model(model_name, ACCESS_TOKEN)` is meant to run first - confirm.
print(f'Model device: {model.device}')

In [None]:
model_name = "deepseek-ai/deepseek-coder-6.7b-instruct"

def generate_response(prompt):
    ''' Send `prompt` as a single user turn and return only the newly generated text.

        Relies on module-level `tokenizer` and `model`, which this cell does not define.
    '''
    chat = [ { 'role': 'user', 'content': prompt } ]

    prompt_ids = tokenizer.apply_chat_template( chat,
                                                add_generation_prompt=True,
                                                return_tensors="pt", )
    prompt_ids = prompt_ids.to(model.device)

    # top_k / top_p are passed through unchanged even though do_sample=False.
    generation_kwargs = dict( max_new_tokens=512,
                              do_sample=False,
                              top_k=50,
                              top_p=0.95,
                              num_return_sequences=1,
                              eos_token_id=tokenizer.eos_token_id, )
    generated = model.generate(prompt_ids, **generation_kwargs)

    # Drop the echoed prompt tokens so only the completion is decoded.
    prompt_length = len(prompt_ids[0])
    completion_ids = generated[0][prompt_length:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True)


In [None]:
model_name = 'Artigenz/Artigenz-Coder-DS-6.7B'

def generate_response(prompt):
    ''' Run one user-turn chat completion and return the text produced after the prompt.

        Uses the module-level `tokenizer` and `model`; neither is defined in this cell.
    '''
    # 1) Render the single-message conversation to input ids on the model's device.
    encoded_prompt = tokenizer.apply_chat_template(
        [{"role": "user", "content": prompt}],
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)

    # 2) Generate with do_sample=False; top_k / top_p are forwarded as given.
    sequences = model.generate(
        encoded_prompt,
        max_new_tokens=512,
        do_sample=False,
        top_k=50,
        top_p=0.95,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
    )

    # 3) Decode only what follows the prompt tokens in the first sequence.
    first_sequence = sequences[0]
    new_tokens = first_sequence[len(encoded_prompt[0]):]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)

In [None]:
model_name = 'Qwen/CodeQwen1.5-7B-Chat'

def generate_response(prompt):
    ''' Ask the model to answer `prompt` and return the decoded completion.

        The conversation is a fixed system message followed by the user prompt.
        It is rendered to text with the chat template, tokenized as a batch of
        one, and only the tokens generated after the prompt are decoded.

        Uses the module-level `tokenizer` and `model`; neither is defined in this cell.

        Args:
            prompt: text placed in the user message.

        Returns:
            The first (only) decoded completion, with special tokens skipped.
    '''
    messages = [
      {"role": "system", "content": "You are a helpful assistant."},
      {"role": "user", "content": prompt}
    ]
    text = tokenizer.apply_chat_template(
      messages,
      tokenize=False,
      add_generation_prompt=True,
    )
    # Move inputs to wherever the model lives. No standalone `device` variable
    # exists in this notebook, so use model.device like the other cells do.
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
    generated_ids = model.generate(
      model_inputs.input_ids,
      max_new_tokens=512,
    )
    # Each output row starts with its prompt tokens; slice them off per row.
    completion_ids = [
      output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    return tokenizer.batch_decode(completion_ids, skip_special_tokens=True)[0]

In [None]:
# chat template
model_name = 'google/codegemma-7b-it'

def generate_response(prompt):
    ''' Chat-template generation: render, encode, generate, decode the new tokens.

        Uses the module-level `tokenizer` and `model`; neither is defined in this cell.
    '''
    # Render the one-message conversation to a plain string first ...
    rendered = tokenizer.apply_chat_template(
        [{ "role": "user", "content": prompt }],
        tokenize=False,
        add_generation_prompt=True,
    )
    # ... then encode it with add_special_tokens=False.
    encoded = tokenizer.encode(rendered, add_special_tokens=False, return_tensors="pt")

    generated = model.generate(
        input_ids=encoded.to(model.device),
        max_new_tokens=512,
    )

    # Skip the prompt-length prefix of the first sequence before decoding.
    n_prompt_tokens = len(encoded[0])
    return tokenizer.decode(generated[0][n_prompt_tokens:], skip_special_tokens=True)

In [None]:
model_name = 'm-a-p/OpenCodeInterpreter-DS-6.7B'

def generate_response(prompt):
    ''' Generate up to 1024 new tokens for a single user message and return them as text.

        Note that the chat template is applied without `add_generation_prompt`.
        Uses the module-level `tokenizer` and `model`; neither is defined in this cell.
    '''
    conversation = [{'role': 'user', 'content': prompt }]
    input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
    input_ids = input_ids.to(model.device)

    # EOS doubles as the padding id for this call.
    result = model.generate(
        input_ids,
        max_new_tokens=1024,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    # Keep only the tokens that come after the prompt.
    tail = result[0][len(input_ids[0]):]
    return tokenizer.decode(tail, skip_special_tokens=True)

In [None]:
model_name = 'NTQAI/Nxcode-CQ-7B-orpo'

def generate_response(prompt):
    ''' Return the model's reply to `prompt`, sent as one user message.

        Uses the module-level `tokenizer` and `model`; neither is defined in this cell.
    '''
    user_turn = {"role": "user", "content": prompt}
    model_input = tokenizer.apply_chat_template(
        [user_turn],
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)

    # do_sample=False, with top_k / top_p still forwarded as in the original call.
    model_output = model.generate(
        model_input,
        max_new_tokens=512,
        do_sample=False,
        top_k=50,
        top_p=0.95,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
    )

    # The first output row begins with the prompt; decode what comes after it.
    skip = len(model_input[0])
    reply_ids = model_output[0][skip:]
    return tokenizer.decode(reply_ids, skip_special_tokens=True)

In [None]:
model_name = 'meta-llama/Meta-Llama-3.1-8B-Instruct'

# Author's note: this code is also used for Artigenz, DeepSeek Coder and Nxcode-CQ-7B-orpo;
# it was settled on for Llama 3.1 through trial and error with other, similar functions.
def generate_response(prompt):
    ''' Sample a reply to `prompt` (one user message) and return the decoded new tokens.

        This variant enables sampling (do_sample=True, temperature=0.7) and passes
        the tokenizer's pad and EOS ids explicitly.
        Uses the module-level `tokenizer` and `model`; neither is defined in this cell.
    '''
    chat = [{"role": "user", "content": prompt}]
    chat_ids = tokenizer.apply_chat_template(chat, add_generation_prompt=True, return_tensors="pt")
    chat_ids = chat_ids.to(model.device)

    sampling_kwargs = dict(
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=1.0,
        num_return_sequences=1,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    sampled = model.generate(chat_ids, **sampling_kwargs)

    # Strip the prompt tokens from the first returned sequence before decoding.
    prompt_len = len(chat_ids[0])
    return tokenizer.decode(sampled[0][prompt_len:], skip_special_tokens=True)

In [None]:
model_name = "mlabonne/phixtral-2x2_8"

# Function stub (signature + docstring) given to the model as the text to continue.
instruction = '''
    def print_prime(n):
        """
        Print all primes between 1 and n
        """
'''

# Make "cuda" the default device for tensors created from here on.
torch.set_default_device("cuda")

# Model first, then tokenizer - both with trust_remote_code=True.
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", load_in_4bit=True, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Encode the stub; the attention mask is explicitly not requested.
inputs = tokenizer(instruction, return_tensors="pt", return_attention_mask=False)

# Generate with max_length=200.
outputs = model.generate(**inputs, max_length=200)

# Show the first decoded sequence (special tokens are not skipped here).
text = tokenizer.batch_decode(outputs)[0]
print(text)

In [None]:
model_name = "mlabonne/phixtral-4x2_8"
torch.set_default_device("cuda")

# Load the model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_name, 
    torch_dtype="auto", 
    load_in_4bit=True, 
    trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(
    model_name, 
    trust_remote_code=True
)

# Tokenize the input string.
# This cell mirrors the phixtral-2x2_8 cell, which feeds `instruction`; no
# module-level `prompt` exists in this notebook, so the same stub is reused here.
# NOTE(review): confirm `instruction` is the intended input for the 4x model too.
inputs = tokenizer(
    instruction, 
    return_tensors="pt", 
    return_attention_mask=False
)

# Generate text using the model
outputs = model.generate(**inputs, max_length=200)

# Decode and print the output
text = tokenizer.batch_decode(outputs)[0]
print(text)

In [None]:
# chat template - should be similar to Llama!
model_name = 'NousResearch/Nous-Hermes-2-SOLAR-10.7B'

tokenizer  = AutoTokenizer.from_pretrained(model_name)

model      = AutoModelForCausalLM.from_pretrained(model_name)

# NOTE(review): `prompt` is not assigned at module level in any cell visible above;
# it must be defined before this cell runs - confirm where it is meant to come from.
messages = [
    {"role": "system", "content": "You are Hermes 2."},
    {"role": "user", "content": prompt}
]
# Pass the `messages` list defined above (there is no `message` variable).
# return_dict=True yields a mapping, which the `**gen_input` unpacking below needs.
# add_generation_prompt=True and the move to model.device match the other cells.
gen_input = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)
model.generate(**gen_input)