In [2]:
from typing import Any

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
# Token budget shared by the generate_response* helpers in this notebook:
# they pass it as `max_length` / `max_new_tokens`, and one variant also uses it
# as the truncation length when applying the chat template.
MAX_LEN = 1024

def get_tokenizer(model_name: str, ACCESS_TOKEN: str | None = None) -> Any:
    ''' Load the tokenizer for `model_name` and make sure it has a pad token.

        Args:
            model_name:   Hugging Face Hub id (or local path) of the model.
            ACCESS_TOKEN: optional Hub access token; forwarded as `token`
                          only when it is truthy.

        Returns:
            The object returned by `AutoTokenizer.from_pretrained`, with
            `pad_token` guaranteed to be set.
    '''
    # NOTE: trust_remote_code=True lets the Hub repository run its own Python
    # code while loading -- only use it with repositories you trust.
    load_kwargs: dict[str, Any] = {"trust_remote_code": True}
    if ACCESS_TOKEN:
        load_kwargs["token"] = ACCESS_TOKEN
    tokenizer = AutoTokenizer.from_pretrained(model_name, **load_kwargs)

    # Only touch the pad token when the tokenizer really lacks one. The previous
    # code always installed "<pad>", which made the EOS fallback unreachable and
    # could add a vocabulary id the model's embedding matrix does not cover.
    if tokenizer.pad_token is None:
        if tokenizer.eos_token is not None:
            # Reusing EOS keeps the vocabulary size unchanged.
            tokenizer.pad_token = tokenizer.eos_token
        else:
            # Last resort: a brand-new token. The caller must then resize the
            # model embeddings (model.resize_token_embeddings(len(tokenizer))).
            tokenizer.add_special_tokens({"pad_token": "<pad>"})
    return tokenizer


def get_model(model_name: str, ACCESS_TOKEN: str | None = None) -> Any:
    ''' Load a causal LM onto CUDA and switch it to evaluation mode.

        The checkpoint is loaded with device_map='cuda', trust_remote_code=True
        and torch_dtype='auto'; ACCESS_TOKEN is forwarded as `token` only when
        it is truthy.

        Notes on dtype choices:
              * float16  - more precision, but a narrow dynamic range
              * bfloat16 - less precision, but a dynamic range close to float32
                           (useful for deep learning)
              * float32  - has no "bfloat32" counterpart; it is already sufficient
              * model.eval() puts the model in evaluation mode (dropout off,
                batch-norm layers use running statistics), giving more
                deterministic behavior

              TODO: compare speed vs. accuracy for bfloat16 vs. float32
                    (two separate runs, e.g. torch_dtype=torch.bfloat16 / torch.float32)
    '''
    load_kwargs = dict(
        device_map='cuda',
        trust_remote_code=True,
        torch_dtype='auto',
    )
    # The token is only passed along when one was actually supplied.
    if ACCESS_TOKEN:
        load_kwargs['token'] = ACCESS_TOKEN

    model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)
    model.eval()
    return model



def get_model_old_DISREGARD(model_name: str, ACCESS_TOKEN: str) -> Any:
    ''' Legacy loader that picks `from_pretrained` options per model name.

        Args:
            model_name:   one of the model ids listed in the branches below.
            ACCESS_TOKEN: Hub access token, forwarded as `token` for the
                          Llama and CodeGemma checkpoints.

        Returns:
            The loaded model, placed on CUDA and switched to eval mode.

        Raises:
            ValueError: if `model_name` is not one of the known model ids.

        Notes on dtype choices:
              * float16  - more precision, but a narrow dynamic range
              * bfloat16 - less precision, but a dynamic range close to float32
              * float32  - has no "bfloat32" counterpart; it is already sufficient
              * model.eval() puts the model in evaluation mode (dropout off,
                batch-norm layers use running statistics)

              TODO: compare speed vs. accuracy for bfloat16 vs. float32 (two separate runs)
    '''
    # The first branch used to test the not-yet-assigned local `model`
    # (UnboundLocalError); it has to compare `model_name` like the others.
    if model_name in [ 'meta-llama/Meta-Llama-3.1-8B-Instruct', ]:
        load_kwargs = dict(
            device_map='cuda',
            token=ACCESS_TOKEN,
        )

    elif model_name in [ 'google/codegemma-7b-it', ]:  # selected as the one general way
        load_kwargs = dict(
            device_map='cuda',
            trust_remote_code=True,
            torch_dtype=torch.bfloat16,    # test vs. float32 / 'auto'
            token=ACCESS_TOKEN,
        )

    # originally, only OpenCodeInterpreter used model.eval()
    elif model_name in [ 'deepseek-ai/deepseek-coder-6.7b-instruct', 'm-a-p/OpenCodeInterpreter-DS-6.7B', ]:
        # originally this was from_pretrained(...).cuda() - less efficient
        load_kwargs = dict(
            device_map='cuda',
            torch_dtype=torch.bfloat16,    # try without it or with float32?
            trust_remote_code=True,
        )

    elif model_name in [ 'Artigenz/Artigenz-Coder-DS-6.7B',
                         'Qwen/CodeQwen1.5-7B-Chat',
                         'NTQAI/Nxcode-CQ-7B-orpo',
                       ]:
        load_kwargs = dict(
            device_map='cuda',
            torch_dtype='auto',            # pick one consistent type
        )

    else:
        # Fail loudly instead of crashing later on an unassigned `model`.
        raise ValueError(f'Unsupported model name: {model_name!r}')

    model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)
    model.eval()
    return model
        
# NOTE(review): this runs at cell level and reads a global `model`, which this
# cell never assigns -- presumably it is meant to run after
# `model = get_model(...)`; confirm, otherwise it raises NameError.
print(f'Model device: {model.device}')

In [None]:
model_name = "deepseek-ai/deepseek-coder-6.7b-instruct"

def generate_response5(prompt):
    ''' Answer one user prompt with greedy decoding (up to 512 new tokens).

        Uses the notebook-level `tokenizer` and `model`. The prompt is wrapped
        in a single-turn chat template, and only the newly generated tokens
        (everything after the prompt) are decoded and returned as a string.
    '''
    messages = [
      { 'role': 'user', 'content': prompt }
    ]
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)
    outputs = model.generate(
        inputs,
        max_new_tokens=512,
        # Greedy decoding: top_k / top_p are sampling-only settings, so passing
        # them together with do_sample=False had no effect besides warnings.
        do_sample=False,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
    )
    # Drop the echoed prompt tokens; keep only the completion.
    prompt_len = len(inputs[0])
    res = tokenizer.decode( outputs[0][prompt_len:], skip_special_tokens=True, )
    return res


In [None]:
model_name = 'Artigenz/Artigenz-Coder-DS-6.7B'

def generate_response(prompt):
    ''' Answer one user prompt with greedy decoding (up to 512 new tokens).

        Uses the notebook-level `tokenizer` and `model`. The prompt is sent as
        a single user turn through the chat template; the returned string
        contains only the tokens generated after the prompt.
    '''
    messages = [
      {"role": "user", "content": prompt}
    ]
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)
    outputs = model.generate(
        inputs,
        max_new_tokens=512,
        # do_sample=False means greedy decoding; the sampling-only top_k / top_p
        # arguments were ignored (with warnings) and have been removed.
        do_sample=False,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
    )
    # Slice off the prompt so only the completion is decoded.
    prompt_len = len(inputs[0])
    res = tokenizer.decode( outputs[0][prompt_len:], skip_special_tokens=True, )
    return res

In [None]:
model_name = 'Qwen/CodeQwen1.5-7B-Chat'

def generate_response4(prompt):
    ''' Answer one user prompt using a system + user chat (up to 512 new tokens).

        Uses the notebook-level `tokenizer` and `model`. The chat template is
        rendered to text first, then tokenized as a batch of one; the prompt
        tokens are stripped from the output before decoding.
    '''
    messages = [
      {"role": "system", "content": "You are a helpful assistant."},
      {"role": "user", "content": prompt}
    ]
    text = tokenizer.apply_chat_template(
      messages,
      tokenize=False,
      add_generation_prompt=True,
    )
    # `device` was never defined in this notebook; send the inputs to wherever
    # the model actually lives instead.
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
    generated_ids = model.generate(
      model_inputs.input_ids,
      max_new_tokens=512,
    )
    # Keep only the tokens produced after each prompt.
    generated_ids = [
      output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    res = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return res

In [None]:
# chat template
model_name = 'google/codegemma-7b-it'

def generate_response3(prompt):
    ''' Chat-template generation in two steps: render to text, then encode.

        Uses the notebook-level `tokenizer` and `model`; returns only the text
        generated after the prompt (at most 512 new tokens).
    '''
    rendered = tokenizer.apply_chat_template(
        [{ "role": "user", "content": prompt }],
        tokenize=False,
        add_generation_prompt=True,
    )
    # The rendered template is encoded without adding special tokens again.
    prompt_ids = tokenizer.encode(rendered, add_special_tokens=False, return_tensors="pt")
    generated = model.generate(input_ids=prompt_ids.to(model.device), max_new_tokens=512)
    n_prompt = len(prompt_ids[0])
    return tokenizer.decode(generated[0][n_prompt:], skip_special_tokens=True)

In [None]:
model_name = 'm-a-p/OpenCodeInterpreter-DS-6.7B'

def generate_response2(prompt):
    ''' Greedy single-turn chat completion (up to 1024 new tokens).

        Uses the notebook-level `tokenizer` and `model`. EOS doubles as the pad
        token id during generation. Earlier note kept from testing: with
        Phixtral this variant printed
        "A: The capital of California is Sacramento."
    '''
    chat = [{'role': 'user', 'content': prompt }]
    prompt_ids = tokenizer.apply_chat_template(chat, return_tensors="pt")
    prompt_ids = prompt_ids.to(model.device)

    eos_id = tokenizer.eos_token_id
    generated = model.generate(
        prompt_ids,
        max_new_tokens=1024,
        do_sample=False,
        pad_token_id=eos_id,
        eos_token_id=eos_id,
    )
    # Decode only what comes after the prompt tokens.
    completion = generated[0][len(prompt_ids[0]):]
    return tokenizer.decode(completion, skip_special_tokens=True)

In [None]:
model_name = 'NTQAI/Nxcode-CQ-7B-orpo'

def generate_response(prompt):
    ''' Answer one user prompt with greedy decoding (up to 512 new tokens).

        Uses the notebook-level `tokenizer` and `model`. The prompt goes
        through the chat template as a single user turn; the result is the
        decoded completion without the prompt.
    '''
    messages = [
      {"role": "user", "content": prompt}
    ]
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)
    outputs = model.generate(
        inputs,
        max_new_tokens=512,
        # Greedy decoding; top_k / top_p only apply when sampling, so they were
        # dropped rather than left as ignored (and warned-about) arguments.
        do_sample=False,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
    )
    # Skip the prompt tokens at the start of the generated sequence.
    prompt_len = len(inputs[0])
    res = tokenizer.decode( outputs[0][prompt_len:], skip_special_tokens=True )
    return res

In [None]:
model_name = 'meta-llama/Meta-Llama-3.1-8B-Instruct'

# The same code is used for Artigenz, Deepseek Coder and Nxcode-CQ-7B-orpo; it was
# settled on for Llama 3.1 after trial-and-error experiments with the other similar functions.
def generate_response(prompt):
    ''' Sampled single-turn chat completion (temperature 0.7, up to 512 new tokens).

        Uses the notebook-level `tokenizer` and `model`; returns the decoded
        completion without the prompt.
    '''
    chat = [{"role": "user", "content": prompt}]
    prompt_ids = tokenizer.apply_chat_template(
        chat,
        add_generation_prompt=True,
        return_tensors="pt",
    )
    prompt_ids = prompt_ids.to(model.device)

    sampling = dict(
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=1.0,
        num_return_sequences=1,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    generated = model.generate(prompt_ids, **sampling)
    # Everything past the prompt length is newly generated text.
    completion = generated[0][len(prompt_ids[0]):]
    return tokenizer.decode(completion, skip_special_tokens=True)

In [None]:
model_name = "mlabonne/phixtral-2x2_8"

def generate_response5(prompt):
    ''' Plain (non-chat) completion: tokenize the raw prompt and generate.

        Uses the notebook-level `tokenizer`, `model` and `MAX_LEN`. The
        returned text is the full decoded sequence, prompt included, capped at
        MAX_LEN tokens in total.
    '''
    # Inputs are moved to the model's own device; the old per-call
    # torch.set_default_device("cuda") changed global state on every call.
    inputs = tokenizer(
        prompt,
        padding=True,        # Pad sequences to the same length
        truncation=True,
        return_tensors="pt",
        return_attention_mask=False
    ).to(model.device)

    outputs = model.generate(**inputs, max_length=MAX_LEN)
    text = tokenizer.batch_decode(outputs)[0]

    return text

In [None]:
model_name = "mlabonne/phixtral-4x2_8"
# Kept as in the original cell: sets the process-wide default torch device.
torch.set_default_device("cuda")


def generate_response5(prompt):
    ''' Plain (non-chat) completion: tokenize the raw prompt and generate.

        Uses the notebook-level `tokenizer`, `model` and `MAX_LEN`. The
        returned text is the full decoded sequence, prompt included, capped at
        MAX_LEN tokens in total.
    '''
    # Inputs follow the model's device explicitly, so the function no longer
    # has to reset the global default device on every call.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        return_attention_mask=False
    ).to(model.device)

    outputs = model.generate(**inputs, max_length=MAX_LEN)
    text = tokenizer.batch_decode(outputs)[0]

    return text

In [None]:
# chat template - should be similar to Llama!
model_name = 'NousResearch/Nous-Hermes-2-SOLAR-10.7B'

tokenizer  = AutoTokenizer.from_pretrained(model_name)

model      = AutoModelForCausalLM.from_pretrained(model_name)

# NOTE(review): `prompt` is not defined in this cell -- presumably it is set
# earlier in the notebook; confirm before running.
messages = [
    {"role": "system", "content": "You are Hermes 2."},
    {"role": "user", "content": prompt}
]
# Fixes: the list is called `messages` (not `message`), and return_dict=True
# yields a mapping (input_ids, attention_mask, ...) so `**gen_input` is valid;
# a bare tensor cannot be unpacked with `**`.
gen_input = tokenizer.apply_chat_template(
    messages,
    return_tensors="pt",
    return_dict=True,
).to(model.device)
model.generate(**gen_input)

# APPENDIX

## Each generate_response() function variant was tested on all models

In [None]:
def generate_response(prompt):
    ''' Greedy single-turn chat completion with a padded / truncated prompt.

        Uses the notebook-level `tokenizer`, `model` and `MAX_LEN`; returns the
        decoded completion without the prompt.

        Results observed when trying this variant on the models:
          * Phixtral 2x2_8, Phixtral 4x2_8 and Solar output:
            The capital of California is Sacramento.
          * deepseek-coder doesn't know geography (capital of CA)
          * Nxcode, CodeQwen, Artigenz and all others - correct output
    '''
    chat = [{ 'role': 'user', 'content': prompt }]
    template_kwargs = dict(
        padding=True,
        truncation=True,
        max_length=MAX_LEN,
        add_generation_prompt=True,
        return_tensors="pt",
    )
    prompt_ids = tokenizer.apply_chat_template(chat, **template_kwargs).to(model.device)

    eos_id = tokenizer.eos_token_id
    generated = model.generate(
        prompt_ids,
        max_new_tokens=MAX_LEN,
        do_sample=False,
        temperature=1.0,
        top_p=1.0,
        num_return_sequences=1,
        pad_token_id=eos_id,
        eos_token_id=eos_id,
    )
    # Only the tokens after the prompt belong to the answer.
    completion = generated[0][len(prompt_ids[0]):]
    return tokenizer.decode(completion, skip_special_tokens=True)

def generate_response2(prompt):
    ''' Greedy chat completion without an explicit generation prompt.

        Uses the notebook-level `tokenizer`, `model` and `MAX_LEN`; returns the
        decoded completion without the prompt.

        Results observed when trying this variant on the models:
          * Phixtral 2x2_8 & 4x2_8: "A: The capital of California is Sacramento."
          * Solar & deepseek-coder: empty string
          * Llama 3.1: "assistant\n\nThe capital of California is Sacramento."
          * Gemma bfloat16 & OpenCodeInterpreter: correct,
            "The capital of California is Sacramento."
          * Gemma float32: incorrect output, **Sacramento**
          * Artigenz: correct output
          * Nxcode, CodeQwen: '\nThe capital of California is Sacramento'
    '''
    chat = [{'role': 'user', 'content': prompt }]
    prompt_ids = tokenizer.apply_chat_template(chat, return_tensors="pt")
    prompt_ids = prompt_ids.to(model.device)

    eos_id = tokenizer.eos_token_id
    generated = model.generate(
        prompt_ids,
        max_new_tokens=MAX_LEN,
        do_sample=False,
        pad_token_id=eos_id,
        eos_token_id=eos_id,
    )
    completion = generated[0][len(prompt_ids[0]):]
    return tokenizer.decode(completion, skip_special_tokens=True)

def generate_response3(prompt):
    ''' Chat-template generation in two steps: render to text, then encode.

        Uses the notebook-level `tokenizer` and `model`; returns the decoded
        completion without the prompt (at most 512 new tokens).

        Results observed when trying this variant on the models:
          * Phixtral 2x2_8 & 4x2_8: long non-chat output (much slower than chat)
          * Solar & OpenCodeInterpreter: correct answer followed by a run of
            '\n' characters - still wrong
          * Llama & Gemma: correct, "The capital of California is Sacramento."
          * deepseek-coder: can't answer geographic questions (but code works)
          * Nxcode, CodeQwen, Artigenz: correct output
    '''
    rendered = tokenizer.apply_chat_template(
        [{ "role": "user", "content": prompt }],
        tokenize=False,
        add_generation_prompt=True,
    )
    # The rendered template is encoded without adding special tokens again.
    prompt_ids = tokenizer.encode(rendered, add_special_tokens=False, return_tensors="pt")
    generated = model.generate(input_ids=prompt_ids.to(model.device), max_new_tokens=512)
    n_prompt = len(prompt_ids[0])
    return tokenizer.decode(generated[0][n_prompt:], skip_special_tokens=True)


def generate_response4(prompt):
    ''' System + user chat completion, tokenized as a batch of one.

        Uses the notebook-level `tokenizer`, `model` and `MAX_LEN`; returns the
        decoded completion of the first (only) sequence without its prompt.

        Results observed when trying this variant on the models:
          * Phixtral 2x2_8 & 4x2_8: long non-chat output (much slower than chat)
          * Solar: correct answer followed by a run of '\n' characters - still wrong
          * Llama & OpenCodeInterpreter: correct,
            "The capital of California is Sacramento."
          * deepseek-coder: can't answer geographic questions (but code works)
          * Artigenz, Gemma: error, system role not supported
          * Nxcode, CodeQwen: correct output
    '''
    chat = [
      {"role": "system", "content": "You are a helpful assistant."},
      {"role": "user", "content": prompt},
    ]
    rendered = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)

    batch = tokenizer([rendered], return_tensors="pt").to(model.device)
    prompt_batch = batch.input_ids
    full_sequences = model.generate(prompt_batch, max_new_tokens=MAX_LEN)

    # Trim each generated sequence down to the part after its own prompt.
    completions = [
      sequence[len(prompt_ids):]
      for prompt_ids, sequence in zip(prompt_batch, full_sequences)
    ]
    decoded = tokenizer.batch_decode(completions, skip_special_tokens=True)
    return decoded[0]


def generate_response5(prompt):
    ''' Plain (non-chat) completion; this variant came with the Phixtral code.

        Uses the notebook-level `tokenizer`, `model` and `MAX_LEN`; returns the
        full decoded first sequence (prompt included).

        Results observed when trying this variant on the models:
          * Phixtral 2x2_8 & 4x2_8 & Solar: long non-chat output
            (much slower than chat)
          * Llama, Gemma, deepseek-coder, OpenCodeInterpreter: lengthy non-chat output
          * Artigenz: less nonsense, but still nonsense
          * Nxcode, CodeQwen: error, token_type_ids not used by the model
    '''
    tokenizer_kwargs = dict(
        padding=True,        # pad sequences to the same length
        truncation=True,
        return_tensors="pt",
        return_attention_mask=False,
    )
    # Inputs go to the model's device instead of relying on a global
    # torch default device.
    encoded = tokenizer(prompt, **tokenizer_kwargs).to(model.device)

    generated = model.generate(**encoded, max_length=MAX_LEN)
    return tokenizer.batch_decode(generated)[0]