In [1]:
# mistralai/Mistral-7B-Instruct-v0.2

In [2]:
import torch

torch.cuda.is_available()

True

In [3]:
torch.cuda.get_device_name(0)

'NVIDIA RTX A3000 12GB Laptop GPU'

In [4]:
from dotenv import load_dotenv, find_dotenv
import os

_ = load_dotenv()

print(os.environ.get("DISABLE_QIGEN"))

1


In [5]:
# setting device on GPU if available, else CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()


#Additional Info when using cuda
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')


Using device: cuda

NVIDIA RTX A3000 12GB Laptop GPU
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [6]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:41:10_Pacific_Daylight_Time_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


In [7]:
import torch

print(torch.cuda.get_device_name(0))
print(torch.cuda.is_available())

NVIDIA RTX A3000 12GB Laptop GPU
True


In [8]:
from transformers import AutoModelForCausalLM, AutoTokenizer

#model_id = "yunconglong/Truthful_DPO_TomGrc_FusionNet_7Bx2_MoE_13B" # 129s 50 token
#model_id = "mistralai/Mistral-7B-Instruct-v0.2" # 25s 50 token
#model_id = "TheBloke/Mistral-7B-Instruct-v0.2-GPTQ" #2.5s 50 token
model_id = "TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ" #33s 50 token
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)

In [9]:
tokenizer.model_max_length

1000000000000000019884624838656

In [10]:
"yunconglong/Truthful_DPO_TomGrc_FusionNet_7Bx2_MoE_13B" # 129s 50 token
"mistralai/Mistral-7B-Instruct-v0.2" # 25s 50 token
"TheBloke/Mistral-7B-Instruct-v0.2-GPTQ" #2.5s 50 token
"TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ" #33s 50 token

'TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ'

In [11]:
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="cuda:0",
    trust_remote_code=False,
    revision="main",
    #torch_dtype=torch.float16,
)

In [12]:
from transformers import TextStreamer

streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

In [13]:
import time

start_time = time.time()

text = "Tell me about Venezuela"
prompt_template=f'''[INST] {text} [/INST]'''
inputs = tokenizer(prompt_template, return_tensors="pt").input_ids.to(0)
outputs = model.generate(inputs, max_new_tokens=50, do_sample=True, temperature=0.01)

print(f"Time: {time.time() - start_time}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Time: 35.67814636230469


In [13]:
inputs.size()[1]

12

In [14]:
print(tokenizer.decode(outputs[0][inputs.size()[1]:], skip_special_tokens=True))

Venezuela is a country located in the northern part of South America. It is the westernmost country of the South American region, and it is bordered by Colombia to the west, Brazil to the south and the Atlantic Ocean to the north. The capital city is Caracas.

Venezuela has a rich cultural heritage, with indigenous, European, African, and Asian influences. The country is known for its natural beauty, with diverse landscapes ranging from the Andes Mountains in the west to the Amazon Rainforest in the south, and the Caribbean Sea coastline in the north.

Venezuela is the world's largest holder of oil reserves, and its economy has historically been heavily dependent on the oil industry. However, the country has faced significant economic challenges in recent years, including hyperinflation, food shortages, and a decline in living standards for many of its people.

Politically, Venezuela has a presidential representative democratic republic, with the President of Venezuela serving as both 

In [ ]:
from typing import Generator
from config import config

##this function can eventually be used to serve gptq models
def get_completion_gptq(self, prompt) -> Generator[str, None, None]:
    model_id = self.GPTQ_MODEL_FILE_MAPPING[self.model.value]
    tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
    self.gptq_model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",
        trust_remote_code=False,
        revision="main",
    )

    start_time = time.time()
    prompt_template = f'''{self.priming} \n\n {prompt}'''
    inputs = tokenizer(prompt_template, return_tensors="pt").input_ids
    outputs = self.gptq_model.generate(inputs, max_new_tokens=config.prompt_size, do_sample=True,
                                       temperature=config.temperature)
    print(f"Time: {time.time() - start_time}")
    yield tokenizer.decode(outputs[0][inputs.size()[1]:], skip_special_tokens=True)