In [1]:
from llama_cpp import Llama
from pathlib import Path
p = Path.cwd()
path_beginning = str(p.home())+'/PycharmProjects/BIGAI/'
path = path_beginning+"MODELS/TEXT/"
llama2 = path+'llama-2-7b-chat.Q4_K_M.gguf'
llama3 = path+'Meta-Llama-3-8B-Instruct.Q4_K_M.gguf'

In [2]:
llama2 = Llama(model_path=llama2,n_gpu_layers=-1,n_ctx=1024,echo=False)
llama3 = Llama(model_path=llama3,n_gpu_layers=-1,n_ctx=1024,echo=False)

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from /home/bigai/PycharmProjects/BIGAI/MODELS/TEXT/llama-2-7b-chat.Q4_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 

In [3]:
!nvidia-smi

Tue Apr 30 13:54:55 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.78                 Driver Version: 550.78         CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4090        Off |   00000000:01:00.0  On |                  Off |
|  0%   49C    P2             66W /  450W |    6654MiB /  24564MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA GeForce RTX 4060 Ti     Off |   00

In [4]:
def llm_make_move(model: Llama, prompt: str) -> str:
    """ Call a model with a prompt """
    res = model(prompt, stream=False, max_tokens=1024, temperature=0.8)
    return res["choices"][0]["text"]

In [6]:
from typing import List
board = [["E", "E", "E"],
         ["E", "E", "E"],
         ["E", "E", "E"]]

def board_to_string(board_data: List) -> str:
    """ Convert board to the string representation """
    return "\n".join([" ".join(x) for x in board_data])

In [7]:
sys_prompt1 = """You play a tic-tac-toe game. You make a move by placing X, 
                 your opponent plays by placing O. Empty cells are marked 
                 with E. You can place X only to the empty cell."""
sys_prompt2 = """You play a tic-tac-toe game. You make a move by placing O, 
                 your opponent plays by placing X. Empty cells are marked 
                 with E. You can place O only to the empty cell."""
game_prompt = """What is your next move? Think in steps. 
                 Each row and column should be in range 1..3. Write
                 the answer in JSON as {"ROW": ROW, "COLUMN": COLUMN}."""

In [8]:
template_llama2 = f"""<s>[INST]<<SYS>>{sys_prompt1}<</SYS>>
Here is the board image:
__BOARD__\n
{game_prompt}
[/INST]"""

template_llama3 = f"""<|begin_of_text|>
<|start_header_id|>system<|end_header_id|>{sys_prompt2}<|eot_id|>
<|start_header_id|>user<|end_header_id|>
Here is the board image:
__BOARD__\n
{game_prompt}
<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>"""

In [9]:
def make_prompt_llama2(board: List) -> str:
    """ Make Llama-2 prompt """
    return template_llama2.replace("__BOARD__", board_to_string(board))


def make_prompt_llama3(board: List) -> str:
    """ Make Llama-3 prompt """
    return template_llama3.replace("__BOARD__", board_to_string(board))

In [11]:
from typing import Optional
def extract_json(response: str) -> Optional[dict]:
    """ Extract dictionary from a response string """
    try:
        # Models sometimes to a mistake, fix: {ROW: 1, COLUMN: 2} => {"ROW": 1, "COLUMN": 2}
        response = response.replace('ROW:', '"ROW":').replace('COLUMN:', '"COLUMN":')
        # Extract json from a response
        pos_end = response.rfind("}")
        pos_start = response.rfind("{")
        return json.loads(response[pos_start:pos_end+1])
    except Exception as exp:
        print(f"extract_json::cannot parse output: {exp}")
    return None

In [12]:
def make_move(board_data: List, move: Optional[dict], symb: str):
    """ Update board with a new symbol """
    row, col = int(move["ROW"]), int(move["COLUMN"])
    if 1 <= row <= 3 and 1 <= col <= 3:
        if board_data[row - 1][col - 1] == "E":
            board_data[row - 1][col - 1] = symb
        else:
            print(f"Wrong move: cell {row}:{col} is not empty")
    else:
        print("Wrong move: incorrect index")

In [13]:
def check_for_end_game(board_data: List) -> bool:
    """ Check if there are no empty cells available """
    return board_to_string(board_data).find("E") == -1

def check_for_win(board_data: List) -> bool:
    """ Check if the game is over """
    # Check Horizontal and Vertical lines
    for ind in range(3):
        if board_data[ind][0] == board_data[ind][1] == board_data[ind][2] and board_data[ind][0] != "E":
            print(f"{board_data[ind][0]} win!")
            return True
        if board_data[0][ind] == board_data[1][ind] == board_data[2][ind] and board_data[0][ind] != "E":
            print(f"{board_data[0][ind]} win!")
            return True
    # Check Diagonals
    if board_data[0][0] == board_data[1][1] == board_data[2][2] and board_data[1][1] != "E" or \
       board_data[2][0] == board_data[1][1] == board_data[0][2] and board_data[1][1] != "E":
        print(f"{board_data[1][1]} win!")
        return True
    return False

In [21]:
import time
import json
num_wins1, num_wins2 = 0, 0
times_1, times_2 = [], []


def run_game():
    """ Run a game between two models """
    board = [["E", "E", "E"],
             ["E", "E", "E"],
             ["E", "E", "E"]]
    
    moves_limit = 20
    for step in range(moves_limit):
        print(f"Step {step+1}")
    
        # Move: Model-1
        t_start = time.monotonic()
        prompt = make_prompt_llama2(board)
        result_str = llm_make_move(llama2, prompt)
        times_1.append(time.monotonic() - t_start)
    
        new_data = extract_json(result_str)
        if new_data is not None:
            make_move(board, new_data, symb="X")
            if check_for_win(board):
                print('**Model 1 Won**')
                num_wins1 += 1
                break
            if check_for_end_game(board):
                break
    
        # Move: Model-2
        t_start = time.monotonic()
        prompt = make_prompt_llama3(board)
        result_str = llm_make_move(llama3, prompt)
        times_2.append(time.monotonic() - t_start)
    
        new_data = extract_json(result_str)
        if new_data is not None:
            make_move(board, new_data, symb="O")
            if check_for_win(board):
                print('**Model 2 Won**')
                num_wins2 += 1
                break
            if check_for_end_game(board):
                break
        print()

In [22]:
run_game()

Step 1


Llama.generate: prefix-match hit

llama_print_timings:        load time =     212.76 ms
llama_print_timings:      sample time =      18.70 ms /    44 runs   (    0.43 ms per token,  2352.82 tokens per second)
llama_print_timings: prompt eval time =      77.00 ms /    65 tokens (    1.18 ms per token,   844.19 tokens per second)
llama_print_timings:        eval time =     501.05 ms /    43 runs   (   11.65 ms per token,    85.82 tokens per second)
llama_print_timings:       total time =     678.64 ms /   108 tokens
Llama.generate: prefix-match hit

llama_print_timings:        load time =      85.59 ms
llama_print_timings:      sample time =      61.09 ms /    37 runs   (    1.65 ms per token,   605.63 tokens per second)
llama_print_timings: prompt eval time =      70.24 ms /    61 tokens (    1.15 ms per token,   868.43 tokens per second)
llama_print_timings:        eval time =     485.09 ms /    36 runs   (   13.47 ms per token,    74.21 tokens per second)
llama_print_timings:       to


Step 2



llama_print_timings:        load time =     212.76 ms
llama_print_timings:      sample time =      49.03 ms /   119 runs   (    0.41 ms per token,  2427.13 tokens per second)
llama_print_timings: prompt eval time =      61.74 ms /    64 tokens (    0.96 ms per token,  1036.54 tokens per second)
llama_print_timings:        eval time =    1370.22 ms /   118 runs   (   11.61 ms per token,    86.12 tokens per second)
llama_print_timings:       total time =    1705.74 ms /   182 tokens
Llama.generate: prefix-match hit


Wrong move: cell 2:1 is not empty



llama_print_timings:        load time =      85.59 ms
llama_print_timings:      sample time =     232.31 ms /   138 runs   (    1.68 ms per token,   594.03 tokens per second)
llama_print_timings: prompt eval time =      70.27 ms /    57 tokens (    1.23 ms per token,   811.15 tokens per second)
llama_print_timings:        eval time =    1837.15 ms /   137 runs   (   13.41 ms per token,    74.57 tokens per second)
llama_print_timings:       total time =    3105.38 ms /   194 tokens
Llama.generate: prefix-match hit


Wrong move: cell 2:1 is not empty

Step 3



llama_print_timings:        load time =     212.76 ms
llama_print_timings:      sample time =      49.55 ms /   118 runs   (    0.42 ms per token,  2381.67 tokens per second)
llama_print_timings: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =    1367.85 ms /   118 runs   (   11.59 ms per token,    86.27 tokens per second)
llama_print_timings:       total time =    1640.94 ms /   119 tokens
Llama.generate: prefix-match hit


extract_json::cannot parse output: Expecting value: line 1 column 1 (char 0)



llama_print_timings:        load time =      85.59 ms
llama_print_timings:      sample time =     104.22 ms /    62 runs   (    1.68 ms per token,   594.88 tokens per second)
llama_print_timings: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =     833.42 ms /    62 runs   (   13.44 ms per token,    74.39 tokens per second)
llama_print_timings:       total time =    1372.92 ms /    63 tokens
Llama.generate: prefix-match hit



Step 4



llama_print_timings:        load time =     212.76 ms
llama_print_timings:      sample time =      30.95 ms /    72 runs   (    0.43 ms per token,  2326.56 tokens per second)
llama_print_timings: prompt eval time =      60.46 ms /    60 tokens (    1.01 ms per token,   992.41 tokens per second)
llama_print_timings:        eval time =     823.26 ms /    71 runs   (   11.60 ms per token,    86.24 tokens per second)
llama_print_timings:       total time =    1051.15 ms /   131 tokens
Llama.generate: prefix-match hit


Wrong move: cell 1:2 is not empty



llama_print_timings:        load time =      85.59 ms
llama_print_timings:      sample time =     120.90 ms /    73 runs   (    1.66 ms per token,   603.80 tokens per second)
llama_print_timings: prompt eval time =      70.08 ms /    56 tokens (    1.25 ms per token,   799.11 tokens per second)
llama_print_timings:        eval time =     964.13 ms /    72 runs   (   13.39 ms per token,    74.68 tokens per second)
llama_print_timings:       total time =    1666.07 ms /   128 tokens


O win!
**Model 2 Won**


UnboundLocalError: cannot access local variable 'num_wins2' where it is not associated with a value