In [1]:
%cd ~/spikes/sudoku
from copy import deepcopy
import json
import os
from getpass import getpass
from IPython.display import clear_output
from langchain.callbacks import get_openai_callback

import sudoku.display
import sudoku.validation
import sudoku.construction
from sudoku.display import display_sudoku, display_sudoku_comparison, display_outcomes, display_games
from sudoku.validation import is_proposed_solution_valid, is_single_cell_proposed, analysis_to_puzzle_solution
from sudoku.construction import construct_puzzle_solution, pluck, weighted_random_choice, make_puzzle

from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage, AIMessage, SystemMessage

  bkms = self.shell.db.get('bookmarks', {})
  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


/Users/allanniemerg/spikes/sudoku


In [2]:
# Configure the OpenAI credentials for this session.
# Reuse a key already exported in the environment so the notebook survives
# "Restart & Run All" without a prompt; otherwise ask for it with getpass(),
# which does not echo the key and keeps it out of the saved notebook.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY') or getpass('OpenAI API key: ')

os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY

 ········


#### Create an Evaluation Set 

In [14]:
# Build the evaluation set: 100 (puzzle, solution) pairs.
eval_set = []
for sample_index in range(100):
    solution = construct_puzzle_solution()
    # pluck() is given a deep copy so `solution` itself is left untouched;
    # only the third element of its result (the history) is used here.
    _, _, history = pluck(deepcopy(solution))
    # The first history entry is skipped; one of the remaining entries is
    # drawn by weighted_random_choice() and used as the puzzle.
    candidate_puzzles = history[1:]
    puzzle = weighted_random_choice(candidate_puzzles)
    eval_set.append((puzzle, solution))

In [17]:
# Peek at the first three (puzzle, solution) pairs.
eval_set[:3]

[([[4, 3, 1, 2], [0, 1, 3, 4], [1, 4, 2, 0], [3, 2, 4, 1]],
  [[4, 3, 1, 2], [2, 1, 3, 4], [1, 4, 2, 3], [3, 2, 4, 1]]),
 ([[3, 4, 1, 2], [0, 0, 3, 4], [4, 1, 0, 3], [0, 0, 0, 1]],
  [[3, 4, 1, 2], [1, 2, 3, 4], [4, 1, 2, 3], [2, 3, 4, 1]]),
 ([[2, 3, 1, 4], [4, 0, 0, 2], [3, 4, 2, 1], [1, 2, 0, 3]],
  [[2, 3, 1, 4], [4, 1, 3, 2], [3, 4, 2, 1], [1, 2, 4, 3]])]

In [20]:
# Serialize the evaluation set under the 'eval' key.
# Note: JSON has no tuple type, so each (puzzle, solution) tuple is stored as
# a two-element array and comes back as a list when reloaded.
eval_payload = {'eval': eval_set}
json_string = json.dumps(eval_payload)

# Persist the serialized set to disk.
with open('./data/evalset', mode='w') as evalset_file:
    evalset_file.write(json_string)

### Test GPT-3.5-turbo

In [14]:
# Reload the saved evaluation set; the samples live under the 'eval' key.
with open('./data/evalset', mode='r') as evalset_file:
    content = evalset_file.read()
evalset_document = json.loads(content)
eval_set_loaded = evalset_document['eval']

In [6]:
# Prompt template sent to the model under test. The single `{}` placeholder is
# filled with the puzzle (a list of rows) via `brief_analysis.format(puzzle)`.
# Because str.format() is used, any literal curly braces added to this text
# would have to be doubled (`{{` / `}}`).
brief_analysis = '''We are working on the following sudoku puzzle (each sub-list represents a row):
{}

You are a sudoku tutor. Create a brief analysis that finds an unsolved cell and solves it. 
Do not repeat the puzzle (which the student has seen). Just solve one cell that currently has a zero.
I suggest you start by examining which rows, columns, or regions have the most cells 
already solved. You can use this to identify one or more cells that are not currently solved but may be 
solvable from the available information. Then identify the solution to that cell. 

Your analysis must then solve ONLY one cell by replacing 0 with the correct number. Please don't include the 
puzzle in your analysis, we will provide that to the student seperately. 

Example puzzle: 
[[0, 0, 0, 0], [0, 0, 3, 2], [1, 0, 0, 0], [2, 0, 1, 4]]

Your analysis could look like this:
The row with the most solved cells is row 4 with numbers: 1 2 and 4. Because each row must contain the digits 1-4, 
the unsolved cell must be 3. Therefore row 4, column 2 is the number 3.'''

In [16]:
# Model under test, with temperature pinned to 0.0.
model_name = "gpt-3.5-turbo-1106"
model = ChatOpenAI(model=model_name, temperature=0.0)

# Separate model used to extract the proposed move from the free-text analysis.
extraction_model = ChatOpenAI(model="gpt-4-1106-preview")

<IPython.core.display.Javascript object>

In [21]:
# Evaluate the model under test on every sample of the evaluation set.
# Each outcome records the verdict, the proposed grid, the original puzzle and
# the model's reasoning text.
outcomes = []

# The callback context accumulates usage for the calls made inside it;
# the summary is printed once the loop has finished.
with get_openai_callback() as cb:
    for puzzle, solution in eval_set_loaded:
        # Ask the model under test for a single-cell analysis of this puzzle.
        prompt = brief_analysis.format(puzzle)
        message = model.invoke([HumanMessage(content=prompt)])
        reasoning = message.content
        try:
            proposed = analysis_to_puzzle_solution(extraction_model, puzzle, reasoning)
            is_valid = is_proposed_solution_valid(puzzle, solution, proposed)
            result = "correct" if is_valid else "incorrect"
        except Exception:
            # Any extraction/validation failure is scored as incorrect.
            # `Exception` (rather than a bare `except:`) lets KeyboardInterrupt
            # and SystemExit propagate, so interrupting the kernel stops the run
            # instead of being recorded as a wrong answer.
            proposed = puzzle
            result = 'incorrect'
            # Leading space keeps the note from being glued onto the reasoning.
            reasoning += " The model failed to produce a parseable response."

        outcomes.append({'result': result, 'proposed': proposed, 'puzzle': puzzle, 'reasoning': reasoning})
        # Redraw the running results in place after every sample.
        clear_output(wait=True)
        display_outcomes(outcomes)
    print(cb)

Tokens Used: 98848
	Prompt Tokens: 87874
	Completion Tokens: 10974
Successful Requests: 200
Total Cost (USD): $1.9882680000000004


In [24]:
# Serialize the collected outcomes under the 'outcomes' key.
outcomes_payload = {'outcomes': outcomes}
json_string = json.dumps(outcomes_payload)

# Persist the GPT-3.5 results to disk.
with open('./data/gpt-3_5_outcomes', mode='w') as outcomes_file:
    outcomes_file.write(json_string)

### Test GPT-4 Turbo

In [25]:
# Reload the saved evaluation set; the samples live under the 'eval' key.
with open('./data/evalset', mode='r') as evalset_file:
    content = evalset_file.read()
evalset_document = json.loads(content)
eval_set_loaded = evalset_document['eval']

In [26]:
# Model under test, with temperature pinned to 0.0.
model_name = "gpt-4-1106-preview"
model = ChatOpenAI(model=model_name, temperature=0.0)

# Separate model used to extract the proposed move from the free-text analysis.
extraction_model = ChatOpenAI(model="gpt-4-1106-preview")

In [27]:
# Evaluate the model under test on every sample of the evaluation set.
# Each outcome records the verdict, the proposed grid, the original puzzle and
# the model's reasoning text.
outcomes = []

# The callback context accumulates usage for the calls made inside it;
# the summary is printed once the loop has finished.
with get_openai_callback() as cb:
    for puzzle, solution in eval_set_loaded:
        # Ask the model under test for a single-cell analysis of this puzzle.
        prompt = brief_analysis.format(puzzle)
        message = model.invoke([HumanMessage(content=prompt)])
        reasoning = message.content
        try:
            proposed = analysis_to_puzzle_solution(extraction_model, puzzle, reasoning)
            is_valid = is_proposed_solution_valid(puzzle, solution, proposed)
            result = "correct" if is_valid else "incorrect"
        except Exception:
            # Any extraction/validation failure is scored as incorrect.
            # `Exception` (rather than a bare `except:`) lets KeyboardInterrupt
            # and SystemExit propagate, so interrupting the kernel stops the run
            # instead of being recorded as a wrong answer.
            proposed = puzzle
            result = 'incorrect'
            # Leading space keeps the note from being glued onto the reasoning.
            reasoning += " The model failed to produce a parseable response."

        outcomes.append({'result': result, 'proposed': proposed, 'puzzle': puzzle, 'reasoning': reasoning})
        # Redraw the running results in place after every sample.
        clear_output(wait=True)
        display_outcomes(outcomes)
    print(cb)

Tokens Used: 110824
	Prompt Tokens: 93862
	Completion Tokens: 16962
Successful Requests: 200
Total Cost (USD): $1.4474800000000003


In [28]:
# Serialize the collected outcomes under the 'outcomes' key.
outcomes_payload = {'outcomes': outcomes}
json_string = json.dumps(outcomes_payload)

# Persist the GPT-4 results to disk.
with open('./data/gpt-4_outcomes', mode='w') as outcomes_file:
    outcomes_file.write(json_string)

### Solving whole puzzles

In [3]:
def is_fully_solved(puzzle):
    """Return True when no cell in the puzzle equals 0.

    `puzzle` is iterated as rows of cells; a cell equal to 0 marks an
    unsolved position. An empty puzzle (or one with only empty rows)
    counts as solved.
    """
    return not any(cell == 0 for row in puzzle for cell in row)

In [4]:
# Separate model used to extract the proposed move from the free-text analysis.
extraction_model = ChatOpenAI(model="gpt-4-1106-preview")

# Model that plays the games. No temperature is passed here, so whatever
# default ChatOpenAI uses applies.
model_name = "gpt-4-1106-preview"
model = ChatOpenAI(model=model_name)

  warn_deprecated(


In [11]:
# Let the model play 20 whole puzzles, one cell per move.
# Each game is stored as {'move': [(puzzle_state, reasoning, result), ...]},
# where result is "correct", "incorrect" or "solved".
games = []
for i in range(20):
    moves = []
    solution = construct_puzzle_solution()
    # pluck() works on a deep copy so `solution` stays intact; its first
    # return value is used as the starting puzzle.
    puzzle, _, _ = pluck(deepcopy(solution))
    moves.append((puzzle, "", "correct"))
    # Each game is capped at 12 model moves.
    for j in range(12):
        prompt = brief_analysis.format(puzzle)
        message = model.invoke([HumanMessage(content=prompt)])
        reasoning = message.content
        try:
            proposed = analysis_to_puzzle_solution(extraction_model, puzzle, reasoning)
            clear_output(wait=True)
            display_sudoku_comparison(proposed, puzzle)
            is_valid = is_proposed_solution_valid(puzzle, solution, proposed)
            result = "correct" if is_valid else "incorrect"
        except Exception:
            # Any extraction/validation failure ends the game as incorrect.
            # `Exception` (not a bare `except:`) lets KeyboardInterrupt and
            # SystemExit propagate, so the run can still be interrupted.
            result = 'incorrect'
            reasoning += " The model failed to produce a parseable response."
            proposed = puzzle

        # Advance the board to the proposed state (also recorded for incorrect
        # moves, so the offending grid is visible in the game log).
        if proposed:
            puzzle = proposed

        # Only a *correct* move can complete the puzzle. Without this guard an
        # incorrect proposal that fills the last empty cell would be relabelled
        # "solved".
        if result == "correct" and is_fully_solved(puzzle):
            result = "solved"

        moves.append((puzzle, reasoning, result))
        if result == "solved" or result == "incorrect":
            break

    games.append({'move': moves})

0,1,2,3
1.0,3.0,4.0,2.0
2.0,,,
4.0,,,
,1.0,1.0,


In [12]:
# Show the recorded games as this cell's output (plain repr of the list).
games

[{'move': [([[2, 0, 0, 1], [0, 1, 2, 0], [4, 0, 0, 0], [0, 3, 0, 0]],
    '',
    'correct'),
   ([[2, 0, 0, 1], [0, 1, 2, 0], [4, 2, 0, 0], [0, 3, 0, 0]],
    "Upon examining the available rows, columns, and regions, the third row, which contains the numbers 4 and three unsolved cells, stands out. The numbers 1, 2, and 3 are missing in this row. To determine which number goes into a specific cell, we should look at the columns and the 2x2 regions.\n\nFor the third row's second cell (row 3, column 2), we notice that the numbers in the second column are 1 and 3. Since 4 is already in the row, the only possible number for this cell is 2. Therefore, row 3, column 2 must be the number 2.",
    'correct'),
   ([[2, 4, 0, 1], [0, 1, 2, 0], [4, 2, 0, 0], [0, 3, 0, 0]],
    "The row with the most solved cells is row 1 with numbers: 2 and 1. Because each row must contain the digits 1-4 without repeats, and the numbers 2 and 1 are already present, we only have the numbers 3 and 4 left to place i

In [13]:
# Pass the recorded games to the project's display helper (presumably renders
# each game's sequence of moves — see sudoku.display).
display_games(games)

In [14]:
# Serialize the recorded games under the 'games' key.
# Note: JSON has no tuple type, so each (puzzle, reasoning, result) move tuple
# is stored as an array and comes back as a list when reloaded.
games_payload = {'games': games}
json_string = json.dumps(games_payload)

# Persist the game logs to disk.
with open('./data/gpt-4-turbo-games', mode='w') as games_file:
    games_file.write(json_string)