In [54]:
from sgfmill import sgf
from sgfmill import boards


In [None]:
import json
import glob
def _read_comment(node):
    """Return the text of *node*'s ``C`` (comment) property, or ``None``.

    The library's own decoding (``node.get``) is tried first.  If that raises
    ``UnicodeDecodeError`` the raw bytes of the property are decoded with a
    short list of fallback encodings instead.  ``None`` is returned (after
    printing a warning) only when the raw value cannot be obtained at all.
    """
    try:
        return node.get("C")
    except UnicodeDecodeError:
        pass

    try:
        # node.properties() only lists identifiers, so the raw bytes have to
        # be fetched through get_raw().
        raw_comment = node.get_raw("C")
    except Exception:
        # Best effort: a single unreadable comment must not abort the game.
        print("Warning: Could not process comment - skipping")
        return None

    # NOTE(review): the raw value may still contain SGF escape sequences;
    # confirm whether it should be unescaped before use.
    for encoding in ("utf-8", "cp1252"):
        try:
            return raw_comment.decode(encoding)
        except UnicodeDecodeError:
            continue
    # latin1 maps every byte to a character, so it cannot fail and is tried
    # last as the catch-all.
    return raw_comment.decode("latin1")


def _build_prompt(system_str, board_size, moves):
    """Return the prompt text listing the board size and the moves so far."""
    numbered_moves = "\n".join(
        f"{number}. {move}" for number, move in enumerate(moves, start=1)
    )
    prompt_str = f"Board size: {board_size}\nMoves so far:\n" + numbered_moves
    return system_str + "\n\n" + prompt_str


def get_data(sgf_game):
    """Build prompt/response training pairs from the commented nodes of a game.

    The main sequence of *sgf_game* is replayed on a fresh board.  Every node
    that carries a non-empty comment yields one dict with the keys
    ``"prompt"`` (teacher instruction, board size and the moves played so
    far, including the node's own move) and ``"response"`` (the comment).

    A node whose move targets an already occupied point is skipped entirely,
    together with its comment.

    Args:
        sgf_game: game object providing ``get_size()`` and
            ``get_main_sequence()`` (an sgfmill ``Sgf_game``).

    Returns:
        A list of ``{"prompt": str, "response": str}`` dicts in game order.
    """
    board_size = sgf_game.get_size()
    board = boards.Board(board_size)

    # Loop-invariant instruction that prefixes every prompt.
    system_str = (
        "You are a professional Go teacher. "
        "Please provide commentary on the current position."
    )

    data_pairs = []
    moves = []
    for node in sgf_game.get_main_sequence():
        color, coords = node.get_move()

        # Apply real moves to the board; passes and move-less nodes have
        # nothing to play.
        if color is not None and coords is not None:
            row, col = coords
            if board.get(row, col) is not None:
                # Illegal move onto an occupied point: drop the whole node so
                # the recorded move list stays consistent with the board.
                continue
            board.play(row, col, colour=color)
            moves.append(f"{'B' if color == 'b' else 'W'} ({row},{col})")

        if "C" not in node.properties():
            continue

        comment = _read_comment(node)
        if comment:
            data_pairs.append({
                "prompt": _build_prompt(system_str, board_size, moves),
                "response": comment,
            })

    return data_pairs

def load_data(sgf_file):
    """Load one SGF file and return its commentary training pairs.

    The file is parsed with the encoding it declares.  If parsing fails
    because that encoding name is unknown, the data is parsed again as UTF-8.
    Any error (missing file, malformed SGF, failure while extracting the
    pairs) is reported on stdout and results in an empty list, so one bad
    file does not stop a batch run.

    Args:
        sgf_file: path of the SGF file to read.

    Returns:
        The list produced by ``get_data`` for the game, or ``[]`` on error.
    """
    try:
        # Read the bytes once; they are reused if a second parse is needed.
        with open(sgf_file, "rb") as f:
            content = f.read()

        try:
            sgf_game = sgf.Sgf_game.from_bytes(content)
        except ValueError as e:
            if "unknown encoding" not in str(e):
                raise
            # Force UTF-8 through the parser's own override instead of
            # patching "CA[" in the raw bytes, which would also rewrite any
            # comment text that happens to contain that sequence.
            sgf_game = sgf.Sgf_game.from_bytes(
                content, override_encoding="UTF-8"
            )

        return get_data(sgf_game)
    except Exception as e:
        # Batch boundary: report the failure and carry on with other files.
        print(f"Error processing {sgf_file}: {e}")
        return []


# Collect the commentary training pairs of every SGF file under data/.
sgf_paths = glob.glob("data/*.sgf")
data_pairs = []
for sgf_file in sgf_paths:
    # Echo the path so progress (and the culprit of any error) is visible.
    print(sgf_file)
    data_pairs += load_data(sgf_file)

# Write all pairs to one pretty-printed JSON file, keeping non-ASCII
# characters as-is rather than \u-escaping them.
with open("my_go_data.json", mode="w", encoding="utf-8") as f:
    json.dump(data_pairs, f, indent=2, ensure_ascii=False)
