In [None]:
%pwd

In [None]:
%ls

In [None]:
import sys
import os

import torch
import random
from pathlib import Path
from dataclasses import dataclass
from sklearn.model_selection import train_test_split

import re

In [None]:
# Make the parent of the current working directory importable, so the
# project packages that live next to this notebook's folder can be imported.
absolute_parent_dir = os.path.abspath(os.pardir)
if absolute_parent_dir not in sys.path:
    sys.path.append(absolute_parent_dir)
    # Show the resulting search path once, right after it was extended.
    for path in sys.path:
        print(path)

In [None]:
from claim_gpt.create_files.create_files import create_files
from claim_gpt.create_model.create_model import create_model
from claim_gpt.train_model.train_model import train_model
from claim_gpt.validate_model.validate_model import validate_model

from corpus_base.corpus01.create_corpus01 import create_corpus01

from shared import Encoder
from shared import load_model

# output_folder_path

In [None]:
def ensure_folder(folder_path: Path, label: str) -> None:
    """Create ``folder_path`` (including missing parents) if it does not exist.

    Prints ``<label>=<folder_path>`` only when the folder was newly created,
    so re-running the cell on an existing tree stays silent.
    """
    if folder_path.exists():
        return
    folder_path.mkdir(parents=True, exist_ok=True)
    print(f'{label}={folder_path}')


output_folder_path = Path('math_gpt_output/').resolve()
corpus_folder_path = output_folder_path.joinpath('corpus')
models_folder_path = output_folder_path.joinpath('models/').resolve() # just one such folder for all models
model_folder_path = models_folder_path.joinpath('model/').resolve() # can have multiple such paths (using different folder names)

# Each message is labelled with the variable it actually reports.
ensure_folder(output_folder_path, 'output_folder_path')
ensure_folder(corpus_folder_path, 'corpus_folder_path')
ensure_folder(models_folder_path, 'models_folder_path')
ensure_folder(model_folder_path, 'model_folder_path')

# Create corpus01.txt

In [None]:
# create_corpus01() returns the path of the base corpus file; that path is
# passed to create_files() further down when the claim corpus is built.
# NOTE(review): whether this call regenerates the file on every run or reuses
# an existing one is not visible from this notebook - confirm in corpus_base.
corpus01_file_path = create_corpus01()
print(f'corpus01_file_path={corpus01_file_path}')

# Settings

In [None]:
@dataclass
class Settings:
    """Tunable parameters used when building the corpus in this notebook.

    Both attributes are dataclass fields, so either can be overridden at
    construction time, e.g. ``Settings(block_size=100, limit_count=1000)``.
    """
    block_size: int = 150 # max dictum size
    # The annotation is required: without it the name is a plain class
    # attribute and is not accepted by the generated __init__.
    limit_count: int = 1000 * 50 # max for corpus FIXME: not quite right

settings = Settings()

# Create corpus

In [None]:
%%time
# create claim corpus
block_size = settings.block_size
limit_count = settings.limit_count

# Reuse a previously generated corpus when present; otherwise build it.
corpus_file_path = output_folder_path.joinpath('corpus/corpus.txt')
if not corpus_file_path.exists():
    corpus_file_path = create_files(output_folder_path, corpus01_file_path, block_size=block_size, limit_count=limit_count)

# The split files are checked independently of the corpus: a cached corpus.txt
# without train/test files (e.g. after an interrupted run) would otherwise
# leave the training and validation cells without their input.
train_corpus_file_path = corpus_file_path.parent.joinpath("train_corpus.txt")
test_corpus_file_path = corpus_file_path.parent.joinpath("test_corpus.txt")
if not (train_corpus_file_path.exists() and test_corpus_file_path.exists()):
    with open(corpus_file_path, 'r') as file:
        corpus_statements = file.read().splitlines()
    # Fixed random_state keeps the 80/20 split reproducible across runs.
    X_train, X_test = train_test_split(corpus_statements, test_size=0.2, random_state=42)
    with open(train_corpus_file_path, "w") as outfile:
        outfile.write("\n".join(X_train))
    with open(test_corpus_file_path, "w") as outfile:
        outfile.write("\n".join(X_test))
print(f'corpus_file_path={corpus_file_path}')

# Create Encoder

In [None]:
# Load the encoder from the folder that holds corpus.txt.
# NOTE(review): presumably create_files() wrote the encoder JSON there - confirm.
encoder = Encoder.load_from_json(corpus_folder_path=corpus_file_path.parent)

# Create model

In [None]:
def set_up_model(corpus_file_path: Path, model_info: "tuple[str, int, int]", output_folder_path: Path) -> Path:
    """Call ``create_model`` for a checkpoint under ``<output_folder_path>/models``.

    Args:
        corpus_file_path: corpus file, passed through to ``create_model``.
        model_info: ``(model_name, n_head, n_layer)``. ``model_name`` is joined
            onto the models folder, so it may contain sub-folders
            (e.g. ``'model/model.pt'``).
        output_folder_path: root output folder that contains ``models/``.

    Returns:
        The resolved checkpoint path that was handed to ``create_model``.
    """
    model_name, n_head, n_layer = model_info
    # Named `models_folder_path` (plural) so it is not confused with the
    # notebook-level `model_folder_path`, which points one level deeper.
    models_folder_path = output_folder_path.joinpath('models/').resolve()
    model_file_path = models_folder_path.joinpath(model_name).resolve()
    create_model(model_file_path=model_file_path, corpus_file_path=corpus_file_path, n_head=n_head, n_layer=n_layer)
    return model_file_path

In [None]:
# set_up_model() calls create_model() for this checkpoint path and returns the path.
# NOTE(review): if create_model() overwrites an existing checkpoint, re-running
# this cell discards earlier training - confirm in claim_gpt.create_model.
model_info = ('model/model.pt', 10, 10) # model_name, n_head, n_layer
model_file_path = set_up_model(corpus_file_path=corpus_file_path, model_info=model_info, output_folder_path=output_folder_path)
print(f'model_file_path={model_file_path}')
# Use the GPU when torch reports one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Set before load_model() so tensors created from here on default to `device`.
torch.set_default_device(device)

print(f'device={device}')
print(f'loading model and optimizer from checkpoint={model_file_path}')
# load_model() returns the model together with its optimizer; both are used by the training cell.
model, optimizer = load_model(model_checkpoint_path=model_file_path, device=device, encoder=encoder)
print(f'model.device={model.device}')

# Train model

In [None]:
%%time
# Pick the epoch budget from the device the model reports.
# cpu: 10 * 10 * 5 is about 80 minutes
# gpu fast: 100 * 10 * 1 is about 5 minutes
# NOTE(review): this compares against the string 'cpu'; confirm model.device is
# a plain string and not a torch.device object.
max_train_epochs = (10 * 1 * 1 * 1) if model.device == 'cpu' else (100 * 10 * 10 * 1)
train_corpus_file_path = corpus_file_path.parent / 'train_corpus.txt'
train_model(
    model,
    optimizer,
    max_train_epochs=max_train_epochs,
    corpus_file_path=train_corpus_file_path,
    model_file_path=model_file_path,
)
print('Done')

# Validate model

In [None]:
%%time
# Number of validation examples depends on the device the model reports:
# better the model, the more examples one can do quicker
max_examples = (10 * 1) if model.device == 'cpu' else (100 * 1)
# Caps on how many failing / passing examples are handed to validate_model for printing.
max_print_error, max_print_ok = 10, 3
if max_examples > 0:
    print(f'--- validate model (count={max_examples}) ---')
    test_corpus_file_path = corpus_file_path.parent / 'test_corpus.txt'
    validate_model(
        model=model,
        max_examples=max_examples,
        max_print_error=max_print_error,
        max_print_ok=max_print_ok,
        corpus_file_path=test_corpus_file_path,
    )
print('Done')

# Simulate Deployment

Note: the code in this section is a rough sketch, not polished code.

Note:
We selected prefixes = ['ax-mp ', 'mp2 ', 'mp2b ', 'mpd ', 'syl '] in create_corpus.py.

But ax-mp appears most often.

TODO: should include these prefixes in the 'corpus' to be able to better filter.

TODO: add some more prefixes ???

TODO: print how many cases are there for each prefix.

In [None]:
def print_row(row):
    """Print ``row`` with a line break inserted before each claim marker.

    The text is split (without consuming anything) in front of
    ``<|start_claim|> ``, `` <|given|> ``, `` <|conclude|> `` and
    `` <|end_claim|>``, and every resulting piece is printed on its own line.
    """
    marker_lookahead = r"(?=<\|start_claim\|> | <\|given\|> | <\|conclude\|> | <\|end_claim\|>)"
    segments = re.split(marker_lookahead, row)
    # re.split always yields at least one piece, so this prints one line per piece.
    print(*segments, sep='\n')

In [None]:
from shared import get_encoded_statement
from shared import generate_predicted_dictum

def simulate_deployment(test_corpus_statements):
    """Prompt the model with one random test statement and report its answer.

    A statement is drawn at random, passed through ``get_encoded_statement``
    and ``encoder.decode``, and cut after the first ``<|conclude|>`` marker to
    form the prompt. The model is then asked to continue the prompt up to
    ``<|end_claim|>``.

    Relies on the notebook-level ``encoder``, ``block_size`` and ``model``.

    Args:
        test_corpus_statements: non-empty sequence of corpus statements.

    Returns:
        ``(statement, prompt, reply)``. ``reply`` is the text after the
        ``<|conclude|>`` marker when the prediction equals the sampled
        statement exactly, and ``'error'`` otherwise.

    Raises:
        ValueError: if ``test_corpus_statements`` is empty.
    """
    if not test_corpus_statements:
        # Fail with a clear message instead of random.choice's opaque IndexError.
        raise ValueError('test_corpus_statements is empty: nothing to sample')

    conclude_separator = ' <|conclude|> '
    terminal_token = '<|end_claim|>'

    statement = random.choice(test_corpus_statements)
    encoded_val_statement = get_encoded_statement(statement, encoder, block_size)
    val_statement = encoder.decode(encoded_val_statement)
    # Keep everything up to and including the conclude marker as the prompt.
    prompt = val_statement.split(conclude_separator)[0] + ' <|conclude|>'
    predicted_dictum = generate_predicted_dictum(prompt=prompt, terminal_token=terminal_token, model=model)

    reply = 'error'
    # NOTE(review): the prediction is compared with the raw statement, not the
    # decoded `val_statement`; if encoding alters the text (e.g. truncation to
    # block_size) the two can never match - confirm this is intended.
    if statement == predicted_dictum:
        parts = predicted_dictum.split(conclude_separator)
        if len(parts) > 1:
            reply = parts[1]
    return statement, prompt, reply

In [None]:
# Put the model into evaluation mode before generating from it.
model.eval()

# Read the held-out statements, one per line.
test_corpus_file_path = corpus_file_path.parent / 'test_corpus.txt'
print(f'test_corpus_file_path={test_corpus_file_path}')
test_corpus_statements = test_corpus_file_path.read_text().splitlines()
print(f'test_corpus_statement_count={len(test_corpus_statements)}')

# Subset of statements in which the substring 'given' occurs more than twice.
mp_test_corpus_statements = list(filter(lambda line: line.count('given') > 2, test_corpus_statements))
print(f'mp_test_corpus_statements={len(mp_test_corpus_statements)}')

In [None]:
# Rerun this cell to simulate deployment
# Samples from the full test corpus, then shows the prompt and the model's reply.
statement, prompt, reply = simulate_deployment(test_corpus_statements)
print_row(f'You:{prompt} ')
print_row(f'Model:\n{reply} ')

In [None]:
# Rerun this cell to simulate deployment for prefixes mp2 and mp2b
# Samples only from mp_test_corpus_statements (statements with more than two 'given').
statement, prompt, reply = simulate_deployment(mp_test_corpus_statements)
print_row(f'You:{prompt} ')
print_row(f'Model:\n{reply} ')