In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig

# Hugging Face Hub id of the checkpoint used by the inference cells below.
model_name = "deepseek-ai/deepseek-math-7b-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Load the weights in bfloat16; device_map="auto" leaves device placement to the library.
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="auto")
# Replace the model's generation settings with the ones published for this checkpoint.
model.generation_config = GenerationConfig.from_pretrained(model_name)
# Reuse the end-of-sequence id as the padding id during generation.
model.generation_config.pad_token_id = model.generation_config.eos_token_id


  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Downloading shards: 100%|██████████| 2/2 [20:07<00:00, 603.84s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:35<00:00, 17.70s/it]


 The integral of x^2 from 0 to 2 is given by the antiderivative of x^2 evaluated at the endpoints 0 and 2.
The antiderivative of x^2 is (1/3)x^3.
So, the integral is (1/3)(2^3) - (1/3)(0^3) = (1/3)(8) - (1/3)(0) = 8/3


In [13]:
# Partial LaTeX proof that the model is asked to continue. It is a raw string so
# the LaTeX backslashes reach the prompt unchanged.
statement = r"""
Since $\Omega$ is countable then that means there exists a mapping from $\mathbb{N} \mapsto \Omega$. Therefore for each $\omega \in \Omega$ we can write $A_1 \mapsto \omega_1$, $A_2 \mapsto \omega_2$, $A_3 \mapsto \omega_3$, and so forth where $\omega_i$ is the $i^{\text{th}}$ element in $\Omega$. Since $F_{\omega_i}$ is countable then that means there exists a mapping from the natural numbers to $F_{\omega_i}$. Therefore we can write  """

# Single-turn chat: the user message is the instruction followed by the statement.
# Inside the f-string, "{{}}" renders as a literal "{}" (i.e. "\boxed{}").
messages = [
    {"role": "user",
     "content": f"""Please reason step by step to complete the latex proof and put your final latex answer within \\boxed{{}}. The statement is:\n{statement}"""
    }]

# Render the chat template to token ids and append the generation prompt.
input_tensor = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
# Generation is capped at 50 new tokens.
# NOTE(review): the recorded output stops mid-expression, presumably because of
# this cap - consider raising max_new_tokens.
outputs = model.generate(input_tensor.to(model.device), max_new_tokens=50)

# Drop the prompt tokens (the first input_tensor.shape[1] positions) and decode only the continuation.
result = tokenizer.decode(outputs[0][input_tensor.shape[1]:], skip_special_tokens=True)
print(result)


 Since $\Omega$ is countable then that means there exists a mapping from $\mathbb{N} \mapsto \Omega$. Therefore for each $\omega \in \Omega$ we can write $A_1 \mapsto \omega_1$, $A


In [14]:
from dataclasses import dataclass
from typing import List, Dict
import datasets as d
import csv
from omegaconf import MISSING


@dataclass
class DatasetConfig:
    """Settings read by ``DatasetProcessor`` when it builds the fine-tuning CSV.

    ``instruction``, ``inputPath`` and ``outputPath`` default to OmegaConf's
    ``MISSING`` marker, so callers are expected to provide them explicitly.
    """

    # Instruction text copied into the "instruction" column of every record.
    instruction: str = MISSING
    # Path of the raw text file opened by ``DatasetProcessor.read_data``.
    inputPath: str = MISSING
    # Path of the CSV file written by ``DatasetProcessor.save_dataset``.
    outputPath: str = MISSING
    # Smallest prefix length (in characters) used when a sentence is split.
    cutoff: int = 7


class DatasetProcessor:
    """Build (instruction, input, output) completion records from a text file.

    The raw text is split into sentences on ``"."``. Every sentence that is
    long enough is then cut at each position from ``config.cutoff`` up to (but
    not including) its last character: the part before the cut becomes the
    ``input`` and the remainder becomes the ``output``.
    """

    def __init__(self, config: "DatasetConfig"):
        """Store the configuration (``instruction``, ``inputPath``,
        ``outputPath`` and ``cutoff`` are read from it)."""
        self.config = config

    def read_data(self) -> List[str]:
        """Read ``config.inputPath`` and return its cleaned sentences.

        Returns:
            The list produced by ``_preprocess_data`` for the file's contents.
        """
        # Decode explicitly as UTF-8 so reading matches the encoding that
        # ``save_dataset`` uses for writing, independent of the locale.
        with open(self.config.inputPath, "r", encoding="utf-8") as file:
            data = file.read()
        return self._preprocess_data(data)

    def _preprocess_data(self, data: str) -> List[str]:
        """Split ``data`` on ``"."`` and normalise whitespace in each piece.

        Leading and trailing newlines are removed, inner newlines become
        spaces, and double spaces are collapsed once. Empty pieces are kept;
        ``create_dataset`` filters short sentences.
        """
        # Trim newline characters only. ``str.strip`` takes a *set* of
        # characters, so passing "\n\\n" would also remove backslashes and the
        # letter "n" from both ends (e.g. turning "\nabla" into "abla").
        return [
            sentence.strip("\n").replace("\n", " ").replace("  ", " ")
            for sentence in data.split(".")
        ]

    def create_dataset(self, data: List[str]) -> List[Dict]:
        """Expand every sentence into prefix/continuation training records.

        Args:
            data: Cleaned sentences, e.g. the result of ``read_data``.

        Returns:
            A list of dicts with the keys ``"instruction"``, ``"input"`` and
            ``"output"``. For each record ``input + output`` equals the
            sentence it was cut from.
        """
        cutoff = self.config.cutoff
        instruction = f"{self.config.instruction}"
        dataset = []
        for sentence in data:
            # Sentences too short to yield a prefix of ``cutoff`` characters
            # are skipped.
            if len(sentence) - 1 < cutoff:
                continue

            for split_at in range(cutoff, len(sentence) - 1):
                dataset.append(
                    {
                        "instruction": instruction,
                        # The prefix always starts at the beginning of the
                        # sentence, independent of where the sentence sits in
                        # ``data``.
                        "input": sentence[:split_at],
                        "output": sentence[split_at:],
                    }
                )

        return dataset

    def save_dataset(self, dataset: List[Dict]) -> None:
        """Write ``dataset`` to ``config.outputPath`` as a UTF-8 CSV file with
        the header ``instruction,input,output``."""
        # newline="" lets the csv module control line endings itself.
        with open(
            self.config.outputPath, mode="w", newline="", encoding="utf-8"
        ) as file:
            writer = csv.DictWriter(file, fieldnames=["instruction", "input", "output"])
            writer.writeheader()
            writer.writerows(dataset)


def process_dataset(config: DatasetConfig) -> None:
    """Run the whole text-to-CSV pipeline described by ``config``.

    The input file is read and split into sentences, the sentences are
    expanded into training records, and the records are written to the
    configured output path.
    """
    pipeline = DatasetProcessor(config)
    records = pipeline.create_dataset(pipeline.read_data())
    pipeline.save_dataset(records)


def get_dataset(config: DatasetConfig):
    """Regenerate the CSV described by ``config`` and load it with ``datasets``.

    Args:
        config: Dataset settings; ``config.outputPath`` is both written by
            ``process_dataset`` and read back here.

    Returns:
        Whatever ``datasets.load_dataset`` returns for the generated file.
    """
    process_dataset(config)
    # ``DatasetProcessor.save_dataset`` always writes CSV, so the "csv" loader
    # is named explicitly instead of being guessed from the last three
    # characters of the file name (which breaks for any other suffix).
    return d.load_dataset("csv", data_files=config.outputPath)


# Settings for the LaTeX proof-completion dataset.
# The instruction is a plain (non-f) string, so the braces after "\boxed" are
# written once; doubled braces would be stored literally and would no longer
# match the "\boxed{}" wording used in the inference prompt.
dataCFG = {
    "instruction": "Please reason step by step to complete the latex proof and put your final latex answer within \\boxed{}. The statement is:\n",
    "inputPath": "./dataset/latex.txt",
    "outputPath": "./dataset/latex.csv",
    "cutoff": 7,
}

# Forward every entry of dataCFG, including "cutoff", so editing the dict above
# is enough to change the generated dataset.
cfg = DatasetConfig(
    instruction=dataCFG["instruction"],
    inputPath=dataCFG["inputPath"],
    outputPath=dataCFG["outputPath"],
    cutoff=dataCFG["cutoff"],
)

# Build ./dataset/latex.csv from ./dataset/latex.txt.
process_dataset(cfg)


In [None]:
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, PeftModel, get_peft_model
from trl import SFTTrainer
import hydra
from hydra.core.config_store import ConfigStore
from omegaconf import OmegaConf, MISSING
from ds import get_dataset, DatasetConfig
from dataclasses import dataclass
import os
import shutil

class ModelTrainer:
    """Fine-tune a causal language model with LoRA adapters and save the result.

    ``train`` is the entry point. It prepares the output directory, loads the
    tokenizer and model, builds tokenized train/test splits, runs an
    ``SFTTrainer`` and finally saves both the adapter and a merged model.

    The configuration object is only accessed through attributes: ``output``,
    ``name``, ``overwrite``, ``prefix_txt``, ``dataCFG``, ``seed``,
    ``test_size``, ``model_id``, ``modules_limit``, ``r``, ``lora_alpha`` and
    the hyper-parameters forwarded to ``TrainingArguments`` in ``train``.
    """

    # The annotation is a string so that defining the class does not require
    # the config type to exist yet (annotations are otherwise evaluated when
    # the ``def`` statement runs).
    def __init__(self, cfg: "ftConfig"):
        self.cfg = cfg
        self.device = self._get_device()
        # Both are populated by ``setup_model``.
        self.tokenizer = None
        self.model = None

    def _get_device(self):
        """Choose where to load the model: MPS if available, else CUDA, else CPU."""
        if torch.backends.mps.is_available():
            return "mps"
        return torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def setup_output_directory(self):
        """Create ``<output>/<name>/`` and store the configuration there.

        Raises:
            FileExistsError: if ``config.yaml`` already exists in the run
                directory and ``cfg.overwrite`` is false.
        """
        # One path is used for the existence check, the removal and the
        # creation, so all three always refer to the same directory.
        run_dir = f"{self.cfg.output}/{self.cfg.name}"
        config_path = f"{run_dir}/config.yaml"

        if os.path.exists(config_path):
            if not self.cfg.overwrite:
                raise FileExistsError(
                    "Output directory exists. Set overwrite to true to overwrite."
                )
            shutil.rmtree(run_dir)

        os.makedirs(run_dir, exist_ok=True)
        with open(config_path, "w") as file:
            file.write(OmegaConf.to_yaml(self.cfg))

    def prepare_dataset(self):
        """Build the tokenized train and test splits.

        Each row of the ``"train"`` split returned by ``get_dataset`` is
        rendered into a prompt string, stored in a new ``"prompt"`` column,
        shuffled, tokenized and split.

        Returns:
            A ``(train, test)`` tuple.

        Raises:
            RuntimeError: if ``setup_model`` has not created the tokenizer yet.
        """
        if self.tokenizer is None:
            raise RuntimeError("Call setup_model() before prepare_dataset().")

        def generate_prompt(data_point):
            # Chat-style template with user and model turns.
            prefix_text = self.cfg.prefix_txt
            text = r""" <start_of_turn>user {prefix_text} {instruction}  {input} <end_of_turn> <start_of_turn>model {output} <end_of_turn>""".format(
                prefix_text=prefix_text,
                instruction=data_point["instruction"],
                input=data_point["input"],
                output=data_point["output"],
            )

            return text

        ds = get_dataset(self.cfg.dataCFG)
        text_column = [generate_prompt(data_point) for data_point in ds["train"]]
        ds = ds["train"].add_column("prompt", text_column)
        ds = ds.shuffle(seed=self.cfg.seed)
        ds = ds.map(lambda samples: self.tokenizer(samples["prompt"]), batched=True)
        # Seed the split as well; otherwise the train/test membership changes
        # from run to run even though the shuffle above is seeded.
        ds = ds.train_test_split(test_size=self.cfg.test_size, seed=self.cfg.seed)
        return ds["train"], ds["test"]

    def setup_model(self):
        """Load tokenizer and model and wrap the model with LoRA adapters.

        Returns:
            The ``LoraConfig`` that was applied to the model.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.cfg.model_id, add_eos_token=True, padding_side="left"
        )
        self.model = AutoModelForCausalLM.from_pretrained(
            self.cfg.model_id, device_map=self.device
        )
        self.model.gradient_checkpointing_enable()

        # At most ``modules_limit`` module names are targeted. Slicing already
        # returns the full list when it is shorter than the limit.
        target_modules = self._find_all_linear_names()[: self.cfg.modules_limit]

        lora_config = LoraConfig(
            r=self.cfg.r,
            lora_alpha=self.cfg.lora_alpha,
            target_modules=target_modules,
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM",
        )

        self.model = get_peft_model(self.model, lora_config)
        return lora_config

    def _find_all_linear_names(self):
        """Return the distinct leaf names of all ``torch.nn.Linear`` submodules.

        The names are returned in sorted order so that truncating the list
        (see ``setup_model``) selects the same modules on every run.
        """
        lora_module_names = set()
        for name, module in self.model.named_modules():
            if isinstance(module, torch.nn.Linear):
                # The last path component is the attribute name of the layer;
                # for a single-component name this is the name itself.
                lora_module_names.add(name.split(".")[-1])
        return sorted(lora_module_names)

    def train(self):
        """Run the complete fine-tuning pipeline and save the resulting models."""
        self.setup_output_directory()
        lora_config = self.setup_model()
        train_data, test_data = self.prepare_dataset()

        # Padding is configured for the collator used during training: the
        # EOS token doubles as the pad token and padding goes on the right.
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.tokenizer.padding_side = "right"

        trainer = SFTTrainer(
            model=self.model,
            train_dataset=train_data,
            eval_dataset=test_data,
            dataset_text_field="prompt",
            peft_config=lora_config,
            max_seq_length=250,
            args=transformers.TrainingArguments(
                per_device_train_batch_size=self.cfg.per_device_train_batch_size,
                gradient_accumulation_steps=self.cfg.gradient_accumulation_steps,
                warmup_steps=self.cfg.warmup_steps,
                max_steps=self.cfg.max_steps,
                learning_rate=self.cfg.learning_rate,
                logging_steps=self.cfg.logging_steps,
                output_dir=f"{self.cfg.output}/{self.cfg.name}/checkpoints",
                optim=self.cfg.optim,
                save_strategy="epoch",
            ),
            # mlm=False selects causal (next-token) language-model labels.
            data_collator=transformers.DataCollatorForLanguageModeling(
                self.tokenizer, mlm=False
            ),
        )

        trainer.train()
        self.save_models(trainer)

    def save_models(self, trainer):
        """Save the LoRA adapter and a merged copy of the model.

        The adapter goes to ``<output>/<name>/finetuned_models/``; the merged
        model and the tokenizer go to ``<output>/<name>/merged_models/``.

        Args:
            trainer: The trainer whose ``model`` holds the trained adapters.
        """
        new_model_path = f"{self.cfg.output}/{self.cfg.name}/finetuned_models/"
        trainer.model.save_pretrained(new_model_path)

        # Merge the adapters of the trained model directly. The model passed
        # to the trainer was already wrapped by ``get_peft_model``, so it must
        # not be wrapped in another ``PeftModel`` before merging.
        # NOTE(review): merging presumably modifies the trained model in
        # place - confirm before reusing ``self.model`` afterwards.
        merged_model = trainer.model.merge_and_unload()

        merged_path = f"{self.cfg.output}/{self.cfg.name}/merged_models/"
        merged_model.save_pretrained(merged_path)
        self.tokenizer.save_pretrained(merged_path)