In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig


  from .autonotebook import tqdm as notebook_tqdm


In [3]:

model_name = "deepseek-ai/deepseek-math-7b-instruct"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="auto")
# model.generation_config = GenerationConfig.from_pretrained(model_name)
# model.generation_config.pad_token_id = model.generation_config.eos_token_id

question = r"""\item \label{affine.subset} Let $V$ be a vector space. A \emph{subset} $H$ of $V$ is called \emph{affine} if there exists a subspace $W$ of $V$ and a point $\vx_0 \in H$ such that
\begin{equation} \label{eq:affine}
H = \{ \vx_0 + \vw : \vw \in W \}.
\end{equation}
We say that $H$ is an affine subset \emph{modelled} on the subspace $W$.

\textbf{Remark.} Some authors use the term ``affine subspace'' to denote an affine subset and ``linear subspace'' to denote the usual notion of subspace. We use affine subset to avoid confusion.
\begin{enumerate}[{$[$}a{$]$}]
\item Show that the point $\vx_0$ is \emph{not} unique. That is, show $H = \{ \vx'_0 + \vw : \vw \in W \}$ for any $\vx_0' \in H$.
"""

statement = r"""

We need to show that if $H = \{ x_0 + w: w \in W\}$ then $H = \{ x_0' + w: w \in W\}$ for any $x_0' \in H$. Then we have two cases. $W$ is the $0$ subspace or $W$ is a subspace with more than 1 element.
If $W$ is a $0$   """
instruction = f"There will be a quesiton and an incomplete proof. Please reason step by step to help complete the latex proof and put your final latex sentence within \\boxed{{}}. The question is {question}. The statement is: \n"
messages = [
    {"role": "user",
     "content": f"""{instruction} {statement}"""
    }]

# input_tensor = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
# outputs = model.generate(input_tensor.to(model.device), max_new_tokens=input_tensor.shape[1] + 150)

# result = tokenizer.decode(outputs[0][input_tensor.shape[1]:], skip_special_tokens=True)
# print(result)


In [4]:
from dataclasses import dataclass
from typing import List, Dict
import datasets as d
import csv
from omegaconf import MISSING


@dataclass
class DatasetConfig:
    instruction: str = MISSING
    inputPath: str = MISSING
    outputPath: str = MISSING
    cutoff: int = 7


class DatasetProcessor:
    def __init__(self, config: DatasetConfig):
        self.config = config

    def read_data(self) -> List[str]:
        with open(self.config.inputPath, "r") as file:
            data = file.read()
        return self._preprocess_data(data)

    def _preprocess_data(self, data: str) -> List[str]:
        sentences = data.split(".")
        return [
            x.strip("\n\\n").replace("\n", " ").replace("  ", " ") for x in sentences
        ]

    def create_dataset(self, data: List[str]) -> List[Dict]:
        dataset = []
        length = len(data)
        for i in range(length):
            sentence = data[i]
            if len(sentence) - 1 < self.config.cutoff:
                continue

            indexes = [x for x in range(self.config.cutoff, len(sentence) - 1)]
            input = [sentence[i:_] for _ in indexes]
            output = [sentence[_:] for _ in indexes]
            for a, b in zip(input, output):
                dataset.append(
                    {
                        "instruction": f"{self.config.instruction}",
                        "input": r"{}".format(a),
                        "output": r"{}".format(b),
                    }
                )

        return dataset

    def save_dataset(self, dataset: List[Dict]) -> None:
        with open(
            self.config.outputPath, mode="w", newline="", encoding="utf-8"
        ) as file:
            writer = csv.DictWriter(file, fieldnames=["instruction", "input", "output"])
            writer.writeheader()
            writer.writerows(dataset)


def process_dataset(config: DatasetConfig) -> None:
    processor = DatasetProcessor(config)
    data = processor.read_data()
    dataset = processor.create_dataset(data)
    processor.save_dataset(dataset)


def get_dataset(config: DatasetConfig):
    process_dataset(config)
    ds = d.load_dataset(config.outputPath[-3:], data_files=config.outputPath)
    return ds


dataCFG = {
    "instruction": instruction,
    "inputPath": "./dataset/latex.txt",
    "outputPath": "./dataset/latex.csv",
    "cutoff": 7,
}

cfg = DatasetConfig(
    instruction=dataCFG["instruction"],
    inputPath=dataCFG["inputPath"],
    outputPath=dataCFG["outputPath"],
)

d = get_dataset(cfg)


  return pd.read_csv(xopen(filepath_or_buffer, "rb", download_config=download_config), **kwargs)
Generating train split: 126195 examples [00:00, 236088.57 examples/s]


In [8]:
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, PeftModel, get_peft_model
from trl import SFTTrainer
import hydra
from hydra.core.config_store import ConfigStore
from omegaconf import OmegaConf, MISSING
from ds import get_dataset, DatasetConfig
from dataclasses import dataclass
import os
import shutil

@dataclass
class ftConfig:
    model_id: str = MISSING
    precision: str = "bfloat16"
    seed: int = MISSING
    test_size: float = MISSING
    modules_limit: int = MISSING
    r: int = MISSING
    lora_alpha: int = MISSING
    dataCFG: DatasetConfig = MISSING
    prefix_txt: str = MISSING
    per_device_train_batch_size: int = MISSING
    gradient_accumulation_steps: int = MISSING
    optim: str = MISSING
    warmup_steps: float = MISSING
    max_steps: int = MISSING
    learning_rate: float = MISSING
    logging_steps: int = MISSING
    output: str = MISSING
    name: str = MISSING
    overwrite: bool = MISSING

class ModelTrainer:
    def __init__(self, cfg: ftConfig):
        self.cfg = cfg
        self.device = self._get_device()
        self.tokenizer = None
        self.model = None

    def _get_device(self):
        if torch.backends.mps.is_available():
            return "mps"
        return torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def setup_output_directory(self):
        if os.path.exists(f"{self.cfg.output}/{self.cfg.name}/config.yaml"):
            if self.cfg.overwrite:
                shutil.rmtree(f"{self.cfg.output}/{self.cfg.name}/")
            else:
                raise FileExistsError(
                    "Output directory exists. Set overwrite to true to overwrite."
                )

        os.makedirs(f"./{self.cfg.output}/{self.cfg.name}/", exist_ok=True)
        with open(f"./{self.cfg.output}/{self.cfg.name}/config.yaml", "w") as file:
            file.write(str(self.cfg))

    def prepare_dataset(self):
        def generate_prompt(data_point, tokenizer):
            message =  [{
                "role": "user",
                "content": f"""{data_point["instruction"]} {data_point["input"]}"""
            },{
                "role": "assistant",
                "content": data_point["output"]
            }
                        ]

            prompt = tokenizer.apply_chat_template(message, tokenize=False)
            tokenized_prompt = tokenizer(prompt, return_tensors="pt")

            text = {
                'prompt': prompt,
                **tokenized_prompt
            }
            return text

        ds = get_dataset(self.cfg.dataCFG)["train"]
        ds = ds.map(lambda samples: generate_prompt(samples, self.tokenizer), batched=False)
        ds = ds.shuffle(seed=self.cfg.seed)
        ds = ds.train_test_split(test_size=self.cfg.test_size)

        return ds["train"], ds["test"]

    def setup_model(self):
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.cfg.model_id, add_eos_token=True, padding_side="left"
        )

        if self.cfg.precision == "bfloat16":
            dp = torch.bfloat16
        elif self.cfg.precision == "float32":
            dp = torch.float32
        elif self.cfg.precision == "float16":
            dp = torch.float16
        else:
            raise ValueError("Invalid precision value.")


        self.model = AutoModelForCausalLM.from_pretrained(
            self.cfg.model_id,torch_dtype=dp , device_map=self.device
        )
        print(f"Model {self.cfg.model_id} loaded successfully on {self.device} @ {dp} precision.")


        modules = self._find_all_linear_names()
        target_modules = (
            modules
            if len(modules) < self.cfg.modules_limit
            else modules[: self.cfg.modules_limit]
        )

        lora_config = LoraConfig(
            r=self.cfg.r,
            lora_alpha=self.cfg.lora_alpha,
            target_modules=target_modules,
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM",
        )

        self.model = get_peft_model(self.model, lora_config)
        return lora_config

    def _find_all_linear_names(self):
        lora_module_names = set()
        for name, module in self.model.named_modules():
            if isinstance(module, torch.nn.Linear):
                names = name.split(".")
                lora_module_names.add(names[0] if len(names) == 1 else names[-1])
        return list(lora_module_names)

    def train(self):
        self.setup_output_directory()
        lora_config = self.setup_model()
        train_data, test_data = self.prepare_dataset()

        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.tokenizer.padding_side = "right"

        trainer = SFTTrainer(
            model=self.model,
            train_dataset=train_data,
            eval_dataset=test_data,
            dataset_text_field="prompt",
            peft_config=lora_config,
            max_seq_length=250,
            args=transformers.TrainingArguments(
                per_device_train_batch_size=self.cfg.per_device_train_batch_size,
                gradient_accumulation_steps=self.cfg.gradient_accumulation_steps,
                warmup_steps=self.cfg.warmup_steps,
                max_steps=self.cfg.max_steps,
                learning_rate=self.cfg.learning_rate,
                logging_steps=self.cfg.logging_steps,
                output_dir=f"{self.cfg.output}/{self.cfg.name}/checkpoints",
                optim=self.cfg.optim,
                save_strategy="epoch",
            ),
            data_collator=transformers.DataCollatorForLanguageModeling(
                self.tokenizer, mlm=False
            ),
        )

        trainer.train()
        self.save_models(trainer)

    def save_models(self, trainer):
        new_model_path = f"{self.cfg.output}/{self.cfg.name}/finetuned_models/"
        trainer.model.save_pretrained(new_model_path)

        merged_model = PeftModel.from_pretrained(self.model, new_model_path)
        merged_model = merged_model.merge_and_unload()

        merged_path = f"{self.cfg.output}/{self.cfg.name}/merged_models/"
        merged_model.save_pretrained(merged_path)
        self.tokenizer.save_pretrained(merged_path)

In [11]:
deepseek_cfg = {
    "model_id": "google/gemma-2b-it",
    "seed": 42,
    "test_size": 0.1,
    "modules_limit": 10,
    "r": 1,
    "lora_alpha": 1,
    "dataCFG": cfg,
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 1,
    "optim": "adamw_torch",
    "warmup_steps": 0.1,
    "max_steps": 100,
    "learning_rate": 1e-4,
    "logging_steps": 10,
    "output": "./output",
    "name": "gemma-2b",
    "overwrite": True,
}
deepseek_cfg = ftConfig(
  **deepseek_cfg
)

In [12]:
trainer = ModelTrainer(deepseek_cfg)
trainer.train()

Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.67s/it]


Model google/gemma-2b-it loaded successfully on mps @ torch.bfloat16 precision.


  return pd.read_csv(xopen(filepath_or_buffer, "rb", download_config=download_config), **kwargs)
Generating train split: 126195 examples [00:00, 190335.09 examples/s]
Map: 100%|██████████| 126195/126195 [00:34<00:00, 3685.34 examples/s]
Map: 100%|██████████| 113575/113575 [00:07<00:00, 14575.30 examples/s]
Map: 100%|██████████| 12620/12620 [00:00<00:00, 14154.54 examples/s]
 10%|█         | 10/100 [00:09<01:09,  1.29it/s]

{'loss': 3.475, 'grad_norm': 2.0625, 'learning_rate': 9.009009009009008e-05, 'epoch': 0.0}


 20%|██        | 20/100 [00:17<01:00,  1.33it/s]

{'loss': 2.8328, 'grad_norm': 3.375, 'learning_rate': 8.008008008008008e-05, 'epoch': 0.0}


 30%|███       | 30/100 [00:24<00:52,  1.34it/s]

{'loss': 2.0992, 'grad_norm': 4.34375, 'learning_rate': 7.007007007007007e-05, 'epoch': 0.0}


 40%|████      | 40/100 [00:32<00:45,  1.33it/s]

{'loss': 1.5539, 'grad_norm': 5.0625, 'learning_rate': 6.0060060060060066e-05, 'epoch': 0.0}


 50%|█████     | 50/100 [00:39<00:37,  1.32it/s]

{'loss': 1.1508, 'grad_norm': 1.6328125, 'learning_rate': 5.0050050050050046e-05, 'epoch': 0.0}


 60%|██████    | 60/100 [00:47<00:30,  1.33it/s]

{'loss': 0.8266, 'grad_norm': 1.625, 'learning_rate': 4.004004004004004e-05, 'epoch': 0.0}


 70%|███████   | 70/100 [00:54<00:22,  1.33it/s]

{'loss': 0.5766, 'grad_norm': 1.09375, 'learning_rate': 3.0030030030030033e-05, 'epoch': 0.0}


 80%|████████  | 80/100 [01:02<00:15,  1.32it/s]

{'loss': 0.423, 'grad_norm': 0.921875, 'learning_rate': 2.002002002002002e-05, 'epoch': 0.0}


 90%|█████████ | 90/100 [01:10<00:07,  1.33it/s]

{'loss': 0.3391, 'grad_norm': 1.484375, 'learning_rate': 1.001001001001001e-05, 'epoch': 0.0}




{'loss': 0.3146, 'grad_norm': 1.6015625, 'learning_rate': 0.0, 'epoch': 0.0}


100%|██████████| 100/100 [01:18<00:00,  1.27it/s]


{'train_runtime': 78.5995, 'train_samples_per_second': 1.272, 'train_steps_per_second': 1.272, 'train_loss': 1.35916015625, 'epoch': 0.0}


In [13]:

@dataclass
class InferenceConfig:
    path: str = MISSING

class ModelManager:
    def __init__(self, cfg: ftConfig):
        self.device = self._get_device()
        self.model, self.tokenizer = self.load_model(cfg)
        self.device = self._get_device()
        self.instruction = cfg.dataCFG.instruction

    @staticmethod
    def _get_device():
        if torch.cuda.is_available():
            return "cuda"
        if torch.backends.mps.is_available():
            return "mps"
        return "cpu"

    def load_model(self, cfg: InferenceConfig):
        model = AutoModelForCausalLM.from_pretrained(cfg.model_id,
                                                     torch_dtype=torch.bfloat16,
                                                     device_map=self.device)
        merged_model = PeftModel.from_pretrained(
            model, f"{cfg.output}/{cfg.name}/finetuned_models/"
        )
        merged_model = merged_model.merge_and_unload()
        tokenizer = AutoTokenizer.from_pretrained(
            cfg.model_id, add_eos_token=True, padding_side="left"
        )
        return merged_model, tokenizer

    def generate_prompt(self, input: str) -> str:
        message = [{
            "role": "user",
            "content": f"{self.instruction} {input}"
        }]
        print(message)

        return self.tokenizer.apply_chat_template(message, add_generation_prompt=True, return_tensors="pt")

    def __call__(self, input: str) -> str:
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.tokenizer.padding_side = "left"

        input_tensor = self.generate_prompt(input)
        print(input_tensor)
        outputs = self.model.generate(
            input_tensor.to(self.device),
            max_new_tokens=100,
            do_sample=True,
            pad_token_id=self.tokenizer.eos_token_id
        )

        text = self.tokenizer.decode(outputs[0][input_tensor.shape[1]:], skip_special_tokens=True)
        return text

def setup_config_store():
    cs = ConfigStore.instance()
    cs.store(name="inference_config", node=InferenceConfig)


model_manager = ModelManager(deepseek_cfg)


Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.86s/it]


In [14]:

prompt_input = r"""Pick any point $s$ in $H$ since it is not empty. Then we have to show that the set $S = \{h - s: h \in H \}$ is a linear subspace. Then $0 \in S$ since when $h = s $. For scalar multiplication we have to show that if $h - s \in S$ then $c(h-s) \in S$ where $c \in F$. From the CL property we have $(1-c)s + ch \in F$. Thus we have $b := s + c(h-s) \in H$. Thus we have $b - s \in S$ by definition of $S$ and thus $c(h-s) \in S$. To show scalar addition if we have $u = h - s, v = h' - s \in S$ then $u + v \in S$. Adding both we have $u + v = h + h' - 2s$. Thus to show that this is in $S$ we need to show that $h + h' - s \in H$. From CL and the fact that $char F \neq 2$ then that means $(1-2)s + 2h = 2h -s\in H$ and $(1-2)s + 2h' = 2h' - s\in H$. Thus since $2 \in F \ """
print(model_manager(prompt_input))

[{'role': 'user', 'content': "There will be a quesiton and an incomplete proof. Please reason step by step to help complete the latex proof and put your final latex sentence within \\boxed{}. The question is \\item \\label{affine.subset} Let $V$ be a vector space. A \\emph{subset} $H$ of $V$ is called \\emph{affine} if there exists a subspace $W$ of $V$ and a point $\\vx_0 \\in H$ such that\n\\begin{equation} \\label{eq:affine}\nH = \\{ \\vx_0 + \\vw : \\vw \\in W \\}.\n\\end{equation}\nWe say that $H$ is an affine subset \\emph{modelled} on the subspace $W$.\n\n\\textbf{Remark.} Some authors use the term ``affine subspace'' to denote an affine subset and ``linear subspace'' to denote the usual notion of subspace. We use affine subset to avoid confusion.\n\\begin{enumerate}[{$[$}a{$]$}]\n\\item Show that the point $\\vx_0$ is \\emph{not} unique. That is, show $H = \\{ \\vx'_0 + \\vw : \\vw \\in W \\}$ for any $\\vx_0' \\in H$.\n. The statement is: \n Pick any point $s$ in $H$ since it 

In [7]:

prompt_input = r"""Pick any point $s$ in $H$ since it is not empty.
Then we have to show that the set $S = \{h - s: h \in H \}$ is a linear subspace.
Then $0 \in S$ since when $h = s $. For scalar multiplication we have to show that if
$h - s \in S$ then $c(h-s) \in S$ where $c \in F$. From the CL property we have $(1-c)s +
ch \in F$. Thus we have $b := s + c(h-s) \in H$. Thus we have $b - s \in S$ by definition of $S$
and thus $c(h-s) \in S$. To show scalar addition if we have $u = h - s, v = h' - s \in S$ then
$u + v \in S$. Adding both we have $u + v = h + h' - 2s$. Thus to show that this is in $S$
we need to show that $h + h' - s \in H$. From CL and the fact that $char F \neq 2$
then that means $(1-2)s + 2h = 2h -s\in H$ and $(1-2)s + 2h' = 2h' - s\in H$.
Thus since $2 \in F \ """

print(model_manager(prompt_input))

[{'role': 'user', 'content': "There will be a quesiton and an incomplete proof. Please reason step by step to help complete the latex proof and put your final latex sentence within \\boxed{}. The question is \\item \\label{affine.subset} Let $V$ be a vector space. A \\emph{subset} $H$ of $V$ is called \\emph{affine} if there exists a subspace $W$ of $V$ and a point $\\vx_0 \\in H$ such that\n\\begin{equation} \\label{eq:affine}\nH = \\{ \\vx_0 + \\vw : \\vw \\in W \\}.\n\\end{equation}\nWe say that $H$ is an affine subset \\emph{modelled} on the subspace $W$.\n\n\\textbf{Remark.} Some authors use the term ``affine subspace'' to denote an affine subset and ``linear subspace'' to denote the usual notion of subspace. We use affine subset to avoid confusion.\n\\begin{enumerate}[{$[$}a{$]$}]\n\\item Show that the point $\\vx_0$ is \\emph{not} unique. That is, show $H = \\{ \\vx'_0 + \\vw : \\vw \\in W \\}$ for any $\\vx_0' \\in H$.\n. The statement is: \n Pick any point $s$ in $H$ since it 

In [16]:
question = r"""\item \label{affine.subset} Let $V$ be a vector space. A \emph{subset} $H$ of $V$ is called \emph{affine} if there exists a subspace $W$ of $V$ and a point $\vx_0 \in H$ such that
\begin{equation} \label{eq:affine}
H = \{ \vx_0 + \vw : \vw \in W \}.
\end{equation}
We say that $H$ is an affine subset \emph{modelled} on the subspace $W$.

\textbf{Remark.} Some authors use the term ``affine subspace'' to denote an affine subset and ``linear subspace'' to denote the usual notion of subspace. We use affine subset to avoid confusion.
\begin{enumerate}[{$[$}a{$]$}]
\item Show that the point $\vx_0$ is \emph{not} unique. That is, show $H = \{ \vx'_0 + \vw : \vw \in W \}$ for any $\vx_0' \in H$.
"""

statement = r""" We need to show that if $H = \{ x_0 + w: w \in W\}$ then $H = \{ x_0' + w: w \in W\}$ for any $x_0' \in H$. Then we have two cases. $W$ is the $0$ subspace or $W$ is a subspace with more than 1 element.
If $W$ is a $0$   """
instruction = f"There will be a quesiton and an incomplete proof. Please reason step by step to help complete the latex proof and put your final latex sentence within \\boxed{{}}. The question is {question}. The statement is: \n"
messages = [
    {"role": "user",
     "content": f"""{instruction} {statement}"""
    }]

input_tensor = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
outputs = model.generate(input_tensor.to(model.device), max_new_tokens=1000)

result = tokenizer.decode(outputs[0][input_tensor.shape[1]:], skip_special_tokens=True)
print(result)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.


 The statement is: 
  We need to show that if $H = \{ x_0 + w: w \in W\}$ then $H = \{ x_0' + w: w \in W\}$ for any $x_0' \in H$. Then we have two cases. $W$ is the $0$ subspace or $W$ is a subspace with more than 1 element.
If $W$ is a $0$ subspace, then $H = \{ x_0 + 0: x_0 \in V\} = V$. In this case, any $x_0' \in H$ satisfies $H = \{ x_0' + w: w \in W\}$.
If $W$ is a subspace with more than 1 element, then for any $x_0' \in H$, we have $x_0' = x_0 + w$ for some $w \in W$. Then $H = \{ x_0 + w: w \in W\} = \{ x_0' + (w - w): w \in W\} = \{ x_0' + w: w \in W\}$.
Therefore, the point $x_0$ is not unique.

The answer is $\boxed{True}$.


In [14]:
result = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(result)


User: There will be a quesiton and an incomplete proof. Please reason step by step to help complete the latex proof and put your final latex sentence within \boxed{}. The question is \item \label{affine.subset} Let $V$ be a vector space. A \emph{subset} $H$ of $V$ is called \emph{affine} if there exists a subspace $W$ of $V$ and a point $\vx_0 \in H$ such that
\begin{equation} \label{eq:affine}
H = \{ \vx_0 + \vw : \vw \in W \}.
\end{equation}
We say that $H$ is an affine subset \emph{modelled} on the subspace $W$.

\textbf{Remark.} Some authors use the term ``affine subspace'' to denote an affine subset and ``linear subspace'' to denote the usual notion of subspace. We use affine subset to avoid confusion.
\begin{enumerate}[{$[$}a{$]$}]
\item Show that the point $\vx_0$ is \emph{not} unique. That is, show $H = \{ \vx'_0 + \vw : \vw \in W \}$ for any $\vx_0' \in H$.
. The statement is: 
  We need to show that if $H = \{ x_0 + w: w \in W\}$ then $H = \{ x_0' + w: w \in W\}$ for any $x_0

In [15]:
print(input_tensor.shape)

torch.Size([1, 373])
