In [2]:
import itertools
import jsonlines
from datasets import load_dataset
from pprint import pprint
from lamini import Lamini
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
# from transformers import AutoModelForSeq2SeqLM

In [3]:
instruction_tuned_dataset = load_dataset("tatsu-lab/alpaca", split="train", streaming=True)

In [4]:
m = 5
print("指令微调数据集是：")
top_m = list(itertools.islice(instruction_tuned_dataset, m))
for j in top_m:
    print(j)

指令微调数据集是：
{'instruction': 'Give three tips for staying healthy.', 'input': '', 'output': '1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.', 'text': 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nGive three tips for staying healthy.\n\n### Response:\n1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.'}
{'instruction': 'What are the three primary colors?', 'input': '', 'output': 'The three primary colors are red, blue, and yellow.', 'text': 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nWhat are the three primary colors?\n\

In [5]:
# 定义两个字符串模板：一个用于有输入字段的数据点，另一个用于没有输入字段的数据点
prompt_template_with_input = """下面是一条描述任务的指令，辅以一个提供进一步上下文的输入。请编写一个能合理完成请求的响应。

### Instruction:
{instruction}

### Input:
{input}

### Response:"""

prompt_template_without_input = """下面是一条描述任务的指令。编写一个能合理完成请求的响应。

### Instruction:
{instruction}

### Reponse:"""

In [6]:
# 初始化一个空的列表，用于存放处理后的数据
processed_data = []

# 循环遍历top_m列表中的每一个元素j（你没有给出top_m的定义，我假设它是一个包含多个数据点的列表）
for j in top_m:
  # 判断当前元素j的“input”字段是否为空或不存在
  if not j["input"]:
    # 如果“input”字段为空或不存在，则使用没有输入字段的模板，用j中的“instruction”字段填充
    processed_prompt = prompt_template_without_input.format(instruction=j["instruction"])
  else:
    # 如果“input”字段存在且非空，则使用有输入字段的模板，用j中的“instruction”和“input”字段填充
    processed_prompt = prompt_template_with_input.format(instruction=j["instruction"], input=j["input"])

  # 创建一个新的字典，其中“input”字段是处理后的提示，而“output”字段是j中的“output”字段，并将其添加到processed_data列表中
  processed_data.append({"input": processed_prompt, "output": j["output"]})

In [7]:
pprint(processed_data[1])

{'input': '下面是一条描述任务的指令。编写一个能合理完成请求的响应。\n'
          '\n'
          '### Instruction:\n'
          'What are the three primary colors?\n'
          '\n'
          '### Reponse:',
 'output': 'The three primary colors are red, blue, and yellow.'}


In [8]:
# 以写入模式打开一个名为'alpaca_processed.jsonl'的jsonl文件
with jsonlines.open(f'data/alpaca_processed.jsonl', 'w') as writer:
    # 使用writer的write_all方法将processed_data列表的所有元素写入到jsonl文件中
    writer.write_all(processed_data)

### 模型比较

In [9]:
dataset_path_hf = "lamini/alpaca"
dataset_hf = load_dataset(dataset_path_hf)
print(dataset_hf)

DatasetDict({
    train: Dataset({
        features: ['input', 'output'],
        num_rows: 52002
    })
})


In [10]:
import lamini
lamini.api_key = "95b701553ffa985833e95cff20d4606ca194adb5614624ce9e9a49ec4e980929"
non_instruct_model = Lamini("meta-llama/Llama-2-7b-hf")
non_instruct_output = non_instruct_model.generate("告诉我如何训练狗狗坐下")
print("Not instruction-tuned output (Llama 2 Base):", non_instruct_output) # 打印响应

Not instruction-tuned output (Llama 2 Base): 来

我的狗狗坐下来，但是它很难坐下来。

我想要它坐下来，但是它很难坐下来。

我想要它坐下来，但是它很难坐下来。

我想要它坐下来，但是它很难坐下来。

我想要它坐下来，但是它很难坐下来。

我想要它坐下来，但是它很难坐下来。

我想要它坐下来，但是它很难坐下来。

我想要它坐下来，但是它很难坐下来。

我想要它坐下来，但是它很难坐下来。

我想要它坐下来，但是它很难坐下来。

我想要它坐下来，但是它很难坐下来。

我想要它坐下来，但是它很难坐下来。

我想要它坐下来，但是它很难坐下来。

我想要它坐下来，但是它很难坐下来。

我想要它坐下来，但是它很难坐下来。

我想要它坐下来，但是它很难坐下来。

我想要它坐下来，但是它很难坐下来。

我想要它坐下来，但是它很难坐下来。

我想要它坐下来，但是它很难坐下来。

我想要它坐下来，但是它很难坐下来。

我想要它坐下来，但是它很难坐下来。

我想要它坐下来，但是它很难坐下来。

我想要它坐下来，但是它很难坐下来。

我想要它坐下来，但是它很难坐下来。

我想要它坐下来，但是它很难坐下来。

我想要它坐下来，但是它很难坐下来。

我想要它坐下来，但是它很难坐下来。

我想要它坐下来，但是它很难坐下来。

我想要它坐下来，但是它很难坐下来。

我想要它坐下来，但是它很难坐下来。

我想要它坐下来，但是它很难坐下来。

我想要它坐下来，但是它很难坐下来。

我想要它坐下来，但是它很难坐下来。

我想要它坐下来，但是它很难坐下来。

我想要它坐下来，但是它很难坐下来。

我想要它坐下来，但是它很难坐下来。

我想要它坐下来，但是它很难坐下来。

我想要它坐下来，但是它很难坐下来。

我想要它坐下来，但是它很难坐下来。

我想要它坐下来，但是它很难坐下来。

我想要它坐下来，但是它很难坐下来。

我想要它坐下来，但是它很难


In [11]:
# 使用该模型生成关于如何训练狗坐下的响应
instruct_model = Lamini("meta-llama/Meta-Llama-3-8B-Instruct")
instruct_output = instruct_model.generate("告诉我如何训练狗狗坐下")
print("instruction-tuned output (Llama 3 Base):", instruct_output) # 打印响应

instruction-tuned output (Llama 3 Base): 
How to Train a Dog to Sit
Training a dog to sit is one of the most basic and essential commands you can teach your furry friend. It's a great way to establish communication and build trust between you and your dog. Here's a step-by-step guide on how to train a dog to sit:

**Step 1: Choose a Quiet and Distraction-Free Area**
Find a quiet area with minimal distractions where your dog can focus on you. Make sure your dog is comfortable and not feeling anxious or stressed.

**Step 2: Have Treats Ready**
Choose your dog's favorite treats and have them ready to use as rewards. You'll need small, tasty treats that your dog will love.

**Step 3: Stand in Front of Your Dog**
Stand in front of your dog and hold a treat close to their nose. Make sure your dog is looking at the treat and not at you.

**Step 4: Move the Treat Up and Back**
Slowly move the treat up and back, towards your dog's tail, while saying "sit" in a calm and clear voice. As you move 

In [2]:
# !pip install transformers==4.32.1

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple, https://pypi.tuna.tsinghua.edu.cn/simple
Collecting transformers==4.32.1
  Using cached https://pypi.tuna.tsinghua.edu.cn/packages/83/8d/f65f8138365462ace54458a9e164f4b28ce1141361970190eef36bdef986/transformers-4.32.1-py3-none-any.whl (7.5 MB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.32.1)
  Using cached https://pypi.tuna.tsinghua.edu.cn/packages/89/bd/cc9c60a26b19ba012b141cba39c1d425994c78eb2458595caab860d7fa66/tokenizers-0.13.3-cp310-cp310-macosx_10_11_x86_64.whl (4.0 MB)
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.20.3
    Uninstalling tokenizers-0.20.3:
      Successfully uninstalled tokenizers-0.20.3
  Attempting uninstall: transformers
    Found existing installation: transformers 4.46.3
    Uninstalling transformers-4.46.3:
      Successfully uninstalled transformers-4.46.3
[31mERROR: pip's dependenc

In [3]:
# 导入必要的模块
tokenizer = AutoTokenizer.from_pretrained(
  "EleutherAI/pythia-70m-deduped",
  revision="step3000",
  cache_dir="./pythia-70m-deduped/step3000",
)



In [8]:

# import transformers
# transformers.logging.set_verbosity_debug()
import torch
from transformers import AutoModelForCausalLM

# !python -m pip install --upgrade pip
# !python -m pip install --upgrade Pillow


#low_cpu_mem_usage=True
# !pip install accelerate
#CPU只能32，dtype=torch.float32而不是16，"LayerNormKernelImpl" not implemented for 'Half'
model = AutoModelForCausalLM.from_pretrained("EleutherAI/pythia-70m",device_map='balanced',cache_dir='./cache/',torch_dtype=torch.float32)



In [9]:
# 定义推理函数
def inference(text, model, tokenizer, max_input_tokens=1000, max_output_tokens=100):
  # Tokenize：将输入文本转换为Token IDs
  input_ids = tokenizer.encode(
          text,
          return_tensors="pt",  # 返回PyTorch张量
          truncation=True,  # 如果文本太长，进行截断
          max_length=max_input_tokens  # 输入文本的最大长度
  )

  # Generate：使用模型生成输出
  device = model.device  # 获取模型所在的设备（CPU或GPU）
  generated_tokens_with_prompt = model.generate(
    input_ids=input_ids.to(device),  # 将输入数据移到相同的设备
    max_length=max_output_tokens  # 输出的最大长度
  )

  # Decode：将生成的Token IDs解码回文本
  generated_text_with_prompt = tokenizer.batch_decode(generated_tokens_with_prompt, skip_special_tokens=True)

  # Strip the prompt：移除输出中的输入文本，以得到纯粹的回应
  generated_text_answer = generated_text_with_prompt[0][len(text):]

  return generated_text_answer  # 返回生成的文本

In [5]:
finetuning_dataset_path="lamini/lamini_docs"
finetuning_dataset = load_dataset(finetuning_dataset_path)
print(finetuning_dataset)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1260
    })
    test: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 140
    })
})


In [6]:
test_sample = finetuning_dataset["test"][0]
print(test_sample)

{'question': 'Can Lamini generate technical documentation or user manuals for software projects?', 'answer': 'Yes, Lamini can generate technical documentation and user manuals for software projects. It uses natural language generation techniques to create clear and concise documentation that is easy to understand for both technical and non-technical users. This can save developers a significant amount of time and effort in creating documentation, allowing them to focus on other aspects of their projects.', 'input_ids': [5804, 418, 4988, 74, 6635, 7681, 10097, 390, 2608, 11595, 84, 323, 3694, 6493, 32, 4374, 13, 418, 4988, 74, 476, 6635, 7681, 10097, 285, 2608, 11595, 84, 323, 3694, 6493, 15, 733, 4648, 3626, 3448, 5978, 5609, 281, 2794, 2590, 285, 44003, 10097, 326, 310, 3477, 281, 2096, 323, 1097, 7681, 285, 1327, 14, 48746, 4212, 15, 831, 476, 5321, 12259, 247, 1534, 2408, 273, 673, 285, 3434, 275, 6153, 10097, 13, 6941, 731, 281, 2770, 327, 643, 7794, 273, 616, 6493, 15], 'attention

In [10]:
print(inference(test_sample["question"], model, tokenizer))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.




I have a question about the following:

How do I get the correct documentation to work?

A:

I think you need to use the following code:

A:

You can use the following code to get the correct documentation.

A:

You can use the following code to get the correct documentation.

A:

You can use the following


In [11]:
# 加载微调后的模型
instruction_model = AutoModelForCausalLM.from_pretrained("lamini/lamini_docs_finetuned")
# 使用微调后的模型进行推理
print(inference(test_sample["question"], instruction_model, tokenizer))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Yes, Lamini can generate technical documentation or user manuals for software projects. This can be achieved by providing a prompt for a specific technical question or question to the LLM Engine, or by providing a prompt for a specific technical question or question. Additionally, Lamini can be trained on specific technical questions or questions to help users understand the process and provide feedback to the LLM Engine. Additionally, Lamini
