In [None]:
# 套件安裝
!pip install opencc opencc-python-reimplemented datasets ipywidgets IProgress  openpyxl pandas

In [13]:
# 連線資料
!rm factory_data
!rm factory_saves
!ln -s /storage/factory_data .
!ln -s /storage/factory_saves .

rm: cannot remove 'factory_saves': No such file or directory


In [None]:
# Download the model snapshot from the Hugging Face Hub into a local directory.
import os

from huggingface_hub import snapshot_download

model_id = "INX-TEXT/Bailong-instruct-7B"  # Hugging Face model repository name

# Read the access token from the environment instead of writing it into the notebook,
# so a real token is never saved (or committed) together with the file.
# Set it before starting the kernel, e.g. `export HF_TOKEN=hf_...`.
# When HF_TOKEN is unset, None is passed and huggingface_hub falls back to its own
# default authentication handling.
hf_token = os.environ.get("HF_TOKEN")

snapshot_download(
    repo_id=model_id,
    local_dir="INX-TEXT_Bailong-instruct-7B",
    local_dir_use_symlinks=False,
    revision="main",
    use_auth_token=hf_token)

In [3]:
# Build the CP dataset file (data_cp.json)
import json
from datasets import load_dataset

# Hugging Face dataset, example 01
dataset_dict = load_dataset(
    "AWeirdDev/zh-tw-articles-2k",
    cache_dir="cache",  # dedicated cache directory, easy to clean up
    streaming=True,  # stream records so the whole dataset is not downloaded to disk
)
dataset = dataset_dict["train"]

# Convert: pairing with range(1000) caps the output at the first 1000 streamed records;
# only the `content` field is kept, stored under the key "text".
extracted_dataset = [
    {"text": sample["content"]}
    for _, sample in zip(range(1000), dataset)
]

# Write the records to disk; ensure_ascii=False keeps the Chinese text readable.
with open("data_cp.json", "wt", encoding="UTF-8") as fp:
    json.dump(extracted_dataset, fp, ensure_ascii=False, indent=4)

In [4]:
# Build the FT dataset file (data_ft.json)
import json
import opencc
from datasets import load_dataset

# OpenCC profiles — s2t: Simplified to Traditional, s2twp: Simplified to Taiwan Traditional
op_cc = opencc.OpenCC('s2twp')

# Hugging Face dataset, example 02
dataset_dict = load_dataset(
    "ticoAg/Chinese-medical-dialogue",
    cache_dir="cache",  # dedicated cache directory, easy to clean up
    streaming=True,  # stream records so the whole dataset is not downloaded to disk
)
dataset = dataset_dict["train"]

# The same system prompt is attached to every record.
system_prompt = "You are a helpful AI assistant built by NCHC. The user you are helping speaks Traditional Chinese and comes from Taiwan."

# Convert: cap at the first 1000 streamed records and run each text field
# through the OpenCC converter.
extracted_dataset = [
    {
        "instruction": op_cc.convert(sample["instruction"]),
        "input": op_cc.convert(sample["input"]),
        "output": op_cc.convert(sample["output"]),
        "system": system_prompt,
    }
    for _, sample in zip(range(1000), dataset)
]

# Write the records to disk; ensure_ascii=False keeps the Chinese text readable.
with open("data_ft.json", "wt", encoding="UTF-8") as fp:
    json.dump(extracted_dataset, fp, ensure_ascii=False, indent=4)


Downloading readme: 100%|██████████| 2.37k/2.37k [00:00<00:00, 14.2MB/s]


In [6]:
# Build the RLHF dataset file (data_rlhf.json)
import json
from datasets import load_dataset

# Note: no OpenCC conversion is applied in this cell; the text is written as loaded.

# Hugging Face dataset, example 03
dataset_dict = load_dataset(
    "LawChat-tw/RLHF_data",
    cache_dir="cache",  # dedicated cache directory, easy to clean up
    streaming=True,  # stream records so the whole dataset is not downloaded to disk
)
dataset = dataset_dict["train"]

# The same system prompt is attached to every record.
system_prompt = "You are an AI assistant. You will be given a task. You must generate a detailed and long answer."

# Convert: cap at the first 1000 streamed records. "answer" is a two-element list,
# with the `chosen` text first and the `rejected` text second.
extracted_dataset = [
    {
        "question": sample["prompt"],
        "answer": [sample["chosen"], sample["rejected"]],
        "system": system_prompt,
    }
    for _, sample in zip(range(1000), dataset)
]

# Write the records to disk; ensure_ascii=False keeps the Chinese text readable.
with open("data_rlhf.json", "wt", encoding="UTF-8") as fp:
    json.dump(extracted_dataset, fp, ensure_ascii=False, indent=4)



Downloading readme: 100%|██████████| 512/512 [00:00<00:00, 3.98MB/s]


In [7]:
# Convert an Excel sheet into an instruction-tuning JSON file (data_excel.json)
import json
import openpyxl

# Excel source workbook and sheet
input_file = "data/14-SFT_MedQA2019.xlsx"
wb = openpyxl.load_workbook(input_file)
sheet = wb["DrugQA"]

# Convert: start at row 2 (presumably row 1 is a header — confirm against the workbook)
# and read the first four columns; column 1 is the instruction, column 4 the output.
extracted_dataset = []
for row in sheet.iter_rows(min_row=2, max_col=4, values_only=True):
    instruction, output = row[0], row[3]

    # With values_only=True an empty cell is returned as None. Skip rows that lack
    # either field (e.g. blank trailing rows) so no null records are written.
    if instruction is None or output is None:
        continue

    extracted_data = {
        "instruction": instruction,
        "input":  "",
        "output": output,
        "system":  "You are a helpful AI assistant built by NCHC. The user you are helping speaks Traditional Chinese and comes from Taiwan."
    }

    extracted_dataset.append(extracted_data)

# Write the records to disk; ensure_ascii=False keeps the Chinese text readable.
with open("data_excel.json", "wt", encoding="UTF-8") as fp:
    json.dump(extracted_dataset, fp, ensure_ascii=False, indent=4)


In [8]:
# Convert a JSON file into an instruction-tuning JSON file (data_json.json)
import pandas as pd
import json

# JSON source file
input_file = "data/15-SFT_baike.json"
data = pd.read_json(input_file)
df = pd.DataFrame(data)  # wrap the loaded data in a DataFrame

# The same system prompt is attached to every record.
system_prompt = "You are a helpful AI assistant built by NCHC. The user you are helping speaks Traditional Chinese and comes from Taiwan."

# Convert: one output record per DataFrame row, copying the three text columns.
extracted_dataset = [
    {
        "instruction": record["instruction"],
        "input": record["input"],
        "output": record["output"],
        "system": system_prompt,
    }
    for _, record in df.iterrows()
]

# Write the records to disk; ensure_ascii=False keeps the Chinese text readable.
with open("data_json.json", "wt", encoding="UTF-8") as fp:
    json.dump(extracted_dataset, fp, ensure_ascii=False, indent=4)



In [9]:
# Build the identity dataset file (data_identity.json)
import opencc
import pandas as pd
import json

# OpenCC profiles — s2t: Simplified to Traditional, s2twp: Simplified to Taiwan Traditional
op_cc = opencc.OpenCC('s2twp')

# Values substituted for the "NAME" and "AUTHOR" placeholders in each output text
NAME = "c00cjz00"
AUTHOR = "國網中心"
with open("data/16-SFT_identity.json", "r", encoding="utf-8") as f:
    dataset = json.load(f)

# Only the "output" field of each sample is rewritten.
for sample in dataset:
    sample["output"] = sample["output"].replace("NAME", NAME).replace("AUTHOR", AUTHOR)

# Intermediate file with the placeholders filled in
with open("identity.json", "w", encoding="utf-8") as f:
    json.dump(dataset, f, indent=2, ensure_ascii=False)


# Reload the intermediate JSON file
input_file = "identity.json"
data = pd.read_json(input_file)
df = pd.DataFrame(data)  # wrap the loaded data in a DataFrame

# Convert: run every text field through the OpenCC converter.
extracted_dataset = []
for index, row in df.iterrows():
    # These names deliberately avoid `input`: assigning to `input` at cell level
    # would shadow the builtin input() for the rest of the kernel session.
    instruction_text = op_cc.convert(row['instruction'])
    input_text = op_cc.convert(row['input'])
    output_text = op_cc.convert(row['output'])
    extracted_data = {
        "instruction": instruction_text,
        "input":  input_text,
        "output": output_text
    }
    extracted_dataset.append(extracted_data)

# Write the records to disk; ensure_ascii=False keeps the Chinese text readable.
with open("data_identity.json", "wt", encoding="UTF-8") as fp:
    json.dump(extracted_dataset, fp, ensure_ascii=False, indent=4)



In [11]:
# Convert a parquet file into an instruction-tuning JSON file (data_parquet.json)
import pandas as pd
import json

# Parquet source file, read with the pyarrow engine
input_file = "data/17-SFT_train.parquet"
data = pd.read_parquet(input_file, engine='pyarrow')

df = pd.DataFrame(data)  # wrap the loaded data in a DataFrame

# The same system prompt is attached to every record.
system_prompt = "You are a helpful AI assistant built by NCHC. The user you are helping speaks Traditional Chinese and comes from Taiwan."

# Convert: `prompt` becomes the instruction, `response` the output;
# "input" is always an empty string.
extracted_dataset = [
    {
        "instruction": record["prompt"],
        "input": "",
        "output": record["response"],
        "system": system_prompt,
    }
    for _, record in df.iterrows()
]

# Write the records to disk; ensure_ascii=False keeps the Chinese text readable.
with open("data_parquet.json", "wt", encoding="UTF-8") as fp:
    json.dump(extracted_dataset, fp, ensure_ascii=False, indent=4)

