<a href="https://colab.research.google.com/github/bibekyess/dacon-hansol-deco-challenge/blob/main/Orion_QLoRA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import

#### If you are running in a local environment, please create a virtual environment with Python 3.10 and torch 2.1.0 (compiled with CUDA 11+, as the `flash_attn` package requires CUDA 11+).

In [1]:
!python --version

Python 3.10.12


In [2]:
import torch
torch.__version__, torch.cuda.is_available()

('2.1.0+cu121', True)

In [3]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [4]:
!pip install -qU bitsandbytes wandb peft tqdm sentence-transformers pandas accelerate flash_attn

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.0/105.0 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m83.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.9/190.9 kB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m156.5/156.5 kB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m91.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m32.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m95.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m195.4/195.4 kB[0m [31m17.

In [5]:
import pandas as pd
import numpy as np
import torch
import transformers
import bitsandbytes as bnb
import os
import wandb

from transformers import PreTrainedTokenizerFast, AdamW, AutoModelForCausalLM, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from tqdm import tqdm


## Data Preprocessing

In [6]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
%cd /content/drive/MyDrive/DACON_wallpaper_contest_data

/content/drive/MyDrive/DACON_wallpaper_contest_data


In [8]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation.utils import GenerationConfig

In [26]:
# Checkpoints tried during experimentation. Only the selection at the bottom
# is active; move a different id there to switch models.
#   "OrionStarAI/Orion-14B-Chat"
#   "AIFT/AIFT-instruct-dpo-v1.3-42dot_LLM-SFT-1.3B"
#   "OrionStarAI/Orion-14B-Chat-Int4"
#   "upstage/SOLAR-10.7B-Instruct-v1.0"
# NOTE(review): the recorded outputs further down show an Orion model and the
# inference cells call `model.chat(...)`; confirm that the active checkpoint
# below is really the one intended for a full re-run.
model_id = "mistralai/Mistral-7B-Instruct-v0.2"

In [27]:
# Load the tokenizer published with `model_id`. `use_fast=False` requests the
# slow tokenizer, `trust_remote_code=True` permits tokenizer code bundled with
# the checkpoint, and the EOS token is forced to '</s>'.
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False, trust_remote_code=True, eos_token='</s>')
# EOS doubles as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# Pad on the right-hand side of each sequence.
tokenizer.padding_side = "right"

tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

### Designing Prompt Template:
An interesting finding is that the prompt template underlying each tokenizer can be inspected directly with `tokenizer.apply_chat_template`.

In [30]:
# Idea: adding custom tokens such as <보고서> and <질문> may help make the model
# more specific to this task.

# Instruction text placed in the user turn. It has two placeholders, {CONTEXT}
# and {QUESTION}, wrapped in <보고서>...</보고서> and <질문>...</질문> tags.
# Rough English gloss: "Write accurate and useful answers to customer
# inquiries about wallpaper; answer the <question> based only on the <report>."
INSTRUCTION_PROMPT_TEMPLATE = """\
벽지에 대한 고객 문의에 정확하고 유용한 답변을 작성한다. <질문>의 의도를 파악하여 정확하게 <보고서>만을 기반으로 답변하세요.
<보고서>
{CONTEXT}
</보고서>
지침사항을 반드시 지키고, <보고서>를 기반으로 <질문>에 답변하세요.
<질문>
{QUESTION}
</질문>
"""

# Template for the assistant turn; it is just the answer text.
# NOTE(review): not referenced by any of the visible cells — confirm it is still needed.
RESPONSE_TEMPLATE="""{ANSWER}"""

def create_instruction(context, question, answer, return_response=True):
  """Render one sample with the tokenizer's chat template.

  The user turn is INSTRUCTION_PROMPT_TEMPLATE filled with `context` and
  `question`. Relies on the module-level `tokenizer`.

  Args:
    context: Text substituted for {CONTEXT} in the instruction template.
    question: Text substituted for {QUESTION} in the instruction template.
    answer: Assistant reply; only used when `return_response` is true.
    return_response: When true (default) the assistant turn containing
      `answer` is appended, which is the form used for training samples.
      When false only the user turn is rendered and the template's
      generation prompt is requested, so the string can be used as an
      inference prompt.

  Returns:
    The formatted prompt as a string (not tokenized).
  """
  instruction_prompt = INSTRUCTION_PROMPT_TEMPLATE.format(CONTEXT=context, QUESTION=question)
  chat = [{"role": "user", "content": instruction_prompt}]
  if return_response:
    chat.append({"role": "assistant", "content": answer})
    return tokenizer.apply_chat_template(chat, tokenize=False)
  # Previously this flag was ignored and the answer was always included.
  return tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)



In [28]:
# model_id = "mistralai/Mistral-7B-Instruct-v0.2"
chat = [
  {"role": "user", "content": "Hello, how are you?"},
  {"role": "assistant", "content": "I'm doing great. How can I help you today?"},
  {"role": "user", "content": "I'd like to show off how chat templating works!"},
]

tokenizer.apply_chat_template(chat, tokenize=False)

"<s>[INST] Hello, how are you? [/INST]I'm doing great. How can I help you today?</s>[INST] I'd like to show off how chat templating works! [/INST]"

In [18]:
# model_id = "upstage/SOLAR-10.7B-Instruct-v1.0"

chat = [
  {"role": "user", "content": "Hello, how are you?"},
  {"role": "assistant", "content": "I'm doing great. How can I help you today?"},
  {"role": "user", "content": "I'd like to show off how chat templating works!"},
]

tokenizer.apply_chat_template(chat, tokenize=False)

"### User:\nHello, how are you?\n\n### Assistant:\nI'm doing great. How can I help you today?### User:\nI'd like to show off how chat templating works!\n\n"

In [15]:
# model_id = "OrionStarAI/Orion-14B-Chat-Int4"

chat = [
  {"role": "user", "content": "Hello, how are you?"},
  {"role": "assistant", "content": "I'm doing great. How can I help you today?"},
  {"role": "user", "content": "I'd like to show off how chat templating works!"},
]

tokenizer.apply_chat_template(chat, tokenize=False)

"<|im_start|>user\nHello, how are you?<|im_end|>\n<|im_start|>assistant\nI'm doing great. How can I help you today?<|im_end|>\n<|im_start|>user\nI'd like to show off how chat templating works!<|im_end|>\n"

### Load Model and Prepare for Fine-tuning

In [None]:
# bitsandbytes quantisation settings: 4-bit NF4 weights with double
# quantisation enabled and bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)


# Load the causal LM for `model_id` with the quantisation settings above.
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map={"":0},  # map the whole model (the "" key) to device 0
    use_cache=False,  # cache disabled on load; this model is fine-tuned below
    trust_remote_code=True  # permits modelling code bundled with the checkpoint
)


Using `is_flash_attn_available` is deprecated and will be removed in v4.38. Please use `is_flash_attn_2_available` instead.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

  return self.fget.__get__(instance, owner)()


In [None]:
base_model

OrionForCausalLM(
  (model): OrionModel(
    (embed_tokens): Embedding(84608, 5120, padding_idx=0)
    (layers): ModuleList(
      (0-39): 40 x OrionDecoderLayer(
        (self_attn): OrionAttention(
          (q_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (k_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (v_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (o_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (rotary_emb): OrionRotaryEmbedding()
        )
        (mlp): OrionMLP(
          (gate_proj): Linear4bit(in_features=5120, out_features=15360, bias=False)
          (up_proj): Linear4bit(in_features=5120, out_features=15360, bias=False)
          (down_proj): Linear4bit(in_features=15360, out_features=5120, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LayerNorm((5120,), eps=1e-05, elementwise_affine=True)
        (post_attention_layer

In [None]:
def print_trainable_parameters(model):
    """Print a one-line summary of trainable versus total parameter counts.

    Walks `model.named_parameters()` once, counting elements with `numel()`;
    a parameter counts as trainable when its `requires_grad` flag is set.
    The printed line also gives the trainable share as a percentage.
    """
    sizes = [(weight.numel(), weight.requires_grad) for _, weight in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, needs_grad in sizes if needs_grad)
    percent = 100 * trainable_params / all_param
    print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {percent}")


# LoRA adapter configuration used for the QLoRA fine-tuning below.
config = LoraConfig(
    # Higher-capacity alternative, kept for reference:
    # r=64, # FIXME
    # lora_alpha=128, # in practice lora_alpha is commonly set to twice r
    r=8, # FIXME use 64 when more compute is available
    lora_alpha=16, # in practice lora_alpha is commonly set to twice r
    # The original LoRA paper only adds adapters to the query and value projections (attention weights); QLoRA targets all attention projections plus the MLP layers as well.
    target_modules=[
    "q_proj",
    "up_proj",
    "o_proj",
    "k_proj",
    "down_proj",
    "gate_proj",
    "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)


In [None]:
# Attach the generation settings published with the checkpoint.
base_model.generation_config = GenerationConfig.from_pretrained(model_id)
# Run PEFT's preparation step for training a k-bit quantised model.
base_model = prepare_model_for_kbit_training(base_model)
# Wrap the prepared model with the LoRA adapters described by `config`.
model = get_peft_model(base_model, config)
# Report how many parameters are left trainable after wrapping.
print_trainable_parameters(model)

You are using an old version of the checkpointing format that is deprecated (We will also silently ignore `gradient_checkpointing_kwargs` in case you passed it).Please update to the new format on your modeling file. To use the new format, you need to completely remove the definition of the method `_set_gradient_checkpointing` in your model.


trainable params: 32768000 || all params: 7715727360 || trainable%: 0.42469100411552124


In [None]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): OrionForCausalLM(
      (model): OrionModel(
        (embed_tokens): Embedding(84608, 5120, padding_idx=0)
        (layers): ModuleList(
          (0-39): 40 x OrionDecoderLayer(
            (self_attn): OrionAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=5120, out_features=5120, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=5120, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=5120, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear4bit(
                (base_layer)

In [None]:
# Build the training set: every (question, answer) pairing of each row is
# joined as "<question><eos><answer>" and encoded to a fixed-length id tensor.
data = pd.read_csv('train.csv')
# max_length = 2048 # FIXME For Higher resource
max_length = 128
# NOTE(review): the tensor displayed in a later cell contains rows with no
# trailing padding, which suggests some pairs are truncated at 128 tokens —
# confirm this is acceptable.
# NOTE(review): pad_token is set to eos_token earlier, so a collator that masks
# padding in the labels would presumably also mask the EOS separator — confirm.

formatted_data = []
# `iterrows()` is a generator with no length, so pass `total` explicitly;
# otherwise tqdm can only show a running count with no percentage or ETA.
for _, row in tqdm(data.iterrows(), total=len(data)):
  for q_col in ['질문_1', '질문_2']:
    for a_col in ['답변_1', '답변_2', '답변_3', '답변_4', '답변_5']:
      input_text = row[q_col] + tokenizer.eos_token + row[a_col]
      # Each encoded sample is padded/truncated to `max_length` and returned as a
      # PyTorch tensor; the samples are concatenated along dim 0 in the next cell.
      input_ids = tokenizer.encode(input_text, return_tensors='pt', padding='max_length', truncation=True, max_length=max_length)
      formatted_data.append(input_ids)
print('Done.')


644it [00:04, 156.63it/s]

Done.





In [None]:
formatted_data = torch.cat(formatted_data, dim=0)
formatted_data

tensor([[63806, 62044, 60987,  ...,     2,     2,     2],
        [63806, 62044, 60987,  ...,     2,     2,     2],
        [63806, 62044, 60987,  ...,     2,     2,     2],
        ...,
        [ 5676, 50369, 46986,  ..., 65305, 80820, 79634],
        [ 5676, 50369, 46986,  ..., 78887, 62778, 78231],
        [ 5676, 50369, 46986,  ..., 77394, 50376, 14338]])

## Model Fine-tuning

In [None]:
# Padding setup used by the collator: pad on the right, with EOS as pad token.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'

# Hyper-parameters for the low-resource run; the FIXME lines mark settings to
# revisit when more compute is available.
# NOTE(review): `warmup_ratio` is set together with lr_scheduler_type="constant";
# presumably the warmup has no effect with that schedule — confirm whether
# "constant_with_warmup" was intended.
# NOTE(review): training uses fp16 while the quantisation config computes in
# bfloat16 — confirm the mix is deliberate.
training_args = transformers.TrainingArguments(
    output_dir="orion-peft",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    learning_rate=2e-4,
    lr_scheduler_type="constant",
    warmup_ratio=0.03,
    max_grad_norm=0.3,
    fp16=True,
    # bf16=True, # FIXME
    # tf32=True, # FIXME
    optim="paged_adamw_8bit",
    # optim="paged_adamw_32bit", # FIXME FOR HIGHER RESOURCE
    logging_steps=10,
    save_strategy="steps", # Change to `epoch` if we have enough resource
    save_steps=50,
)

# mlm=False selects causal (next-token) language modelling rather than masked LM.
causal_lm_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=formatted_data,
    data_collator=causal_lm_collator,
)

trainer.train()

You are using an old version of the checkpointing format that is deprecated (We will also silently ignore `gradient_checkpointing_kwargs` in case you passed it).Please update to the new format on your modeling file. To use the new format, you need to completely remove the definition of the method `_set_gradient_checkpointing` in your model.
[34m[1mwandb[0m: Currently logged in as: [33mbibekyess[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss
10,2.1096
20,1.9252
30,1.8381
40,1.8132
50,1.8175
60,1.7151
70,1.5633
80,1.6551
90,1.745
100,1.6854




KeyboardInterrupt: 

In [None]:
# model.save_pretrained("demo_pretrained")


### Inference

In [None]:
from peft import PeftModel

# Load the adapter weights saved at training step 100 from the Trainer's
# output directory.
# NOTE(review): `model` is already the PEFT-wrapped model returned by
# `get_peft_model` in an earlier cell, so this wraps it a second time —
# confirm whether loading onto a freshly loaded base model was intended.
model = PeftModel.from_pretrained(model, "./orion-peft/checkpoint-100")


In [None]:
messages = [{"role": "user", "content": "면진장치가 뭐야?"}]
response = model.chat(tokenizer, messages, streaming=False)
response

'면진장치는 지진이나 진동에 대처하기 위해 건물 벽면에 설치되는 장치입니다. 이 장치는 건물이 받는 충격을 최소화하고, 안전하게 유지할 수 있도록 도와줍니다. 면진장치는 건물의 벽면에 설치되며, 지진이나 진동이 발생할 때 건물이 받는 충격을 최소화하는 역할을 합니다. 이를 통해 건물의 안정성을 높이고, 안전한 환경을 유지할 수 있습니다.'

In [None]:
test = pd.read_csv('test.csv')

In [None]:
# Generate one answer per test question with the fine-tuned model.
# Each question is sent as a single-turn chat with streaming disabled.
responses = []

for question in tqdm(test['질문']):
    messages = [{"role": "user", "content": question}]
    response = model.chat(tokenizer, messages, streaming=False)
    responses.append(response)

# Preview only the first few answers instead of dumping the whole list.
responses[:5]

100%|██████████| 130/130 [59:49<00:00, 27.61s/it]


['방청 페인트는 부패방지용 페인트로, 목재의 부패를 방지하는 목적인 제품입니다. 대표적으로 오일 스테인, 래커, 외부용 빠데이크, 외부용 빠데임프라이머, 외부 빠데임 프라이머 써킷 등이 있습니다. 이 중에서도 외부 빠데임 프라이머가 가장 효과가 좋다고 합니다. 원목사이딩은 외관상 좋은 면이 있지만, 시간이 지나면 썩거나 변형될 가능성이 높습니다. 따라서, 원목사이딩을 사용할 경우, 정기적인 유지보수와 관리가 필요합니다.',
 '도배지에 녹은 자국이 생기는 주된 원인은 누수나 결로입니다. 이때, 벽면의 물기가 도배지로 스며들면서 녹아들어간 것입니다. 이를 해결하기 위해서는 먼저 누수나 결로가 발생하는 부분을 찾아 수리해야 합니다. 또한, 도배지를 바르기 전에 벽면의 물기를 완전히 제거하고, 적절한 방습제를 사용하는 것도 좋은 방법입니다. 이러한 조치를 취하면 도배지에 녹은 자국이 생기는 것을 예방할 수 있습니다.',
 '큐블럭은 미세한 먼지가 끼는 단점이 있습니다. 또한, 큐블럭의 가격이 비싸기 때문에 일반적으로 사용하기에는 부담스러울 수 있습니다. 반면, 압출법 단열판은 가격이 저렴하고 시공이 간편하며, 방음 및 단열 효과가 뛰어납니다. 따라서, 건축물의 외벽 등에 적용할 경우 큐블럭 대신 압출법 단열판을 사용하는 것이 더 적합합니다.',
 '고층 건물의 경우, 철골구조를 사용하게 되면 열의 전도율이 높아져서 단열 효과가 떨어질 수 있습니다. 이를 해결하기 위해 여러 가지 시공 방법이 있습니다. \n\n1. 외부에 단열재를 부착하는 방법: 이 방법은 외부 벽면에 단열재를 부착하는 것으로, 일반적으로 스티로폼 등의 단열재를 사용합니다. 이 방법은 비교적 간단하고 비용도 저렴하지만, 외부 벽면에 단열재가 부착되는 만큼 건물의 외관이 달라질 수 있습니다.\n\n2. 내부에 단열재를 설치하는 방법: 이 방법은 철골 구조물 내부에 단열재를 설치하는 것으로, 일반적으로 스티로폼 판넬 등을 사용합니다. 이 방법은 외부에 단열재를 부착하는 것보다 비용이 더 비싸지만, 건물

In [None]:
import json

# Persist the generated answers so the slow inference loop does not have to be
# re-run. Serialisation happens before the file is opened for writing.
json_data = json.dumps(responses)

with open('orion_responses.json', mode='w') as out_file:
    out_file.write(json_data)

In [None]:
import json

# Reload the cached answers from disk as raw JSON text.
with open('orion_responses.json', mode='r') as in_file:
    json_data = in_file.read()

# Decode the JSON text back into a Python list of answer strings.
answer = json.loads(json_data)

len(answer)

130

In [None]:
answer

['방청 페인트는 부패방지용 페인트로, 목재의 부패를 방지하는 목적인 제품입니다. 대표적으로 오일 스테인, 래커, 외부용 빠데이크, 외부용 빠데임프라이머, 외부 빠데임 프라이머 써킷 등이 있습니다. 이 중에서도 외부 빠데임 프라이머가 가장 효과가 좋다고 합니다. 원목사이딩은 외관상 좋은 면이 있지만, 시간이 지나면 썩거나 변형될 가능성이 높습니다. 따라서, 원목사이딩을 사용할 경우, 정기적인 유지보수와 관리가 필요합니다.',
 '도배지에 녹은 자국이 생기는 주된 원인은 누수나 결로입니다. 이때, 벽면의 물기가 도배지로 스며들면서 녹아들어간 것입니다. 이를 해결하기 위해서는 먼저 누수나 결로가 발생하는 부분을 찾아 수리해야 합니다. 또한, 도배지를 바르기 전에 벽면의 물기를 완전히 제거하고, 적절한 방습제를 사용하는 것도 좋은 방법입니다. 이러한 조치를 취하면 도배지에 녹은 자국이 생기는 것을 예방할 수 있습니다.',
 '큐블럭은 미세한 먼지가 끼는 단점이 있습니다. 또한, 큐블럭의 가격이 비싸기 때문에 일반적으로 사용하기에는 부담스러울 수 있습니다. 반면, 압출법 단열판은 가격이 저렴하고 시공이 간편하며, 방음 및 단열 효과가 뛰어납니다. 따라서, 건축물의 외벽 등에 적용할 경우 큐블럭 대신 압출법 단열판을 사용하는 것이 더 적합합니다.',
 '고층 건물의 경우, 철골구조를 사용하게 되면 열의 전도율이 높아져서 단열 효과가 떨어질 수 있습니다. 이를 해결하기 위해 여러 가지 시공 방법이 있습니다. \n\n1. 외부에 단열재를 부착하는 방법: 이 방법은 외부 벽면에 단열재를 부착하는 것으로, 일반적으로 스티로폼 등의 단열재를 사용합니다. 이 방법은 비교적 간단하고 비용도 저렴하지만, 외부 벽면에 단열재가 부착되는 만큼 건물의 외관이 달라질 수 있습니다.\n\n2. 내부에 단열재를 설치하는 방법: 이 방법은 철골 구조물 내부에 단열재를 설치하는 것으로, 일반적으로 스티로폼 판넬 등을 사용합니다. 이 방법은 외부에 단열재를 부착하는 것보다 비용이 더 비싸지만, 건물

In [None]:
submission = pd.read_csv('sample_submission.csv')

In [None]:
import numpy as np
from sentence_transformers import SentenceTransformer # SentenceTransformer Version 2.2.2

# Load the model (distiluse-base-multilingual-cased-v1) used to extract the embedding vectors
embed_model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/556 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/539M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


tokenizer_config.json:   0%|          | 0.00/452 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

2_Dense/config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

In [None]:
responses=answer

In [None]:
embed_model.encode(responses[0])

array([ 0.04215666,  0.11045343,  0.04767886, -0.02392781,  0.06406538,
       -0.02818174,  0.00837079,  0.00909889,  0.0279477 ,  0.03389404,
       -0.05770229, -0.03476072,  0.02434528, -0.10141925, -0.02403446,
       -0.0209629 , -0.03427393, -0.04615813, -0.05213957,  0.07176317,
       -0.01515151,  0.00805302, -0.02522971,  0.01409446,  0.01426401,
       -0.01419866, -0.010156  ,  0.01214009, -0.07226357,  0.00937402,
        0.0254471 , -0.01642957, -0.0054548 , -0.03169987,  0.00507798,
       -0.03502542,  0.04525568,  0.04163579,  0.03563558,  0.0545174 ,
       -0.02328358,  0.01358621, -0.0219509 , -0.00858987,  0.03821108,
       -0.03109843, -0.07930955, -0.00744583, -0.00142335, -0.01045187,
        0.03625108, -0.0213419 ,  0.05910727, -0.00422967,  0.04962244,
       -0.03068347, -0.03464219,  0.00957576, -0.04380086,  0.01181264,
       -0.02090469,  0.01569634,  0.03875672, -0.01699902, -0.0445503 ,
       -0.00558022, -0.04296635,  0.03800188, -0.00407329,  0.01

In [None]:
# Split the sample submission into its id column and the vector columns.
# The guard makes the cell safe to re-run: previously a second execution
# failed because the 'id' column had already been dropped in place.
# NOTE: the name `id` shadows the Python builtin; it is kept because a later
# cell reads it when rebuilding the final submission frame.
if 'id' in submission.columns:
    id = submission['id']
    # Rebind instead of mutating in place so the frame read from CSV is not altered.
    submission = submission.drop(columns=['id'])

In [None]:
embed_model.encode(responses[0]).tolist()

[0.042156659066677094,
 0.11045343428850174,
 0.047678858041763306,
 -0.023927807807922363,
 0.06406538188457489,
 -0.028181742876768112,
 0.008370790630578995,
 0.009098888374865055,
 0.027947697788476944,
 0.033894043415784836,
 -0.057702288031578064,
 -0.034760721027851105,
 0.024345284327864647,
 -0.10141924768686295,
 -0.02403445914387703,
 -0.020962903276085854,
 -0.03427393361926079,
 -0.04615813493728638,
 -0.05213957279920578,
 0.07176316529512405,
 -0.015151510946452618,
 0.008053022436797619,
 -0.025229712948203087,
 0.0140944579616189,
 0.014264007098972797,
 -0.014198661781847477,
 -0.01015599723905325,
 0.012140091508626938,
 -0.07226356863975525,
 0.009374015033245087,
 0.025447100400924683,
 -0.016429569572210312,
 -0.0054548014886677265,
 -0.03169986978173256,
 0.005077981855720282,
 -0.03502541780471802,
 0.04525567963719368,
 0.04163578525185585,
 0.0356355756521225,
 0.05451739579439163,
 -0.023283576592803,
 0.01358620822429657,
 -0.021950898692011833,
 -0.00858986

In [None]:
# Fill each submission row with the sentence embedding of the matching answer.
for i in range(len(submission)):
    try:
        submission.loc[i] = embed_model.encode(responses[i]).tolist()
    except Exception as exc:
        # Best-effort: keep going, but say which row failed and why. A bare
        # `except:` also caught KeyboardInterrupt, so the loop could not be
        # interrupted, and it hid the cause of the failure.
        print(f"row {i}: could not store embedding ({exc!r})")

In [None]:
total_submission = pd.DataFrame({'id': id})
total_submission

Unnamed: 0,id
0,TEST_000
1,TEST_001
2,TEST_002
3,TEST_003
4,TEST_004
...,...
125,TEST_125
126,TEST_126
127,TEST_127
128,TEST_128


In [None]:
submission

Unnamed: 0,vec_0,vec_1,vec_2,vec_3,vec_4,vec_5,vec_6,vec_7,vec_8,vec_9,...,vec_502,vec_503,vec_504,vec_505,vec_506,vec_507,vec_508,vec_509,vec_510,vec_511
0,0.042157,0.110453,0.047679,-0.023928,0.064065,-0.028182,0.008371,0.009099,0.027948,0.033894,...,-0.024371,-0.049183,0.046511,-0.049165,-0.007734,0.029142,0.020305,-0.000718,-0.009978,0.040244
1,-0.017632,0.012751,-0.024733,0.025530,0.093969,-0.010829,0.030211,-0.044759,-0.028521,0.013528,...,-0.041069,-0.035862,0.016926,-0.025517,-0.030209,0.040165,-0.004050,-0.011119,0.000948,0.016093
2,-0.007298,-0.046221,-0.038921,-0.017196,0.130244,-0.039105,0.020284,-0.015590,0.064194,0.041591,...,-0.039106,0.020225,0.068059,-0.033572,0.002280,-0.029602,-0.029154,-0.018547,-0.036767,0.101903
3,-0.028322,-0.011159,-0.003201,0.017152,0.070116,-0.094935,-0.039363,-0.014712,-0.018029,0.013406,...,-0.018566,-0.019476,0.053251,-0.040362,0.004668,0.028740,-0.016223,-0.075514,-0.022465,0.079249
4,0.025546,-0.002228,-0.022075,-0.068469,0.109451,-0.029035,0.069532,0.063616,0.014408,0.002609,...,-0.000326,-0.012158,0.008391,0.015916,-0.011787,0.003652,-0.002448,0.060200,-0.021662,0.070612
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,-0.013936,-0.005947,0.045813,-0.011203,0.125854,-0.010155,0.052437,0.052894,0.044627,0.023029,...,-0.001679,0.048596,0.025770,-0.022390,0.046235,0.011444,0.066517,-0.069829,0.011634,0.025972
126,0.011456,-0.036085,-0.044864,-0.006485,0.109058,0.001157,0.008894,0.030218,0.013328,0.023062,...,-0.045524,-0.036997,0.052447,-0.030193,-0.006129,0.020568,-0.003103,-0.016428,0.007753,0.061514
127,-0.010161,0.008079,-0.009164,0.040240,0.081483,-0.030591,0.007630,-0.026661,-0.049713,0.020585,...,-0.021081,-0.096137,0.029437,-0.050750,-0.024995,0.058477,0.050730,0.026604,-0.007136,-0.012916
128,0.029554,0.024604,-0.094784,-0.015168,0.077939,-0.049938,-0.003080,-0.022820,0.043833,0.036563,...,-0.043556,0.036434,-0.020236,-0.006427,-0.024800,0.047569,-0.088198,0.020790,0.026581,-0.061856


In [None]:
sub = pd.concat([total_submission, submission] , axis = 1)

In [None]:
sub.to_csv('orion_submission.csv', index = False) #Rename