In [2]:
import os
import gc
import re
from time import time
import random
import warnings
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from accelerate import PartialState
from datasets import Dataset, DatasetDict

import torch
import transformers
from transformers import AutoTokenizer, LlamaModel, AutoModelForSequenceClassification
import torch.nn.functional as F
from huggingface_hub import login
# Register tqdm's pandas integration (enables `progress_apply` and related methods).
tqdm.pandas()

# Change the working directory to the directory containing the script.
# NOTE(review): presumably this is what lets the later `from utils import *` resolve — confirm.
os.chdir("/group-volume/binfeng/wsdm/stage_ft")


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Checkpoint directory the tokenizer is loaded from.
MODEL_PATH = "/group-volume/binfeng/wsdm/ckpt/sky27b_pptsmall/checkpoint-815"
# Used as the tokenizer's `max_length` (padding/truncation target) and passed to
# `format_text` as `max_len`.
MAX_LENGTH = 2000
# Passed to `format_text` as `max_prompt_len`.
MAX_PROMPT_LENGTH = 400

## Tokenizer

In [6]:
# Load the tokenizer stored at MODEL_PATH.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
# Reuse the EOS token as the padding token and pad on the right-hand side.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
# tokenizer.save_pretrained("/group-volume/binfeng/wsdm/tokenizer/sky27b")

## Prepare Data

In [8]:
from utils import *
from sklearn.model_selection import StratifiedKFold

# Load the raw comparison data and discard every row that has a missing value.
data = pd.read_csv("/user-volume/bx/kaggle48k.csv")
data.dropna(inplace=True)


def _build_text(row):
    """Format one row's prompt and both responses via `format_text` with reverse=True."""
    return format_text(
        tokenizer,
        row.prompt,
        row.response_a,
        row.response_b,
        max_len=MAX_LENGTH,
        max_prompt_len=MAX_PROMPT_LENGTH,
        reverse=True,
    )


def _build_label(row):
    """Convert one row's `winner` into a label via `format_label` with reverse=True."""
    return format_label(row.winner, reverse=True)


## REVERSED!
# Both the text and the label are built with reverse=True so that they stay consistent.
data["text"] = data.apply(_build_text, axis=1)
data["label"] = data.apply(_build_label, axis=1)
print(data["label"].nunique())

# Split into 100 folds stratified on `language` and keep only the first fold:
# one fold becomes the validation set, the remaining 99 the training set.
skf = StratifiedKFold(n_splits=100, shuffle=True, random_state=55)
train_index, val_index = next(skf.split(data, data["language"]))
data_train, data_val = data.iloc[train_index], data.iloc[val_index]
print(len(data_train), len(data_val))

Token indices sequence length is longer than the specified maximum sequence length for this model (9770 > 4096). Running this sequence through the model will result in indexing errors


2
47952 485




## Dataloader

In [1]:
# def tokenizer_func(example):
#     # Tokenize the input
#     tokenized = tokenizer(
#         example["text"], 
#         padding='max_length', 
#         max_length=MAX_LENGTH,
#         truncation=True,
#         return_tensors='np'
#     )
    
#     input_ids = tokenized['input_ids']
#     attention_mask = tokenized['attention_mask']
    
#     return {
#         'input_ids': input_ids,
#         'attention_mask': attention_mask
#     }
    
def tokenizer_func(examples):
    """Tokenize the ``"text"`` entry of ``examples`` to a fixed length.

    Every sequence is padded to, and truncated at, ``MAX_LENGTH`` using the
    module-level ``tokenizer``.

    Args:
        examples: Mapping with a ``"text"`` entry to tokenize.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that call.
    """
    tokenize_kwargs = {
        "padding": "max_length",
        "max_length": MAX_LENGTH,
        "truncation": True,
    }
    return tokenizer(examples["text"], **tokenize_kwargs)

In [11]:
# Wrap the train/validation frames as Hugging Face datasets, keeping only text and label.
model_columns = ["text", "label"]
train_dataset, val_dataset = (
    Dataset.from_pandas(frame[model_columns]) for frame in (data_train, data_val)
)
raw_dataset = DatasetDict({
    'kaggle48k_train': train_dataset,
    'kaggle48k_val': val_dataset
})

# Tokenize in batches, rename the target column to "labels", and drop the pandas
# index column that `from_pandas` carried over.
tokenized_dataset = (
    raw_dataset
    .map(tokenizer_func, batched=True)
    .rename_column("label", "labels")
    .remove_columns(['__index_level_0__'])
)
tokenized_dataset


Map: 100%|██████████| 47952/47952 [00:34<00:00, 1389.95 examples/s]
Map: 100%|██████████| 485/485 [00:00<00:00, 1324.62 examples/s]


DatasetDict({
    kaggle48k_train: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 47952
    })
    kaggle48k_val: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 485
    })
})

In [13]:
# Sanity check: decode one tokenized validation example (special tokens kept) and show its label.
i = 1
sample = tokenized_dataset["kaggle48k_val"][i]
decoded_text = tokenizer.decode(sample["input_ids"], skip_special_tokens=False)
print(decoded_text)
print("**label:", sample["labels"])

<bos>## User Prompt
Where does elastic beanstalk store the environment variables for an environment? Give me the file path for Amazon Linux 2023

## Response A
In Amazon Linux 2023, AWS Elastic Beanstalk stores the environment variables for an environment in the following file:

```
/opt/elasticbeanstalk/deployment/env
```

This file contains all the environment variables that are set for your Elastic Beanstalk environment. You can access this file to inspect the environment variables that have been configured for your application.

## Response B
In Amazon Elastic Beanstalk (EB) on Amazon Linux 2 (which includes Amazon Linux 2023), environment variables can be stored and managed in several ways, but the primary locations where they are stored are:

1. **`.ebextensions` Configuration Files**: These are YAML or JSON files that you place in your application's root directory. Environment variables defined here are persisted across deployments and are applied at the environment level. The s

In [14]:
# Persist both tokenized splits to disk so they can be reloaded without re-tokenizing.
tokenized_dataset.save_to_disk("/group-volume/binfeng/wsdm/data/tokenized_sky27b_ft")

Saving the dataset (2/2 shards): 100%|██████████| 47952/47952 [00:00<00:00, 72728.23 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 485/485 [00:00<00:00, 50722.79 examples/s]


In [6]:
import torch

# Scratch example: random logits of shape (2, 3, 5), normalised into
# log-probabilities along the last axis.
logits = torch.randn(2, 3, 5)
log_probs = logits.log_softmax(dim=-1)
log_probs

tensor([[[-3.2498, -1.4825, -0.6032, -2.2185, -2.5468],
         [-2.1985, -2.3299, -1.5729, -1.2161, -1.2451],
         [-2.2972, -3.1037, -2.6995, -1.1984, -0.7222]],

        [[-2.2661, -2.3434, -2.1984, -1.1783, -0.9636],
         [-1.5325, -1.0846, -2.5868, -1.1091, -3.1976],
         [-2.3467, -1.5876, -1.9824, -2.8244, -0.6875]]])

In [9]:
# Uniform reference distribution over the last axis, with the same shape as log_probs.
num_classes = log_probs.size(-1)
target_dist = torch.ones_like(log_probs) / num_classes  # Shape: (2, 3, 5)
target_log_dist = target_dist.log()
target_log_dist

tensor([[[-1.6094, -1.6094, -1.6094, -1.6094, -1.6094],
         [-1.6094, -1.6094, -1.6094, -1.6094, -1.6094],
         [-1.6094, -1.6094, -1.6094, -1.6094, -1.6094]],

        [[-1.6094, -1.6094, -1.6094, -1.6094, -1.6094],
         [-1.6094, -1.6094, -1.6094, -1.6094, -1.6094],
         [-1.6094, -1.6094, -1.6094, -1.6094, -1.6094]]])

In [11]:
# Element-wise KL terms q * (log q - log p), with q = target_dist and log p = log_probs;
# summing over the last axis would give KL(q || p) per position.
torch.mul(target_dist, target_log_dist - log_probs)

tensor([[[ 0.3281, -0.0254, -0.2012,  0.1218,  0.1875],
         [ 0.1178,  0.1441, -0.0073, -0.0787, -0.0729],
         [ 0.1375,  0.2988,  0.2180, -0.0822, -0.1774]],

        [[ 0.1313,  0.1468,  0.1178, -0.0862, -0.1292],
         [-0.0154, -0.1050,  0.1955, -0.1001,  0.3176],
         [ 0.1474, -0.0044,  0.0746,  0.2430, -0.1844]]])