In [1]:
import os
import gc
import re
from time import time
import random
import warnings
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from accelerate import PartialState
from datasets import Dataset, DatasetDict

import torch
import transformers
from transformers import AutoTokenizer, LlamaModel, AutoModelForSequenceClassification
import torch.nn.functional as F
from huggingface_hub import login
tqdm.pandas()

# Run the rest of the notebook from the stage_ft project directory.
# NOTE(review): presumably this is what lets the later `from utils import *`
# resolve a local utils module — confirm.
os.chdir("/group-volume/binfeng/wsdm/stage_ft")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Checkpoint directory; the tokenizer below is loaded from this path.
MODEL_PATH = "/group-volume/binfeng/wsdm/ckpt/qwen32b_ppt/checkpoint-1973"
# Passed to format_text as max_len and used as the tokenizer's
# padding/truncation length (max_length) in tokenizer_func.
MAX_LENGTH = 2000
# Passed to format_text as max_prompt_len.
# NOTE(review): presumably a cap on the prompt portion — confirm in utils.
MAX_PROMPT_LENGTH = 400

## Tokenizer

In [3]:
# Load the tokenizer from MODEL_PATH; it is handed to format_text during data
# preparation and called directly in tokenizer_func.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

## Prepare Data

In [4]:
from utils import *
from sklearn.model_selection import StratifiedKFold

# Load the raw comparison data and discard every row that has a missing value.
data = pd.read_csv("/user-volume/bx/kaggle48k.csv").dropna()

# REVERSED variant: both the text and the label are built with reverse=True.
# NOTE(review): presumably this swaps the two responses and flips the label
# accordingly — confirm against format_text / format_label in utils.
data["text"] = data.apply(
    lambda row: format_text(
        tokenizer,
        row.prompt,
        row.response_a,
        row.response_b,
        max_len=MAX_LENGTH,
        max_prompt_len=MAX_PROMPT_LENGTH,
        reverse=True,
    ),
    axis=1,
)
data["label"] = data["winner"].apply(lambda winner: format_label(winner, reverse=True))
print(data["label"].nunique())

# Only the first of the 100 language-stratified folds is used, which yields a
# roughly 99% / 1% train/validation split.
skf = StratifiedKFold(n_splits=100, shuffle=True, random_state=1)
train_index, val_index = next(skf.split(data, data["language"]))
data_train, data_val = data.iloc[train_index], data.iloc[val_index]
print(len(data_train), len(data_val))

2
47952 485




## Dataloader

In [5]:
def tokenizer_func(example):
    """Tokenize the "text" field of ``example`` into fixed-length NumPy arrays.

    The module-level ``tokenizer`` is applied with padding and truncation to
    exactly ``MAX_LENGTH`` tokens. When used with ``map(..., batched=True)``,
    ``example["text"]`` holds a batch of strings.

    Returns:
        dict with the tokenizer's ``input_ids`` and ``attention_mask`` outputs.
    """
    encoded = tokenizer(
        example["text"],
        max_length=MAX_LENGTH,
        padding='max_length',
        truncation=True,
        return_tensors='np',
    )

    # Forward only the two fields that are kept as dataset columns.
    return {field: encoded[field] for field in ('input_ids', 'attention_mask')}


# Wrap the text/label columns of each pandas split in a Hugging Face Dataset.
train_dataset, val_dataset = (
    Dataset.from_pandas(split[["text", "label"]])
    for split in (data_train, data_val)
)
raw_dataset = DatasetDict({
    'kaggle48k_train': train_dataset,
    'kaggle48k_val': val_dataset,
})

# Tokenize in batches, rename the target column to "labels", and drop the
# pandas index column that from_pandas carried over.
tokenized_dataset = (
    raw_dataset
    .map(tokenizer_func, batched=True)
    .rename_column("label", "labels")
    .remove_columns(['__index_level_0__'])
)
tokenized_dataset


Map: 100%|██████████| 47952/47952 [00:33<00:00, 1428.17 examples/s]
Map: 100%|██████████| 485/485 [00:00<00:00, 1406.24 examples/s]


DatasetDict({
    kaggle48k_train: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 47952
    })
    kaggle48k_val: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 485
    })
})

In [7]:
# Sanity check: decode one validation row back to text and show its label.
i = 1
val_example = tokenized_dataset["kaggle48k_val"][i]
print(tokenizer.decode(val_example["input_ids"], skip_special_tokens=False))
print("**label:", val_example["labels"])

## User Prompt
When facebook will drop their new smart glasses

## Response A
As of my last update in October 2023, there hasn't been an official announcement from Meta (formerly known as Facebook) regarding the exact release date for their next generation of smart glasses. However, here's what we know based on previous releases and industry trends:

1. **Previous Releases**: 
   - Meta, in collaboration with Ray-Ban, released their first smart glasses, Ray-Ban Stories, in September 2021. These glasses featured built-in cameras, speakers, and microphones, allowing users to take photos, record videos, and make calls.

2. **Development Cycle**: 
   - Tech companies typically follow a cycle for product releases. If Meta follows a similar pattern to their first release, we might expect an announcement or release around late 2023 to early 2024. However, this is speculative.

3. **Rumors and Leaks**: 
   - There have been rumors and patents filed by Meta that suggest they are working on more

In [8]:
# Persist the whole tokenized DatasetDict (train and val splits) to disk.
tokenized_dataset.save_to_disk("/group-volume/binfeng/wsdm/data/tokenized_qwen32b_ft")

Saving the dataset (2/2 shards): 100%|██████████| 47952/47952 [00:00<00:00, 77548.46 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 485/485 [00:00<00:00, 52745.55 examples/s]


In [10]:
# Show the column schema of the train split (last expression, so it is displayed).
tokenized_dataset["kaggle48k_train"].features

{'text': Value(dtype='string', id=None),
 'labels': Value(dtype='int64', id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}