In [1]:
import os
from time import time
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer
from huggingface_hub import login
# Enable tqdm's pandas integration.
tqdm.pandas()

# Change the working directory to the directory containing the script, so that
# relative paths resolve against it.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
os.chdir("/group-volume/binfeng/wsdm/stage_distill")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Model identifier handed to AutoTokenizer.from_pretrained in the tokenizer cell.
MODEL_PATH = "microsoft/phi-4"
# Overall length budget: passed as `max_len` to format_text and as `max_length`
# to the tokenizer when padding/truncating.
MAX_LENGTH = 2000
# Prompt budget: passed as `max_prompt_len` to format_text.
# NOTE(review): presumably counted in tokens, like MAX_LENGTH — confirm in format_text.
MAX_PROMPT_LENGTH = 400

## Tokenizer

In [3]:
# Where the configured tokenizer is written so later stages can load it as-is.
TOKENIZER_SAVE_DIR = "/group-volume/binfeng/wsdm/tokenizer/phi4"

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
# Pad on the right, and reuse the end-of-sequence token as the padding token.
tokenizer.padding_side = 'right'
tokenizer.pad_token = tokenizer.eos_token
# Last expression on purpose: the notebook displays the tuple of written files.
tokenizer.save_pretrained(TOKENIZER_SAVE_DIR)

('/group-volume/binfeng/wsdm/tokenizer/phi4/tokenizer_config.json',
 '/group-volume/binfeng/wsdm/tokenizer/phi4/special_tokens_map.json',
 '/group-volume/binfeng/wsdm/tokenizer/phi4/vocab.json',
 '/group-volume/binfeng/wsdm/tokenizer/phi4/merges.txt',
 '/group-volume/binfeng/wsdm/tokenizer/phi4/added_tokens.json',
 '/group-volume/binfeng/wsdm/tokenizer/phi4/tokenizer.json')

## Prepare Data

In [4]:


# NOTE(review): format_text, format_label and StratifiedKFold are not defined or
# imported in any cell visible in this notebook — confirm they are available on
# a fresh kernel before relying on Restart & Run All.
FT_PARQUET = "/group-volume/binfeng/wsdm/stage_distill/datap/ft48k.parquet"

# Load the 48k fine-tuning set and discard every row containing a missing value.
ft = pd.read_parquet(FT_PARQUET).dropna()

# Build the model input text and the training label, one row at a time.
ft["text"] = ft.apply(
    lambda row: format_text(
        tokenizer,
        row.prompt,
        row.response_a,
        row.response_b,
        max_len=MAX_LENGTH,
        max_prompt_len=MAX_PROMPT_LENGTH,
    ),
    axis=1,
)
ft["label"] = ft.apply(lambda row: format_label(row.winner), axis=1)

# Only the first of 100 folds is used: it serves as a ~1% validation hold-out,
# stratified on the "language" column.
skf = StratifiedKFold(n_splits=100, shuffle=True, random_state=10)
train_index, val_index = next(skf.split(ft, ft["language"]))
ft_train, ft_val = ft.iloc[train_index], ft.iloc[val_index]
print(len(ft_train), len(ft_val))


Token indices sequence length is longer than the specified maximum sequence length for this model (18891 > 16384). Running this sequence through the model will result in indexing errors


47952 485




In [5]:
# NOTE(review): format_text, format_label and KFold are not defined or imported
# in any cell visible in this notebook — confirm they are available on a fresh
# kernel before relying on Restart & Run All.
PPT_PARQUET = "/group-volume/binfeng/wsdm/stage_distill/datap/ppt135k.parquet"

# Load the 135k set and discard every row containing a missing value.
ppt = pd.read_parquet(PPT_PARQUET).dropna()

# Build the model input text and the training label, one row at a time.
ppt["text"] = ppt.apply(
    lambda row: format_text(
        tokenizer,
        row.prompt,
        row.response_a,
        row.response_b,
        max_len=MAX_LENGTH,
        max_prompt_len=MAX_PROMPT_LENGTH,
    ),
    axis=1,
)
ppt["label"] = ppt.apply(lambda row: format_label(row.winner), axis=1)

# Only the first of 1000 shuffled folds is used: a ~0.1% validation hold-out
# (no stratification for this set).
kf = KFold(n_splits=1000, shuffle=True, random_state=10)
train_index, val_index = next(kf.split(ppt))
ppt_train, ppt_val = ppt.iloc[train_index], ppt.iloc[val_index]
print(len(ppt_train), len(ppt_val))


135588 136


## Build and Tokenize Datasets

In [9]:
def tokenizer_func(example):
    """Tokenize ``example["text"]`` into fixed-length NumPy encodings.

    Every sequence is padded to, and truncated at, ``MAX_LENGTH`` using the
    notebook-level ``tokenizer``.

    Args:
        example: Mapping with a ``"text"`` entry; this is what
            ``DatasetDict.map(..., batched=True)`` passes in.

    Returns:
        Whatever the tokenizer call returns for that text.
    """
    encode_options = dict(
        padding='max_length',
        max_length=MAX_LENGTH,
        truncation=True,
        return_tensors='np',
    )
    return tokenizer(example["text"], **encode_options)


# Columns carried into the Hugging Face datasets: the formatted text, the hard
# label, and the two logit columns (presumably teacher logits used for
# distillation — confirm against the training script).
DATASET_COLUMNS = ["text", "label", "logits_qwencd", "logits_qwen32"]


def _to_hf_dataset(frame):
    """Convert the selected columns of ``frame`` into a ``datasets.Dataset``.

    The pandas index is dropped at conversion time, so no auto-generated
    ``__index_level_0__`` column is created and nothing downstream has to
    depend on that name being present in every split.
    """
    return Dataset.from_pandas(frame[DATASET_COLUMNS], preserve_index=False)


ppt_train_dataset = _to_hf_dataset(ppt_train)
ppt_val_dataset = _to_hf_dataset(ppt_val)
ft_train_dataset = _to_hf_dataset(ft_train)
ft_val_dataset = _to_hf_dataset(ft_val)
raw_dataset = DatasetDict({
    'ppt135k_train': ppt_train_dataset,
    'ppt135k_val': ppt_val_dataset,
    'ft48k_train': ft_train_dataset,
    'ft48k_val': ft_val_dataset,
})

# Tokenize every split in batches, rename "label" to "labels", and drop the raw
# text now that it has been encoded.
tokenized_dataset = raw_dataset.map(tokenizer_func, batched=True)
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
tokenized_dataset = tokenized_dataset.remove_columns(['text'])
# Last expression on purpose: display the resulting splits and their features.
tokenized_dataset


DatasetDict({
    ppt135k_train: Dataset({
        features: ['labels', 'logits_qwencd', 'logits_qwen32', 'input_ids', 'attention_mask'],
        num_rows: 135588
    })
    ppt135k_val: Dataset({
        features: ['labels', 'logits_qwencd', 'logits_qwen32', 'input_ids', 'attention_mask'],
        num_rows: 136
    })
    ft48k_train: Dataset({
        features: ['labels', 'logits_qwencd', 'logits_qwen32', 'input_ids', 'attention_mask'],
        num_rows: 47952
    })
    ft48k_val: Dataset({
        features: ['labels', 'logits_qwencd', 'logits_qwen32', 'input_ids', 'attention_mask'],
        num_rows: 485
    })
})

In [10]:
# Sanity check: the logits stored for the first training row. Index the row
# first so only that row is read, rather than the whole logits column.
tokenized_dataset["ft48k_train"][0]["logits_qwen32"]

[1.9086914, -1.5205078]

In [11]:
# Spot-check one validation example: decode its token ids back to text
# (special tokens kept visible) and show the label stored with it.
i = 6
sample = tokenized_dataset["ft48k_val"][i]
print(tokenizer.decode(sample["input_ids"], skip_special_tokens=False))
print("**label:", sample["labels"])

<|User Prompt|>
Je fait un projet en Symfony et je développe actuellement une commande qui prend en paramètre un path vers un CSV puis qui lis ce csv puis deserialize en objet que j'incère ensuite en base de donnée.
Ajoute le fait que si les 4 champs sont nuls alors nne pas ajouter en base, utilise la commande déjà existante avec les éléments suivants :

```php
<?php

namespace App\Entity;

use App\Repository\FolderRepository;
use Doctrine\DBAL\Types\Types;
use Doctrine\ORM\Mapping as ORM;

#[ORM\Entity(repositoryClass: FolderRepository::class)]
class Folder
{
    #[ORM\Id]
    #[ORM\GeneratedValue]
    #[ORM\Column]
    private ?int $id = null;

    #[ORM\Column(length: 255, nullable: true)]
    private ?string $urbaPermissionNumber = null;

    #[ORM\Column(length: 255, nullable: true)]
    private ?string $petitionerName = null;

    #[ORM\Column(length: 255, nullable: true)]
    private ?string $addressOfWorks = null;

    #[ORM\Column(type: Types::TEXT, nullable: true)]
    privat

In [13]:
# Persist all four tokenized splits to disk in one call.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
tokenized_dataset.save_to_disk("/group-volume/binfeng/wsdm/data/tokenized_phi4_distill")

Saving the dataset (3/3 shards): 100%|██████████| 135588/135588 [00:01<00:00, 121773.25 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 136/136 [00:00<00:00, 19709.94 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 47952/47952 [00:00<00:00, 160534.19 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 485/485 [00:00<00:00, 63857.28 examples/s]


In [14]:
# Display the features and row count of the ppt135k validation split.
tokenized_dataset["ppt135k_val"]

Dataset({
    features: ['labels', 'logits_qwencd', 'logits_qwen32', 'input_ids', 'attention_mask'],
    num_rows: 136
})